You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
`import os
import easyocr
import torch
import logging
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# --- Configuration ---
INPUT_FOLDER = r"C:\files"  # Folder scanned for *.pdf files
OUTPUT_FOLDER = r"C:\files\output"  # Folder that receives the .txt results and the log file
NUM_THREADS = 4  # Adjust based on your CPU cores
DPI = 150  # DPI for PDF to image conversion
LOG_FILE = os.path.join(OUTPUT_FOLDER, "ocr_log.txt")  # Log file path inside OUTPUT_FOLDER
# --- Ensure Output Folder Exists ---
# The log file lives inside OUTPUT_FOLDER, so the folder has to exist before
# logging.basicConfig opens the file.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Setup Logging ---
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# --- Initialize EasyOCR ---
# `reader` must be created before it is reported below and before
# process_pdf() uses it.
reader = easyocr.Reader(["en"], gpu=True, quantize=False)

# Debug: Check if EasyOCR is using GPU
print(f"✅ EasyOCR running on: {reader.device}")
logging.info(f"EasyOCR running on: {reader.device}")
def process_pdf(pdf_file):
    """Extract text from a single PDF using EasyOCR and save it as a .txt file.

    Args:
        pdf_file: File name of the PDF inside INPUT_FOLDER.

    Returns:
        A status string: "Processed: ...", "Skipping ..." when the output
        file already exists, or "Error ..." when conversion or OCR failed.
        Failures are logged and reported in the return value, not raised.
    """
    # Local import so this block does not depend on a new file-level import.
    import numpy as np

    text_path = os.path.join(OUTPUT_FOLDER, f"{os.path.splitext(pdf_file)[0]}.txt")
    if os.path.exists(text_path):
        return f"Skipping {pdf_file}, already processed."

    pdf_path = os.path.join(INPUT_FOLDER, pdf_file)
    try:
        images = convert_from_path(pdf_path, dpi=DPI)
        if not images:
            logging.error(f"No images generated for {pdf_file}.")
            return f"Error: No images generated for {pdf_file}."

        page_texts = []
        for idx, img in enumerate(images):
            logging.debug(f"Processing image {idx + 1} of {len(images)}")
            # convert_from_path yields PIL images; readtext expects a file
            # path, bytes, or a NumPy array, so convert each page first.
            result = reader.readtext(np.array(img), detail=0)
            logging.debug(f"OCR result for image {idx + 1}: {result}")
            page_texts.append("\n".join(result) + "\n")
        extracted_text = "".join(page_texts)

        if not extracted_text:
            logging.warning(f"No text extracted for {pdf_file}.")

        # The output file is written even when empty, so the PDF is skipped
        # on the next run.
        with open(text_path, "w", encoding="utf-8") as text_file:
            text_file.write(extracted_text)
        return f"Processed: {pdf_file}"
    except FileNotFoundError:
        logging.error(f"File not found: {pdf_path}")
        return f"Error: File not found: {pdf_file}"
    except Exception as e:
        # Worker boundary: record the traceback and report the failure so one
        # bad PDF does not abort the whole batch.
        logging.exception(f"Error processing {pdf_file}: {e}")
        return f"Error processing {pdf_file}: {e}"
# --- Main Execution ---
if __name__ == "__main__":
    # Collect every PDF (case-insensitive extension) directly inside INPUT_FOLDER.
    pdf_files = [f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith(".pdf")]
    logging.info(f"Found {len(pdf_files)} PDF files to process.")

    # Fan the PDFs out over a thread pool; executor.map yields results in
    # input order and tqdm renders progress as they are consumed.
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        results = list(
            tqdm(
                executor.map(process_pdf, pdf_files),
                total=len(pdf_files),
                desc="Processing PDFs",
            )
        )

    # Print the per-file status strings returned by process_pdf.
    for result in results:
        print(result)

    logging.info("OCR process completed.")
    print("OCR process completed! Check the log file for details.")
`
The text was updated successfully, but these errors were encountered:
Any ideas why it refuses to use the GPU?
`import os
import easyocr
import torch
import logging
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# --- Configuration ---
INPUT_FOLDER = r"C:\files"  # Folder scanned for *.pdf files
OUTPUT_FOLDER = r"C:\files\output"  # Folder that receives the .txt results and the log file
NUM_THREADS = 4  # Adjust based on your CPU cores
DPI = 150  # DPI for PDF to image conversion
LOG_FILE = os.path.join(OUTPUT_FOLDER, "ocr_log.txt")  # Log file path inside OUTPUT_FOLDER
# --- Ensure Output Folder Exists ---
# Must happen before logging is configured: the log file lives inside
# OUTPUT_FOLDER and is opened as soon as basicConfig runs.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- Setup Logging ---
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# --- Force GPU Usage ---
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
if cuda_available:
    # Only select a CUDA device when one exists; calling set_device without
    # CUDA raises instead of falling back to the CPU.
    torch.cuda.set_device(0)  # Force CUDA device 0
print(f"Using device: {device}")
logging.info(f"Using device: {device}")

# --- Test PyTorch GPU ---
# Diagnostic only: a failure is reported and the script continues.
try:
    test_tensor = torch.rand(1).cuda()
    print(f"✅ PyTorch Test Tensor allocated on: {test_tensor.device}")
except Exception as e:
    print(f"❌ PyTorch GPU Error: {e}")
    logging.error(f"PyTorch GPU Error: {e}")

# --- Initialize EasyOCR (Force GPU) ---
reader = easyocr.Reader(["en"], gpu=True, quantize=False)

# Debug: Check if EasyOCR is using GPU
print(f"✅ EasyOCR running on: {reader.device}")
logging.info(f"EasyOCR running on: {reader.device}")
def process_pdf(pdf_file):
    """Extracts text from a single PDF using EasyOCR and saves it."""
    # Output path: OUTPUT_FOLDER/<pdf name without extension>.txt
    text_path = os.path.join(OUTPUT_FOLDER, f"{os.path.splitext(pdf_file)[0]}.txt")
    # NOTE(review): the rest of this function is not visible in this excerpt —
    # restore the remaining body from the original script before running.
# --- Main Execution ---
if __name__ == "__main__":
    # Collect every PDF (case-insensitive extension) directly inside INPUT_FOLDER.
    pdf_files = [f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith(".pdf")]
    logging.info(f"Found {len(pdf_files)} PDF files to process.")
`
The text was updated successfully, but these errors were encountered: