From 9241c19b08a38201916560aab42d1395e7e9b87b Mon Sep 17 00:00:00 2001
From: Manuel
Date: Sat, 20 Sep 2025 22:42:04 +0000
Subject: [PATCH] Fix OCR: preprocess image uploads and output searchable PDFs

---
 main.py          | 131 +++++++++++++++++++++++++++++++++++++++++------
 requirements.txt |   4 ++
 2 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/main.py b/main.py
index f48eb0b..d45db65 100644
--- a/main.py
+++ b/main.py
@@ -10,6 +10,8 @@ import yaml
 import os
 import httpx
 import glob
+import cv2
+import numpy as np
 from contextlib import asynccontextmanager
 from datetime import datetime, timezone
 from pathlib import Path
@@ -1067,7 +1069,8 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr
             force_ocr=ocr_settings.get('force_ocr', True),
             clean=ocr_settings.get('clean', True),
             optimize=ocr_settings.get('optimize', 1),
-            progress_bar=False)
+            progress_bar=False,
+            image_dpi=ocr_settings.get('image_dpi', 300))
         with open(output_path_str, "rb") as f:
             reader = pypdf.PdfReader(f)
             preview = "\n".join(page.extract_text() or "" for page in reader.pages)
@@ -1085,33 +1088,112 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr
         db.close()
     send_webhook_notification(job_id, app_config, base_url)
 
+def image_adjustment_controller(img, brightness=128,
+                                contrast=200):
+
+    brightness = int((brightness - 0) * (255 - (-255)) / (510 - 0) + (-255))
+    contrast = int((contrast - 0) * (127 - (-127)) / (254 - 0) + (-127))
+    if brightness != 0:
+        if brightness > 0:
+            shadow = brightness
+            max = 255
+        else:
+            shadow = 0
+            max = 255 + brightness
+
+        al_pha = (max - shadow) / 255
+        ga_mma = shadow
+        # The function addWeighted calculates
+        # the weighted sum of two arrays
+        cal = cv2.addWeighted(img, al_pha,
+                              img, 0, ga_mma)
+    else:
+        cal = img
+    if contrast != 0:
+        Alpha = float(131 * (contrast + 127)) / (127 * (131 - contrast))
+        Gamma = 127 * (1 - Alpha)
+        # The function addWeighted calculates
+        # the weighted sum of two arrays
+        cal = cv2.addWeighted(cal, Alpha,
+                              cal, 0, Gamma)
+    return cal
+
+
+def preprocess_for_ocr(image_path: str) -> np.ndarray:
+    """Loads an image and applies preprocessing steps for OCR."""
+    # Read the image using OpenCV
+    image = cv2.imread(image_path)
+
+    # 1. Convert to grayscale and boost contrast
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    contrast_img = image_adjustment_controller(gray, brightness=150, contrast=120)
+    # 2. Binarize the image (Otsu's thresholding is great for this)
+    # This turns the image into pure black and white
+    _, binary_image = cv2.threshold(contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+    # 3. Denoise the binarized image (optional but often helpful)
+    denoised_image = cv2.medianBlur(binary_image, 3)
+
+    return denoised_image
+
+
 @huey.task()
 def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str, app_config: dict, base_url: str):
+    """
+    Performs OCR on an image file, first applying preprocessing steps to clean
+    the image, and then saving the output as a searchable PDF.
+    """
     db = SessionLocal()
     input_path = Path(input_path_str)
     try:
         job = get_job(db, job_id)
         if not job or job.status == 'cancelled':
+            logger.warning(f"OCR job {job_id} was cancelled or not found. Aborting task.")
             return
-        update_job_status(db, job_id, "processing", progress=50)
-        logger.info(f"Starting Image OCR for job {job_id}")
-        text = pytesseract.image_to_string(Image.open(str(input_path)))
-        out_path = Path(output_path_str)
-        tmp_out = out_path.with_name(f"{out_path.stem}.tmp-{uuid.uuid4().hex}{out_path.suffix}")
-        with tmp_out.open("w", encoding="utf-8") as f:
-            f.write(text)
-        tmp_out.replace(out_path)
-        mark_job_as_completed(db, job_id, output_filepath_str=output_path_str, preview=text)
-        logger.info(f"Image OCR for job {job_id} completed.")
+
+        update_job_status(db, job_id, "processing")
+        logger.info(f"Starting Image to PDF OCR for job {job_id}")
+
+        # Apply the preprocessing steps to the input image for better accuracy
+        logger.info(f"Preprocessing image for job {job_id}...")
+        preprocessed_image = preprocess_for_ocr(input_path_str)
+
+        # Configure Tesseract for optimal performance.
+        # '--psm 3' enables automatic page segmentation, which is a robust default.
+        # The language defaults to English; add '-l <lang>' here to support others.
+        tesseract_config = '--psm 3'
+        logger.info(f"Running Tesseract with config: '{tesseract_config}' for job {job_id}")
+
+        # Generate a searchable PDF from the preprocessed image data
+        pdf_bytes = pytesseract.image_to_pdf_or_hocr(
+            Image.fromarray(preprocessed_image),  # Convert numpy array back to PIL Image
+            extension='pdf',
+            config=tesseract_config
+        )
+        with open(output_path_str, "wb") as f:
+            f.write(pdf_bytes)
+
+        # Generate a plain text preview from the same preprocessed image
+        preview_text = pytesseract.image_to_string(
+            Image.fromarray(preprocessed_image),
+            config=tesseract_config
+        )
+
+        mark_job_as_completed(db, job_id, output_filepath_str=output_path_str, preview=preview_text)
+        logger.info(f"Image to PDF OCR for job {job_id} completed successfully.")
+
     except Exception as e:
-        logger.exception(f"ERROR during Image OCR for job {job_id}")
+        logger.exception(f"ERROR during Image to PDF OCR for job {job_id}")
         update_job_status(db, job_id, "failed", error=f"Image OCR failed: {e}")
+
     finally:
         try:
+            # Clean up the original uploaded file
             ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR])
             input_path.unlink(missing_ok=True)
         except Exception:
-            logger.exception("Failed to cleanup input file after Image OCR.")
+            logger.exception(f"Failed to cleanup input file for job {job_id}.")
+
         db.close()
     send_webhook_notification(job_id, app_config, base_url)
@@ -1487,11 +1569,28 @@ async def finalize_upload(request: Request, payload: FinalizeUploadPayload, user
             create_job(db=db, job=job_data)
             run_tts_task(job_data.id, str(final_path), str(processed_path), payload.model_name, tts_config, APP_CONFIG, base_url)
         elif payload.task_type == "ocr":
-            stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix
-            processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}{suffix}"
+            stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix.lower()
+            IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp', '.webp'}
+
+            # 1. Validate file type before creating a job
+            if suffix not in IMAGE_EXTENSIONS and suffix != '.pdf':
+                final_path.unlink(missing_ok=True)  # Clean up the uploaded file
+                raise HTTPException(
+                    status_code=415,
+                    detail=f"Unsupported file type for OCR: '{suffix}'. Please upload a PDF or a supported image."
+                )
+
+            # 2. Set output path to always be a PDF
+            processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.pdf"
             job_data.processed_filepath = str(processed_path)
             create_job(db=db, job=job_data)
-            run_pdf_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url)
+
+            # 3. Dispatch to the correct task based on file type
+            if suffix in IMAGE_EXTENSIONS:
+                # Call the existing image task, which is now modified to produce a PDF
+                run_image_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG, base_url)
+            else:  # It must be a .pdf due to the earlier check
+                run_pdf_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url)
         elif payload.task_type == "conversion":
             try:
                 tool, task_key = payload.output_format.split('_', 1)
diff --git a/requirements.txt b/requirements.txt
index 3dfab3e..d46d3c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,8 @@ pytesseract
 pypdf
 piper-tts
 kokoro-tts
+opencv-python-headless
+marker-pdf[full]
 
 # Configuration & Utilities
 werkzeug
@@ -28,6 +30,8 @@ pydantic-settings
 python-multipart
 markitdown[pdf,docx,pptx,xlsx,outlook]==0.1.3
 PyPDF2==3.0.1
+mechanize
+css_parser
 #PyQt6==6.9.1 # uncomment for calibre, it uses the webengine
 #PyQt6-Qt6==6.9.2
 #PyQt6-WebEngine==6.9.0
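
For anyone reviewing the new image-to-PDF path, below is a minimal standalone sketch (not part of the patch) that mirrors the same preprocessing sequence and Tesseract calls: grayscale, Otsu binarization, a light median-blur denoise, then `image_to_pdf_or_hocr` plus a plain-text preview. The `quick_image_ocr` helper and the `sample.png` / `sample_ocr.pdf` paths are illustrative placeholders; it assumes OpenCV, Pillow, pytesseract, and a local Tesseract install.

```python
import cv2
import pytesseract
from PIL import Image


def quick_image_ocr(image_path: str, pdf_path: str) -> str:
    """Run the same image preprocessing + OCR steps as run_image_ocr_task, standalone."""
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Grayscale -> Otsu binarization -> median-blur denoise
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    cleaned = cv2.medianBlur(binary, 3)

    pil_image = Image.fromarray(cleaned)
    config = '--psm 3'  # automatic page segmentation, as in the task

    # Searchable PDF bytes, then a plain-text preview from the same image
    pdf_bytes = pytesseract.image_to_pdf_or_hocr(pil_image, extension='pdf', config=config)
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)
    return pytesseract.image_to_string(pil_image, config=config)


if __name__ == "__main__":
    preview = quick_image_ocr("sample.png", "sample_ocr.pdf")
    print(preview[:500])
```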