fixed ocr

2025-09-20 22:42:04 +00:00
parent a5fc23379f
commit 9241c19b08
2 changed files with 119 additions and 16 deletions
--- a/main.py
+++ b/main.py
@@ -10,6 +10,8 @@ import yaml
 import os
 import httpx
 import glob
+import cv2
+import numpy as np
 from contextlib import asynccontextmanager
 from datetime import datetime, timezone
 from pathlib import Path
@@ -1067,7 +1069,8 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr
                     force_ocr=ocr_settings.get('force_ocr', True),
                     clean=ocr_settings.get('clean', True),
                     optimize=ocr_settings.get('optimize', 1),
-                     progress_bar=False)
+                     progress_bar=False,
+                     image_dpi=ocr_settings.get('image_dpi', 300))
        with open(output_path_str, "rb") as f:
            reader = pypdf.PdfReader(f)
            preview = "\n".join(page.extract_text() or "" for page in reader.pages)
@@ -1085,33 +1088,112 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr
        db.close()
        send_webhook_notification(job_id, app_config, base_url)

+def image_adjustment_controller(img, brightness=128,
+               contrast=200):
+  
+    brightness = int((brightness - 0) * (255 - (-255)) / (510 - 0) + (-255))
+    contrast = int((contrast - 0) * (127 - (-127)) / (254 - 0) + (-127))
+    if brightness != 0:
+        if brightness > 0:
+            shadow = brightness
+            max = 255
+        else:
+            shadow = 0
+            max = 255 + brightness
+
+        al_pha = (max - shadow) / 255
+        ga_mma = shadow
+        # The function addWeighted calculates
+        # the weighted sum of two arrays
+        cal = cv2.addWeighted(img, al_pha, 
+                              img, 0, ga_mma)
+    else:
+        cal = img
+    if contrast != 0:
+        Alpha = float(131 * (contrast + 127)) / (127 * (131 - contrast))
+        Gamma = 127 * (1 - Alpha)
+        # The function addWeighted calculates
+        # the weighted sum of two arrays
+        cal = cv2.addWeighted(cal, Alpha, 
+                              cal, 0, Gamma)
+    return cal
+
+
+def preprocess_for_ocr(image_path: str) -> np.ndarray:
+    """Loads an image and applies preprocessing steps for OCR."""
+    # Read the image using OpenCV
+    image = cv2.imread(image_path)
+    
+    # 1. Convert to grayscale
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    contrast_img = image_adjustment_controller(gray, brightness=150, contrast=120)
+    # 2. Binarize the image (Otsu's thresholding is great for this)
+    # This turns the image into pure black and white
+    _, binary_image = cv2.threshold(contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    
+    # 3. Denoise the image (NOTE: the result is not used below; the function returns binary_image)
+    denoised_image = cv2.medianBlur(gray, 3)
+    
+    return binary_image
+
+
@huey.task()
 def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str, app_config: dict, base_url: str):
+    """
+    Performs OCR on an image file, first applying preprocessing steps to clean
+    the image, and then saving the output as a searchable PDF.
+    """
    db = SessionLocal()
    input_path = Path(input_path_str)
    try:
        job = get_job(db, job_id)
        if not job or job.status == 'cancelled':
+            logger.warning(f"OCR job {job_id} was cancelled or not found. Aborting task.")
            return
-        update_job_status(db, job_id, "processing", progress=50)
-        logger.info(f"Starting Image OCR for job {job_id}")
-        text = pytesseract.image_to_string(Image.open(str(input_path)))
-        out_path = Path(output_path_str)
-        tmp_out = out_path.with_name(f"{out_path.stem}.tmp-{uuid.uuid4().hex}{out_path.suffix}")
-        with tmp_out.open("w", encoding="utf-8") as f:
-            f.write(text)
-        tmp_out.replace(out_path)
-        mark_job_as_completed(db, job_id, output_filepath_str=output_path_str, preview=text)
-        logger.info(f"Image OCR for job {job_id} completed.")
+            
+        update_job_status(db, job_id, "processing")
+        logger.info(f"Starting Image to PDF OCR for job {job_id}")
+
+        # Apply the preprocessing steps to the input image for better accuracy
+        logger.info(f"Preprocessing image for job {job_id}...")
+        preprocessed_image = preprocess_for_ocr(input_path_str)
+
+        # Configure Tesseract for optimal performance.
+        # '--psm 3' enables automatic page segmentation, which is a robust default.
+        # No '-l' language flag is passed, so Tesseract falls back to its default language (eng). This should be made configurable if you support others.
+        tesseract_config = '--psm 3'
+        logger.info(f"Running Tesseract with config: '{tesseract_config}' for job {job_id}")
+
+        # Generate a searchable PDF from the preprocessed image data
+        pdf_bytes = pytesseract.image_to_pdf_or_hocr(
+            Image.fromarray(preprocessed_image),  # Convert numpy array back to PIL Image
+            extension='pdf',
+            config=tesseract_config
+        )
+        with open(output_path_str, "wb") as f:
+            f.write(pdf_bytes)
+
+        # Generate a plain text preview from the same preprocessed image
+        preview_text = pytesseract.image_to_string(
+            Image.fromarray(preprocessed_image),
+            config=tesseract_config
+        )
+        
+        mark_job_as_completed(db, job_id, output_filepath_str=output_path_str, preview=preview_text)
+        logger.info(f"Image to PDF OCR for job {job_id} completed successfully.")
+        
    except Exception as e:
-        logger.exception(f"ERROR during Image OCR for job {job_id}")
+        logger.exception(f"ERROR during Image to PDF OCR for job {job_id}")
        update_job_status(db, job_id, "failed", error=f"Image OCR failed: {e}")
+        
    finally:
        try:
+            # Clean up the original uploaded file
            ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR])
            input_path.unlink(missing_ok=True)
        except Exception:
-            logger.exception("Failed to cleanup input file after Image OCR.")
+            logger.exception(f"Failed to cleanup input file for job {job_id}.")
+            
        db.close()
        send_webhook_notification(job_id, app_config, base_url)

@@ -1487,11 +1569,28 @@ async def finalize_upload(request: Request, payload: FinalizeUploadPayload, user
        create_job(db=db, job=job_data)
        run_tts_task(job_data.id, str(final_path), str(processed_path), payload.model_name, tts_config, APP_CONFIG, base_url)
    elif payload.task_type == "ocr":
-        stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix
-        processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}{suffix}"
+        stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix.lower()
+        IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp', '.webp'}
+
+        # 1. Validate file type before creating a job
+        if suffix not in IMAGE_EXTENSIONS and suffix != '.pdf':
+            final_path.unlink(missing_ok=True) # Clean up the uploaded file
+            raise HTTPException(
+                status_code=415, 
+                detail=f"Unsupported file type for OCR: '{suffix}'. Please upload a PDF or a supported image."
+            )
+
+        # 2. Set output path to always be a PDF
+        processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.pdf"
        job_data.processed_filepath = str(processed_path)
        create_job(db=db, job=job_data)
-        run_pdf_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url)
+        
+        # 3. Dispatch to the correct task based on file type
+        if suffix in IMAGE_EXTENSIONS:
+            # Call the existing image task, which is now modified to produce a PDF
+            run_image_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG, base_url)
+        else:  # It must be a .pdf due to the earlier check
+            run_pdf_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url)
    elif payload.task_type == "conversion":
        try:
            tool, task_key = payload.output_format.split('_', 1)
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,8 @@ pytesseract
 pypdf
 piper-tts
 kokoro-tts
+opencv-python-headless
+marker-pdf[full]

 # Configuration & Utilities
 werkzeug
@@ -28,6 +30,8 @@ pydantic-settings
 python-multipart
 markitdown[pdf,docx,pptx,xlsx,outlook]==0.1.3
 PyPDF2==3.0.1
+mechanize
+css_parser
 #PyQt6==6.9.1 # uncomment for calibre, it uses the webengine
 #PyQt6-Qt6==6.9.2
 #PyQt6-WebEngine==6.9.0