fixed ocr

2025-09-20 22:42:04 +00:00
parent a5fc23379f
commit 9241c19b08
2 changed files with 119 additions and 16 deletions
--- a/main.py
+++ b/main.py
@@ -10,6 +10,8 @@ import yaml
 import os
 import httpx
 import glob
 import cv2
 import numpy as np
 from contextlib import asynccontextmanager
 from datetime import datetime, timezone
 from pathlib import Path
@@ -1067,7 +1069,8 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr
                     force_ocr=ocr_settings.get('force_ocr', True),
                     clean=ocr_settings.get('clean', True),
                     optimize=ocr_settings.get('optimize', 1),
-                     progress_bar=False)
+                     progress_bar=False,
                     image_dpi=ocr_settings.get('image_dpi', 300))
        with open(output_path_str, "rb") as f:
            reader = pypdf.PdfReader(f)
            preview = "\n".join(page.extract_text() or "" for page in reader.pages)
@@ -1085,33 +1088,112 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr
        db.close()
        send_webhook_notification(job_id, app_config, base_url)
 def image_adjustment_controller(img, brightness=128, contrast=200):
     """Apply a linear brightness/contrast adjustment using ``cv2.addWeighted``.

     Both controls use slider-style scales that are re-centred before use:
     ``brightness`` 0..510 becomes a shift of -255..255 and ``contrast``
     0..254 becomes a shift of -127..127.  The neutral settings are therefore
     ``brightness=255`` and ``contrast=127``; when both shifts are zero the
     input is returned untouched.  The defaults (128 and 200) map to shifts
     of -127 and +73, i.e. they darken the image and raise its contrast.

     Args:
         img: Image array to adjust; it is passed straight to
             ``cv2.addWeighted`` (presumably an 8-bit image -- confirm
             against callers).
         brightness: Brightness setting on the 0..510 scale.  Values outside
             that range are clamped.
         contrast: Contrast setting on the 0..254 scale.  Values outside that
             range are clamped, which also keeps the denominator of the
             contrast gain strictly positive.

     Returns:
         The adjusted image, or ``img`` itself when both settings are neutral.
     """
     # Clamp to the supported slider ranges.  Without this, contrast == 258
     # divides by zero and other out-of-range values yield negative gains.
     brightness = min(max(brightness, 0), 510)
     contrast = min(max(contrast, 0), 254)

     # Re-centre the slider values around zero (truncating toward zero).
     brightness_shift = int(brightness * (255 - (-255)) / 510 + (-255))
     contrast_shift = int(contrast * (127 - (-127)) / 254 + (-127))

     adjusted = img

     if brightness_shift != 0:
         if brightness_shift > 0:
             # Brightening: lift the black point, keep the white point.
             shadow, highlight = brightness_shift, 255
         else:
             # Darkening: keep the black point, lower the white point.
             shadow, highlight = 0, 255 + brightness_shift
         gain = (highlight - shadow) / 255
         # The second source has weight 0, so this evaluates img * gain + shadow.
         adjusted = cv2.addWeighted(img, gain, img, 0, shadow)

     if contrast_shift != 0:
         gain = float(131 * (contrast_shift + 127)) / (127 * (131 - contrast_shift))
         # Offset chosen so that the value 127 maps to itself.
         offset = 127 * (1 - gain)
         adjusted = cv2.addWeighted(adjusted, gain, adjusted, 0, offset)

     return adjusted
 def preprocess_for_ocr(image_path: str) -> np.ndarray:
     """Load an image from disk and binarize it for OCR.

     Pipeline: read with OpenCV, convert to grayscale, apply a fixed
     brightness/contrast adjustment, then binarize with Otsu's threshold.

     Args:
         image_path: Filesystem path of the image to load.

     Returns:
         The thresholded single-channel image produced by ``cv2.threshold``.

     Raises:
         ValueError: If OpenCV cannot read or decode ``image_path``.
     """
     image = cv2.imread(image_path)
     if image is None:
         # cv2.imread signals a missing, unreadable or unsupported file by
         # returning None rather than raising; fail here with a clear message
         # instead of an opaque error from cvtColor further down.
         raise ValueError(f"Could not read image for OCR preprocessing: {image_path}")

     # 1. Convert to grayscale.
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

     # 2. Adjust brightness and contrast before thresholding.
     contrast_img = image_adjustment_controller(gray, brightness=150, contrast=120)

     # 3. Binarize to pure black and white.  THRESH_OTSU is requested, so the
     #    threshold is chosen by OpenCV rather than taken from the 0 passed here.
     _, binary_image = cv2.threshold(contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

     # NOTE: no denoising step is applied.  A median blur used to be computed
     # on the grayscale image here, but its result was never used.
     return binary_image
@huey.task()
def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str, app_config: dict, base_url: str):
    """
    Run OCR on an uploaded image and save the result as a searchable PDF.

    The image is preprocessed with ``preprocess_for_ocr``, passed to Tesseract
    once to build the PDF and once to build a plain-text preview, and the job
    is then marked completed.  Any exception marks the job as failed.  In all
    cases the uploaded input file is removed, the DB session is closed and the
    webhook notification is sent.

    Args:
        job_id: Identifier of the job to process.
        input_path_str: Path of the uploaded image; deleted when the task ends.
        output_path_str: Destination path of the generated PDF.
        app_config: Application configuration, forwarded to the webhook sender.
        base_url: Base URL, forwarded to the webhook sender.
    """
    # Local import so the atomic-write code below does not depend on a
    # module-level `uuid` import being present.
    import uuid

    db = SessionLocal()
    input_path = Path(input_path_str)
    try:
        job = get_job(db, job_id)
        if not job or job.status == 'cancelled':
            logger.warning(f"OCR job {job_id} was cancelled or not found. Aborting task.")
            return

        update_job_status(db, job_id, "processing")
        logger.info(f"Starting Image to PDF OCR for job {job_id}")

        # Apply the preprocessing steps to the input image for better accuracy
        logger.info(f"Preprocessing image for job {job_id}...")
        preprocessed_image = preprocess_for_ocr(input_path_str)
        # Convert the numpy array to a PIL image once; both Tesseract calls reuse it.
        ocr_image = Image.fromarray(preprocessed_image)

        # '--psm 3' enables automatic page segmentation, which is a robust default.
        # No language flag is passed, so Tesseract's default language applies;
        # make this configurable if other languages must be supported.
        tesseract_config = '--psm 3'
        logger.info(f"Running Tesseract with config: '{tesseract_config}' for job {job_id}")

        # Generate a searchable PDF from the preprocessed image data
        pdf_bytes = pytesseract.image_to_pdf_or_hocr(
            ocr_image,
            extension='pdf',
            config=tesseract_config
        )

        # Write to a uniquely named sibling file and move it into place, so a
        # failure mid-write never leaves a truncated PDF at the final path.
        out_path = Path(output_path_str)
        tmp_out = out_path.with_name(f"{out_path.stem}.tmp-{uuid.uuid4().hex}{out_path.suffix}")
        try:
            tmp_out.write_bytes(pdf_bytes)
            tmp_out.replace(out_path)
        finally:
            # No-op after a successful replace; removes the leftover otherwise.
            tmp_out.unlink(missing_ok=True)

        # Generate a plain text preview from the same preprocessed image
        preview_text = pytesseract.image_to_string(
            ocr_image,
            config=tesseract_config
        )

        mark_job_as_completed(db, job_id, output_filepath_str=output_path_str, preview=preview_text)
        logger.info(f"Image to PDF OCR for job {job_id} completed successfully.")
    except Exception as e:
        logger.exception(f"ERROR during Image to PDF OCR for job {job_id}")
        update_job_status(db, job_id, "failed", error=f"Image OCR failed: {e}")
    finally:
        try:
            # Clean up the original uploaded file
            ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR])
            input_path.unlink(missing_ok=True)
        except Exception:
            logger.exception(f"Failed to cleanup input file for job {job_id}.")
        db.close()
        send_webhook_notification(job_id, app_config, base_url)
@@ -1487,10 +1569,27 @@ async def finalize_upload(request: Request, payload: FinalizeUploadPayload, user
        create_job(db=db, job=job_data)
        run_tts_task(job_data.id, str(final_path), str(processed_path), payload.model_name, tts_config, APP_CONFIG, base_url)
    elif payload.task_type == "ocr":
-        stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix
+        stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix.lower()
-        processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}{suffix}"
+        IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp', '.webp'}
        # 1. Validate file type before creating a job
        if suffix not in IMAGE_EXTENSIONS and suffix != '.pdf':
            final_path.unlink(missing_ok=True) # Clean up the uploaded file
            raise HTTPException(
                status_code=415, 
                detail=f"Unsupported file type for OCR: '{suffix}'. Please upload a PDF or a supported image."
            )
        # 2. Set output path to always be a PDF
        processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.pdf"
        job_data.processed_filepath = str(processed_path)
        create_job(db=db, job=job_data)
        # 3. Dispatch to the correct task based on file type
        if suffix in IMAGE_EXTENSIONS:
            # Call the existing image task, which is now modified to produce a PDF
            run_image_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG, base_url)
        else:  # It must be a .pdf due to the earlier check
            run_pdf_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url)
    elif payload.task_type == "conversion":
        try:
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,8 @@ pytesseract
 pypdf
 piper-tts
 kokoro-tts
 opencv-python-headless
 marker-pdf[full]
 # Configuration & Utilities
 werkzeug
@@ -28,6 +30,8 @@ pydantic-settings
 python-multipart
 markitdown[pdf,docx,pptx,xlsx,outlook]==0.1.3
 PyPDF2==3.0.1
 mechanize
 css_parser
 #PyQt6==6.9.1 # uncomment for calibre, it uses the webengine
 #PyQt6-Qt6==6.9.2
 #PyQt6-WebEngine==6.9.0