stability and settings
This commit is contained in:
571
main.py
571
main.py
@@ -1,64 +1,99 @@
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
import traceback
|
||||
import uuid
|
||||
import shlex
|
||||
import yaml
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Set
|
||||
from typing import Dict, List, Any
|
||||
|
||||
import ocrmypdf
|
||||
import pypdf
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from faster_whisper import WhisperModel
|
||||
# MODIFICATION: Added Form for model selection
|
||||
from fastapi import (Depends, FastAPI, File, Form, HTTPException, Request,
|
||||
UploadFile, status)
|
||||
from fastapi.responses import FileResponse
|
||||
UploadFile, status, Body)
|
||||
from fastapi.responses import FileResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from huey import SqliteHuey
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from pydantic_settings import BaseSettings
|
||||
from sqlalchemy import (Column, DateTime, Integer, String, Text,
|
||||
create_engine)
|
||||
create_engine, delete, event)
|
||||
from sqlalchemy.pool import NullPool
|
||||
from string import Formatter
|
||||
from sqlalchemy.orm import Session, declarative_base, sessionmaker
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# --- 1. CONFIGURATION
|
||||
# --------------------------------------------------------------------------------
|
||||
class Settings(BaseSettings):
|
||||
|
||||
class AppPaths(BaseModel):
|
||||
BASE_DIR: Path = Path(__file__).resolve().parent
|
||||
UPLOADS_DIR: Path = BASE_DIR / "uploads"
|
||||
PROCESSED_DIR: Path = BASE_DIR / "processed"
|
||||
DATABASE_URL: str = f"sqlite:///{BASE_DIR / 'jobs.db'}"
|
||||
HUEY_DB_PATH: str = str(BASE_DIR / "huey.db")
|
||||
# MODIFICATION: Removed hardcoded model size, added a set of allowed models
|
||||
WHISPER_COMPUTE_TYPE: str = "int8"
|
||||
ALLOWED_WHISPER_MODELS: Set[str] = {"tiny", "base", "small", "medium", "large-v3", "distil-large-v2"}
|
||||
MAX_FILE_SIZE_BYTES: int = 500 * 1024 * 1024 # 500 MB
|
||||
ALLOWED_PDF_EXTENSIONS: set = {".pdf"}
|
||||
ALLOWED_IMAGE_EXTENSIONS: set = {".png", ".jpg", ".jpeg", ".tiff", ".tif"}
|
||||
ALLOWED_AUDIO_EXTENSIONS: set = {".mp3", "m4a", ".ogg", ".flac", ".opus"}
|
||||
SETTINGS_FILE: Path = BASE_DIR / "settings.yml"
|
||||
|
||||
settings = Settings()
|
||||
PATHS = AppPaths()
|
||||
APP_CONFIG: Dict[str, Any] = {}
|
||||
|
||||
def load_app_config():
|
||||
global APP_CONFIG
|
||||
try:
|
||||
with open(PATHS.SETTINGS_FILE, 'r') as f:
|
||||
APP_CONFIG = yaml.safe_load(f)
|
||||
APP_CONFIG['app_settings']['max_file_size_bytes'] = APP_CONFIG['app_settings']['max_file_size_mb'] * 1024 * 1024
|
||||
allowed_extensions = {
|
||||
".pdf", ".ps", ".eps", ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".gif",
|
||||
".bmp", ".webp", ".svg", ".jxl", ".avif", ".ppm", ".mp3", ".m4a", ".ogg",
|
||||
".flac", ".opus", ".wav", ".aac", ".mp4", ".mkv", ".mov", ".webm", ".avi",
|
||||
".flv", ".md", ".txt", ".html", ".docx", ".odt", ".rst", ".epub", ".mobi",
|
||||
".azw3", ".pptx", ".xlsx"
|
||||
}
|
||||
APP_CONFIG['app_settings']['allowed_all_extensions'] = allowed_extensions
|
||||
logger.info("Successfully loaded settings from settings.yml")
|
||||
except (FileNotFoundError, yaml.YAMLError) as e:
|
||||
logger.error(f"Could not load settings.yml: {e}. App may not function correctly.")
|
||||
APP_CONFIG = {}
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
settings.UPLOADS_DIR.mkdir(exist_ok=True)
|
||||
settings.PROCESSED_DIR.mkdir(exist_ok=True)
|
||||
|
||||
PATHS.UPLOADS_DIR.mkdir(exist_ok=True)
|
||||
PATHS.PROCESSED_DIR.mkdir(exist_ok=True)
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# --- 2. DATABASE (for Job Tracking) - NO CHANGES
|
||||
# --- 2. DATABASE & Schemas
|
||||
# --------------------------------------------------------------------------------
|
||||
engine = create_engine(settings.DATABASE_URL, connect_args={"check_same_thread": False})
|
||||
engine = create_engine(
|
||||
PATHS.DATABASE_URL,
|
||||
connect_args={"check_same_thread": False, "timeout": 30},
|
||||
poolclass=NullPool,
|
||||
)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
# THIS IS THE CRITICAL FIX
|
||||
Base = declarative_base()
|
||||
|
||||
@event.listens_for(engine, "connect")
|
||||
def _set_sqlite_pragmas(dbapi_connection, connection_record):
|
||||
"""
|
||||
Enable WAL mode and set sane synchronous for better concurrency
|
||||
between the FastAPI process and Huey worker processes.
|
||||
"""
|
||||
c = dbapi_connection.cursor()
|
||||
try:
|
||||
c.execute("PRAGMA journal_mode=WAL;")
|
||||
c.execute("PRAGMA synchronous=NORMAL;")
|
||||
finally:
|
||||
c.close()
|
||||
|
||||
class Job(Base):
|
||||
__tablename__ = "jobs"
|
||||
id = Column(String, primary_key=True, index=True)
|
||||
@@ -80,10 +115,6 @@ def get_db():
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# --- 3. PYDANTIC SCHEMAS (Data Validation) - NO CHANGES
|
||||
# --------------------------------------------------------------------------------
|
||||
class JobCreate(BaseModel):
|
||||
id: str
|
||||
task_type: str
|
||||
@@ -104,9 +135,8 @@ class JobSchema(BaseModel):
|
||||
updated_at: datetime
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# --- 4. CRUD OPERATIONS (Database Interactions) - NO CHANGES
|
||||
# --- 3. CRUD OPERATIONS (No Changes)
|
||||
# --------------------------------------------------------------------------------
|
||||
def get_job(db: Session, job_id: str):
|
||||
return db.query(Job).filter(Job.id == job_id).first()
|
||||
@@ -143,37 +173,101 @@ def mark_job_as_completed(db: Session, job_id: str, preview: str | None = None):
|
||||
db.commit()
|
||||
return db_job
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# --- 5. BACKGROUND TASKS (Huey)
|
||||
# --- 4. BACKGROUND TASK SETUP
|
||||
# --------------------------------------------------------------------------------
|
||||
huey = SqliteHuey(filename=settings.HUEY_DB_PATH)
|
||||
huey = SqliteHuey(filename=PATHS.HUEY_DB_PATH)
|
||||
|
||||
# --- START: NEW WHISPER MODEL CACHING ---
|
||||
# This dictionary will live in the memory of the Huey worker process,
|
||||
# allowing us to reuse loaded models across tasks.
|
||||
WHISPER_MODELS_CACHE: Dict[str, WhisperModel] = {}
|
||||
|
||||
def get_whisper_model(model_size: str, whisper_settings: dict) -> WhisperModel:
|
||||
"""
|
||||
Loads a Whisper model into the cache if not present, and returns the model.
|
||||
This ensures a model is only loaded into memory once per worker process.
|
||||
"""
|
||||
if model_size not in WHISPER_MODELS_CACHE:
|
||||
compute_type = whisper_settings.get('compute_type', 'int8')
|
||||
logger.info(f"Whisper model '{model_size}' not in cache. Loading into memory...")
|
||||
model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
|
||||
WHISPER_MODELS_CACHE[model_size] = model
|
||||
logger.info(f"Model '{model_size}' loaded successfully.")
|
||||
else:
|
||||
logger.info(f"Found model '{model_size}' in cache. Reusing.")
|
||||
return WHISPER_MODELS_CACHE[model_size]
|
||||
# --- END: NEW WHISPER MODEL CACHING ---
|
||||
|
||||
# MODIFICATION: Removed global whisper model and lazy loader.
|
||||
# The model will now be loaded inside the task itself based on user selection.
|
||||
|
||||
@huey.task()
|
||||
def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str):
|
||||
def run_transcription_task(job_id: str, input_path_str: str, output_path_str: str, model_size: str, whisper_settings: dict):
|
||||
db = SessionLocal()
|
||||
try:
|
||||
job = get_job(db, job_id)
|
||||
if not job or job.status == 'cancelled':
|
||||
logger.info(f"Job {job_id} was cancelled before starting.")
|
||||
return
|
||||
if not job or job.status == 'cancelled': return
|
||||
|
||||
update_job_status(db, job_id, "processing")
|
||||
|
||||
# --- MODIFIED: Use the caching function to get the model ---
|
||||
model = get_whisper_model(model_size, whisper_settings)
|
||||
|
||||
logger.info(f"Starting transcription for job {job_id}")
|
||||
segments, info = model.transcribe(input_path_str, beam_size=5)
|
||||
|
||||
full_transcript = []
|
||||
for segment in segments:
|
||||
job_check = get_job(db, job_id) # Check for cancellation during long tasks
|
||||
if job_check.status == 'cancelled':
|
||||
logger.info(f"Job {job_id} cancelled during transcription.")
|
||||
return
|
||||
|
||||
if info.duration > 0:
|
||||
progress = int((segment.end / info.duration) * 100)
|
||||
update_job_status(db, job_id, "processing", progress=progress)
|
||||
|
||||
full_transcript.append(segment.text.strip())
|
||||
|
||||
transcript_text = "\n".join(full_transcript)
|
||||
# write atomically to avoid partial files
|
||||
out_path = Path(output_path_str)
|
||||
tmp_out = out_path.with_suffix(out_path.suffix + f".{uuid.uuid4().hex}.tmp")
|
||||
with tmp_out.open("w", encoding="utf-8") as f:
|
||||
f.write(transcript_text)
|
||||
tmp_out.replace(out_path)
|
||||
|
||||
mark_job_as_completed(db, job_id, preview=transcript_text)
|
||||
logger.info(f"Transcription for job {job_id} completed.")
|
||||
except Exception:
|
||||
logger.exception(f"ERROR during transcription for job {job_id}")
|
||||
update_job_status(db, job_id, "failed", error="See server logs for details.")
|
||||
finally:
|
||||
Path(input_path_str).unlink(missing_ok=True)
|
||||
db.close()
|
||||
|
||||
# Other tasks remain unchanged
|
||||
@huey.task()
|
||||
def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr_settings: dict):
|
||||
db = SessionLocal()
|
||||
try:
|
||||
job = get_job(db, job_id)
|
||||
if not job or job.status == 'cancelled': return
|
||||
update_job_status(db, job_id, "processing")
|
||||
logger.info(f"Starting PDF OCR for job {job_id}")
|
||||
|
||||
ocrmypdf.ocr(input_path_str, output_path_str, deskew=True, force_ocr=True, clean=True, optimize=1, progress_bar=False)
|
||||
|
||||
ocrmypdf.ocr(input_path_str, output_path_str,
|
||||
deskew=ocr_settings.get('deskew', True),
|
||||
force_ocr=ocr_settings.get('force_ocr', True),
|
||||
clean=ocr_settings.get('clean', True),
|
||||
optimize=ocr_settings.get('optimize', 1),
|
||||
progress_bar=False)
|
||||
with open(output_path_str, "rb") as f:
|
||||
reader = pypdf.PdfReader(f)
|
||||
preview = "\n".join(page.extract_text() or "" for page in reader.pages)
|
||||
mark_job_as_completed(db, job_id, preview=preview)
|
||||
logger.info(f"PDF OCR for job {job_id} completed.")
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR during PDF OCR for job {job_id}: {e}\n{traceback.format_exc()}")
|
||||
update_job_status(db, job_id, "failed", error=str(e))
|
||||
except Exception:
|
||||
logger.exception(f"ERROR during PDF OCR for job {job_id}")
|
||||
update_job_status(db, job_id, "failed", error="See server logs for details.")
|
||||
finally:
|
||||
Path(input_path_str).unlink(missing_ok=True)
|
||||
db.close()
|
||||
@@ -183,10 +277,7 @@ def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str):
|
||||
db = SessionLocal()
|
||||
try:
|
||||
job = get_job(db, job_id)
|
||||
if not job or job.status == 'cancelled':
|
||||
logger.info(f"Job {job_id} was cancelled before starting.")
|
||||
return
|
||||
|
||||
if not job or job.status == 'cancelled': return
|
||||
update_job_status(db, job_id, "processing", progress=50)
|
||||
logger.info(f"Starting Image OCR for job {job_id}")
|
||||
text = pytesseract.image_to_string(Image.open(input_path_str))
|
||||
@@ -194,175 +285,324 @@ def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str):
|
||||
f.write(text)
|
||||
mark_job_as_completed(db, job_id, preview=text)
|
||||
logger.info(f"Image OCR for job {job_id} completed.")
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR during Image OCR for job {job_id}: {e}\n{traceback.format_exc()}")
|
||||
update_job_status(db, job_id, "failed", error=str(e))
|
||||
except Exception:
|
||||
logger.exception(f"ERROR during Image OCR for job {job_id}")
|
||||
update_job_status(db, job_id, "failed", error="See server logs for details.")
|
||||
finally:
|
||||
Path(input_path_str).unlink(missing_ok=True)
|
||||
db.close()
|
||||
|
||||
# MODIFICATION: The task now accepts `model_size` and loads the model dynamically.
|
||||
|
||||
@huey.task()
|
||||
def run_transcription_task(job_id: str, input_path_str: str, output_path_str: str, model_size: str):
|
||||
def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, tool: str, task_key: str, conversion_tools_config: dict):
|
||||
db = SessionLocal()
|
||||
temp_input_file = None
|
||||
temp_output_file = None
|
||||
try:
|
||||
job = get_job(db, job_id)
|
||||
if not job or job.status == 'cancelled':
|
||||
logger.info(f"Job {job_id} was cancelled before starting.")
|
||||
return
|
||||
if not job or job.status == 'cancelled': return
|
||||
update_job_status(db, job_id, "processing", progress=25)
|
||||
logger.info(f"Starting conversion for job {job_id} using {tool} with task {task_key}")
|
||||
tool_config = conversion_tools_config.get(tool)
|
||||
if not tool_config: raise ValueError(f"Unknown conversion tool: {tool}")
|
||||
input_path = Path(input_path_str)
|
||||
output_path = Path(output_path_str)
|
||||
current_input_path = input_path
|
||||
if tool == "mozjpeg":
|
||||
temp_input_file = input_path.with_suffix('.temp.ppm')
|
||||
logger.info(f"Pre-converting for MozJPEG: {input_path} -> {temp_input_file}")
|
||||
pre_conv_cmd = ["vips", "copy", str(input_path), str(temp_input_file)]
|
||||
pre_conv_result = subprocess.run(pre_conv_cmd, capture_output=True, text=True, check=False, timeout=tool_config.get("timeout", 300))
|
||||
if pre_conv_result.returncode != 0:
|
||||
err = (pre_conv_result.stderr or "")[:4000]
|
||||
raise Exception(f"MozJPEG pre-conversion to PPM failed: {err}")
|
||||
current_input_path = temp_input_file
|
||||
update_job_status(db, job_id, "processing", progress=50)
|
||||
# Build safe mapping for formatting and validate placeholders
|
||||
ALLOWED_VARS = {"input", "output", "output_dir", "output_ext", "quality", "speed", "preset", "device", "dpi", "samplerate", "bitdepth"}
|
||||
def validate_and_build_command(template_str: str, mapping: dict):
|
||||
fmt = Formatter()
|
||||
used = {fname for _, fname, _, _ in fmt.parse(template_str) if fname}
|
||||
bad = used - ALLOWED_VARS
|
||||
if bad:
|
||||
raise ValueError(f"Command template contains disallowed placeholders: {bad}")
|
||||
formatted = template_str.format(**mapping)
|
||||
return shlex.split(formatted)
|
||||
|
||||
update_job_status(db, job_id, "processing")
|
||||
|
||||
# Load the specified model for this task
|
||||
logger.info(f"Loading faster-whisper model: {model_size} for job {job_id}...")
|
||||
model = WhisperModel(
|
||||
model_size,
|
||||
device="cpu",
|
||||
compute_type=settings.WHISPER_COMPUTE_TYPE
|
||||
)
|
||||
logger.info(f"Whisper model '{model_size}' loaded successfully.")
|
||||
|
||||
logger.info(f"Starting transcription for job {job_id}")
|
||||
segments, info = model.transcribe(input_path_str, beam_size=5)
|
||||
|
||||
full_transcript = []
|
||||
total_duration = info.duration
|
||||
for segment in segments:
|
||||
job_check = get_job(db, job_id)
|
||||
if job_check.status == 'cancelled':
|
||||
logger.info(f"Job {job_id} cancelled during transcription.")
|
||||
return
|
||||
# Use a temporary output path and atomically move into place after success
|
||||
temp_output_file = output_path.with_suffix(output_path.suffix + f".{uuid.uuid4().hex}.tmp")
|
||||
|
||||
# Update progress based on the segment's end time
|
||||
if total_duration > 0:
|
||||
progress = int((segment.end / total_duration) * 100)
|
||||
update_job_status(db, job_id, "processing", progress=progress)
|
||||
full_transcript.append(segment.text.strip())
|
||||
# Prepare mapping
|
||||
mapping = {
|
||||
"input": str(current_input_path),
|
||||
"output": str(temp_output_file),
|
||||
"output_dir": str(output_path.parent),
|
||||
"output_ext": output_path.suffix.lstrip('.'),
|
||||
}
|
||||
|
||||
transcript_text = "\n".join(full_transcript)
|
||||
with open(output_path_str, "w", encoding="utf-8") as f:
|
||||
f.write(transcript_text)
|
||||
mark_job_as_completed(db, job_id, preview=transcript_text)
|
||||
logger.info(f"Transcription for job {job_id} completed.")
|
||||
# Allow tool-specific adjustments to mapping
|
||||
if tool.startswith("ghostscript"):
|
||||
device, setting = task_key.split('_')
|
||||
mapping.update({"device": device, "dpi": setting, "preset": setting})
|
||||
elif tool == "pngquant":
|
||||
_, quality_key = task_key.split('_')
|
||||
quality_map = {"hq": "80-95", "mq": "65-80", "fast": "65-80"}
|
||||
speed_map = {"hq": "1", "mq": "3", "fast": "11"}
|
||||
mapping.update({"quality": quality_map.get(quality_key, "65-80"), "speed": speed_map.get(quality_key, "3")})
|
||||
elif tool == "sox":
|
||||
_, rate, depth = task_key.split('_')
|
||||
rate = rate.replace('k', '000') if 'k' in rate else rate
|
||||
depth = depth.replace('b', '') if 'b' in depth else '16'
|
||||
mapping.update({"samplerate": rate, "bitdepth": depth})
|
||||
elif tool == "mozjpeg":
|
||||
_, quality = task_key.split('_')
|
||||
quality = quality.replace('q', '')
|
||||
mapping.update({"quality": quality})
|
||||
|
||||
command_template_str = tool_config["command_template"]
|
||||
command = validate_and_build_command(command_template_str, mapping)
|
||||
logger.info(f"Executing command: {' '.join(command)}")
|
||||
# run with timeout and capture output; run_command helper ensures trimmed logs on failure
|
||||
def run_command(argv: List[str], timeout: int = 300):
|
||||
try:
|
||||
res = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout)
|
||||
except subprocess.TimeoutExpired:
|
||||
raise Exception(f"Command timed out after {timeout}s")
|
||||
if res.returncode != 0:
|
||||
stderr = (res.stderr or "")[:4000]
|
||||
stdout = (res.stdout or "")[:4000]
|
||||
raise Exception(f"Command failed exit {res.returncode}. stderr: {stderr}; stdout: {stdout}")
|
||||
return res
|
||||
|
||||
result = run_command(command, timeout=tool_config.get("timeout", 300))
|
||||
if tool == "libreoffice":
|
||||
expected_output_filename = input_path.with_suffix(output_path.suffix).name
|
||||
generated_file = output_path.parent / expected_output_filename
|
||||
if generated_file.exists():
|
||||
# move generated file into place
|
||||
generated_file.replace(output_path)
|
||||
else:
|
||||
raise Exception(f"LibreOffice did not create the expected file: {expected_output_filename}")
|
||||
# move temp output into final location atomically
|
||||
if temp_output_file and temp_output_file.exists():
|
||||
temp_output_file.replace(output_path)
|
||||
|
||||
mark_job_as_completed(db, job_id, preview=f"Successfully converted file.")
|
||||
logger.info(f"Conversion for job {job_id} completed.")
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR during transcription for job {job_id}: {e}\n{traceback.format_exc()}")
|
||||
update_job_status(db, job_id, "failed", error=str(e))
|
||||
logger.exception(f"ERROR during conversion for job {job_id}")
|
||||
update_job_status(db, job_id, "failed", error="See server logs for details.")
|
||||
finally:
|
||||
Path(input_path_str).unlink(missing_ok=True)
|
||||
if temp_input_file:
|
||||
temp_input_file.unlink(missing_ok=True)
|
||||
if temp_output_file:
|
||||
temp_output_file.unlink(missing_ok=True)
|
||||
db.close()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# --- 6. FASTAPI APPLICATION
|
||||
# --- 5. FASTAPI APPLICATION
|
||||
# --------------------------------------------------------------------------------
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
logger.info("Application starting up...")
|
||||
Base.metadata.create_all(bind=engine)
|
||||
load_app_config()
|
||||
yield
|
||||
logger.info("Application shutting down...")
|
||||
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
app.mount("/static", StaticFiles(directory=settings.BASE_DIR / "static"), name="static")
|
||||
templates = Jinja2Templates(directory=settings.BASE_DIR / "templates")
|
||||
app.mount("/static", StaticFiles(directory=PATHS.BASE_DIR / "static"), name="static")
|
||||
templates = Jinja2Templates(directory=PATHS.BASE_DIR / "templates")
|
||||
|
||||
# --- Helper Functions ---
|
||||
async def save_upload_file_chunked(upload_file: UploadFile, destination: Path):
|
||||
"""
|
||||
Streams the uploaded file in chunks directly to a file on disk.
|
||||
This is memory-efficient and reliable for large files.
|
||||
"""
|
||||
max_size = APP_CONFIG.get("app_settings", {}).get("max_file_size_bytes", 100 * 1024 * 1024)
|
||||
tmp = destination.with_suffix(destination.suffix + f".{uuid.uuid4().hex}.tmp")
|
||||
size = 0
|
||||
with open(destination, "wb") as buffer:
|
||||
while chunk := await upload_file.read(1024 * 1024): # 1MB chunks
|
||||
if size + len(chunk) > settings.MAX_FILE_SIZE_BYTES:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
|
||||
detail=f"File exceeds limit of {settings.MAX_FILE_SIZE_BYTES // 1024 // 1024} MB"
|
||||
)
|
||||
buffer.write(chunk)
|
||||
size += len(chunk)
|
||||
try:
|
||||
with tmp.open("wb") as buffer:
|
||||
while True:
|
||||
chunk = await upload_file.read(1024 * 1024)
|
||||
if not chunk:
|
||||
break
|
||||
size += len(chunk)
|
||||
if size > max_size:
|
||||
raise HTTPException(status_code=413, detail=f"File exceeds {max_size / 1024 / 1024} MB limit")
|
||||
buffer.write(chunk)
|
||||
# atomic move into place
|
||||
tmp.replace(destination)
|
||||
except Exception:
|
||||
tmp.unlink(missing_ok=True)
|
||||
raise
|
||||
|
||||
|
||||
def is_allowed_file(filename: str, allowed_extensions: set) -> bool:
|
||||
return Path(filename).suffix.lower() in allowed_extensions
|
||||
|
||||
# --- API Endpoints ---
|
||||
@app.get("/")
|
||||
async def get_index(request: Request):
|
||||
# MODIFICATION: Pass available models to the template
|
||||
return templates.TemplateResponse("index.html", {
|
||||
"request": request,
|
||||
"whisper_models": sorted(list(settings.ALLOWED_WHISPER_MODELS))
|
||||
})
|
||||
# --- Routes (only transcription route is modified) ---
|
||||
|
||||
@app.post("/ocr-pdf", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def submit_pdf_ocr(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
||||
if not is_allowed_file(file.filename, settings.ALLOWED_PDF_EXTENSIONS):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
|
||||
|
||||
job_id = uuid.uuid4().hex
|
||||
safe_basename = secure_filename(file.filename)
|
||||
unique_filename = f"{Path(safe_basename).stem}_{job_id}{Path(safe_basename).suffix}"
|
||||
upload_path = settings.UPLOADS_DIR / unique_filename
|
||||
processed_path = settings.PROCESSED_DIR / unique_filename
|
||||
|
||||
await save_upload_file_chunked(file, upload_path)
|
||||
|
||||
job_data = JobCreate(id=job_id, task_type="ocr", original_filename=file.filename, input_filepath=str(upload_path), processed_filepath=str(processed_path))
|
||||
new_job = create_job(db=db, job=job_data)
|
||||
|
||||
run_pdf_ocr_task(new_job.id, str(upload_path), str(processed_path))
|
||||
return {"job_id": new_job.id, "status": new_job.status}
|
||||
|
||||
@app.post("/ocr-image", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def submit_image_ocr(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
||||
if not is_allowed_file(file.filename, settings.ALLOWED_IMAGE_EXTENSIONS):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PNG, JPG, or TIFF.")
|
||||
|
||||
job_id = uuid.uuid4().hex
|
||||
safe_basename = secure_filename(file.filename)
|
||||
file_ext = Path(safe_basename).suffix
|
||||
unique_filename = f"{Path(safe_basename).stem}_{job_id}{file_ext}"
|
||||
upload_path = settings.UPLOADS_DIR / unique_filename
|
||||
processed_path = settings.PROCESSED_DIR / f"{Path(safe_basename).stem}_{job_id}.txt"
|
||||
|
||||
await save_upload_file_chunked(file, upload_path)
|
||||
|
||||
job_data = JobCreate(id=job_id, task_type="ocr-image", original_filename=file.filename, input_filepath=str(upload_path), processed_filepath=str(processed_path))
|
||||
new_job = create_job(db=db, job=job_data)
|
||||
|
||||
run_image_ocr_task(new_job.id, str(upload_path), str(processed_path))
|
||||
return {"job_id": new_job.id, "status": new_job.status}
|
||||
|
||||
# MODIFICATION: Endpoint now accepts `model_size` as form data.
|
||||
@app.post("/transcribe-audio", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def submit_audio_transcription(
|
||||
file: UploadFile = File(...),
|
||||
model_size: str = Form("base"),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
if not is_allowed_file(file.filename, settings.ALLOWED_AUDIO_EXTENSIONS):
|
||||
if not is_allowed_file(file.filename, {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus"}):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid audio file type.")
|
||||
|
||||
# Validate the selected model size
|
||||
if model_size not in settings.ALLOWED_WHISPER_MODELS:
|
||||
whisper_config = APP_CONFIG.get("transcription_settings", {}).get("whisper", {})
|
||||
if model_size not in whisper_config.get("allowed_models", []):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid model size: {model_size}.")
|
||||
|
||||
job_id = uuid.uuid4().hex
|
||||
safe_basename = secure_filename(file.filename)
|
||||
stem, suffix = Path(safe_basename).stem, Path(safe_basename).suffix
|
||||
|
||||
|
||||
audio_filename = f"{stem}_{job_id}{suffix}"
|
||||
transcript_filename = f"{stem}_{job_id}.txt"
|
||||
upload_path = settings.UPLOADS_DIR / audio_filename
|
||||
processed_path = settings.PROCESSED_DIR / transcript_filename
|
||||
upload_path = PATHS.UPLOADS_DIR / audio_filename
|
||||
processed_path = PATHS.PROCESSED_DIR / transcript_filename
|
||||
|
||||
await save_upload_file_chunked(file, upload_path)
|
||||
|
||||
|
||||
job_data = JobCreate(id=job_id, task_type="transcription", original_filename=file.filename, input_filepath=str(upload_path), processed_filepath=str(processed_path))
|
||||
new_job = create_job(db=db, job=job_data)
|
||||
|
||||
# Pass the selected model size to the background task
|
||||
run_transcription_task(new_job.id, str(upload_path), str(processed_path), model_size=model_size)
|
||||
|
||||
# --- MODIFIED: Pass whisper_config to the task ---
|
||||
run_transcription_task(new_job.id, str(upload_path), str(processed_path), model_size=model_size, whisper_settings=whisper_config)
|
||||
|
||||
return {"job_id": new_job.id, "status": new_job.status}
|
||||
|
||||
@app.post("/job/{job_id}/cancel", status_code=status.HTTP_200_OK)
|
||||
|
||||
# --- Other routes remain unchanged ---
|
||||
|
||||
@app.get("/")
|
||||
async def get_index(request: Request):
|
||||
whisper_models = APP_CONFIG.get("transcription_settings", {}).get("whisper", {}).get("allowed_models", [])
|
||||
conversion_tools = APP_CONFIG.get("conversion_tools", {})
|
||||
return templates.TemplateResponse("index.html", {
|
||||
"request": request,
|
||||
"whisper_models": sorted(list(whisper_models)),
|
||||
"conversion_tools": conversion_tools
|
||||
})
|
||||
|
||||
@app.get("/settings")
|
||||
async def get_settings_page(request: Request):
|
||||
try:
|
||||
with open(PATHS.SETTINGS_FILE, 'r') as f:
|
||||
current_config = yaml.safe_load(f)
|
||||
except Exception as e:
|
||||
logger.error(f"Could not load settings.yml for settings page: {e}")
|
||||
current_config = {}
|
||||
return templates.TemplateResponse("settings.html", {"request": request, "config": current_config})
|
||||
|
||||
@app.post("/settings/save")
|
||||
async def save_settings(new_config: Dict = Body(...)):
|
||||
try:
|
||||
with open(PATHS.SETTINGS_FILE, 'w') as f:
|
||||
yaml.dump(new_config, f, default_flow_style=False, sort_keys=False)
|
||||
load_app_config()
|
||||
return JSONResponse({"message": "Settings saved successfully."})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save settings: {e}")
|
||||
raise HTTPException(status_code=500, detail="Could not write to settings.yml.")
|
||||
|
||||
@app.post("/settings/clear-history")
|
||||
async def clear_job_history(db: Session = Depends(get_db)):
|
||||
try:
|
||||
num_deleted = db.query(Job).delete()
|
||||
db.commit()
|
||||
logger.info(f"Cleared {num_deleted} jobs from history.")
|
||||
return {"deleted_count": num_deleted}
|
||||
except Exception as e:
|
||||
db.rollback()
|
||||
logger.error(f"Failed to clear job history: {e}")
|
||||
raise HTTPException(status_code=500, detail="Database error while clearing history.")
|
||||
|
||||
@app.post("/settings/delete-files")
|
||||
async def delete_processed_files():
|
||||
deleted_count = 0
|
||||
errors = []
|
||||
for f in PATHS.PROCESSED_DIR.glob('*'):
|
||||
try:
|
||||
if f.is_file():
|
||||
f.unlink()
|
||||
deleted_count += 1
|
||||
except Exception as e:
|
||||
errors.append(f.name)
|
||||
logger.error(f"Could not delete processed file {f.name}: {e}")
|
||||
if errors:
|
||||
raise HTTPException(status_code=500, detail=f"Could not delete some files: {', '.join(errors)}")
|
||||
logger.info(f"Deleted {deleted_count} files from processed directory.")
|
||||
return {"deleted_count": deleted_count}
|
||||
|
||||
@app.post("/convert-file", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def submit_file_conversion(file: UploadFile = File(...), output_format: str = Form(...), db: Session = Depends(get_db)):
|
||||
allowed_exts = APP_CONFIG.get("app_settings", {}).get("allowed_all_extensions", set())
|
||||
if not is_allowed_file(file.filename, allowed_exts):
|
||||
raise HTTPException(status_code=400, detail=f"File type '{Path(file.filename).suffix}' not allowed.")
|
||||
conversion_tools = APP_CONFIG.get("conversion_tools", {})
|
||||
try:
|
||||
tool, task_key = output_format.split('_', 1)
|
||||
if tool not in conversion_tools or task_key not in conversion_tools[tool]["formats"]:
|
||||
raise ValueError()
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=400, detail="Invalid output format selected.")
|
||||
job_id = uuid.uuid4().hex
|
||||
safe_basename = secure_filename(file.filename)
|
||||
original_stem = Path(safe_basename).stem
|
||||
target_ext = task_key.split('_')[0]
|
||||
if tool == "ghostscript_pdf":
|
||||
target_ext = "pdf"
|
||||
upload_filename = f"{original_stem}_{job_id}{Path(safe_basename).suffix}"
|
||||
processed_filename = f"{original_stem}_{job_id}.{target_ext}"
|
||||
upload_path = PATHS.UPLOADS_DIR / upload_filename
|
||||
processed_path = PATHS.PROCESSED_DIR / processed_filename
|
||||
await save_upload_file_chunked(file, upload_path)
|
||||
job_data = JobCreate(id=job_id, task_type="conversion", original_filename=file.filename,
|
||||
input_filepath=str(upload_path), processed_filepath=str(processed_path))
|
||||
new_job = create_job(db=db, job=job_data)
|
||||
run_conversion_task(new_job.id, str(upload_path), str(processed_path), tool, task_key, conversion_tools)
|
||||
return {"job_id": new_job.id, "status": new_job.status}
|
||||
|
||||
@app.post("/ocr-pdf", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def submit_pdf_ocr(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
||||
if not is_allowed_file(file.filename, {".pdf"}):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
|
||||
job_id = uuid.uuid4().hex
|
||||
safe_basename = secure_filename(file.filename)
|
||||
unique_filename = f"{Path(safe_basename).stem}_{job_id}{Path(safe_basename).suffix}"
|
||||
upload_path = PATHS.UPLOADS_DIR / unique_filename
|
||||
processed_path = PATHS.PROCESSED_DIR / unique_filename
|
||||
await save_upload_file_chunked(file, upload_path)
|
||||
job_data = JobCreate(id=job_id, task_type="ocr", original_filename=file.filename, input_filepath=str(upload_path), processed_filepath=str(processed_path))
|
||||
new_job = create_job(db=db, job=job_data)
|
||||
ocr_settings = APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {})
|
||||
run_pdf_ocr_task(new_job.id, str(upload_path), str(processed_path), ocr_settings)
|
||||
return {"job_id": new_job.id, "status": new_job.status}
|
||||
|
||||
@app.post("/ocr-image", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def submit_image_ocr(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
||||
allowed_exts = {".png", ".jpg", ".jpeg", ".tiff", ".tif"}
|
||||
if not is_allowed_file(file.filename, allowed_exts):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PNG, JPG, or TIFF.")
|
||||
job_id = uuid.uuid4().hex
|
||||
safe_basename = secure_filename(file.filename)
|
||||
file_ext = Path(safe_basename).suffix
|
||||
unique_filename = f"{Path(safe_basename).stem}_{job_id}{file_ext}"
|
||||
upload_path = PATHS.UPLOADS_DIR / unique_filename
|
||||
processed_path = PATHS.PROCESSED_DIR / f"{Path(safe_basename).stem}_{job_id}.txt"
|
||||
await save_upload_file_chunked(file, upload_path)
|
||||
job_data = JobCreate(id=job_id, task_type="ocr-image", original_filename=file.filename, input_filepath=str(upload_path), processed_filepath=str(processed_path))
|
||||
new_job = create_job(db=db, job=job_data)
|
||||
run_image_ocr_task(new_job.id, str(upload_path), str(processed_path))
|
||||
return {"job_id": new_job.id, "status": new_job.status}
|
||||
|
||||
@app.post("/job/{job_id}/cancel", status_code=status.HTTP_202_ACCEPTED)
|
||||
async def cancel_job(job_id: str, db: Session = Depends(get_db)):
|
||||
job = get_job(db, job_id)
|
||||
if not job:
|
||||
@@ -386,12 +626,13 @@ async def get_job_status(job_id: str, db: Session = Depends(get_db)):
|
||||
@app.get("/download/{filename}")
|
||||
async def download_file(filename: str):
|
||||
safe_filename = secure_filename(filename)
|
||||
file_path = settings.PROCESSED_DIR / safe_filename
|
||||
|
||||
if not file_path.resolve().is_relative_to(settings.PROCESSED_DIR.resolve()):
|
||||
file_path = PATHS.PROCESSED_DIR / safe_filename
|
||||
file_path = file_path.resolve()
|
||||
base = PATHS.PROCESSED_DIR.resolve()
|
||||
try:
|
||||
file_path.relative_to(base)
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=403, detail="Access denied.")
|
||||
|
||||
if not file_path.is_file():
|
||||
raise HTTPException(status_code=404, detail="File not found.")
|
||||
|
||||
return FileResponse(path=file_path, filename=safe_filename, media_type="application/octet-stream")
|
||||
Reference in New Issue
Block a user