From ed58453466a84e3c2dcd7627829dc8350f0a6c06 Mon Sep 17 00:00:00 2001
From: Manuel
Date: Sat, 20 Sep 2025 12:49:48 +0000
Subject: [PATCH] TTS and webhooks

---
 .env.example | 3 +-
 .gitignore | 5 +-
 docker-compose.yml | 1 +
 main.py | 1293 ++++++++++++++++++++++++++++++++++--------
 requirements.txt | 2 +
 settings.default.yml | 30 +-
 static/css/style.css | 2 +-
 static/js/script.js | 218 +++++--
 templates/index.html | 30 +-
 9 files changed, 1291 insertions(+), 293 deletions(-)

diff --git a/.env.example b/.env.example
index 713031f..f0207bb 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,5 @@
 LOCAL_ONLY=True
 SECRET_KEY=
 UPLOADS_DIR=./uploads
-PROCESSED_DIR=./processed
\ No newline at end of file
+PROCESSED_DIR=./processed
+OMP_NUM_THREADS=1
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 13f2ffb..8a28d1d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,7 @@ settings.yml
 app.log
 /config/*
 my.settings.yml
-jobs.db-*
\ No newline at end of file
+jobs.db-*
+venv312
+models
+config
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index c7f289d..4fc44f6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,6 +10,7 @@ services:
       - SECRET_KEY= # if using auth
       - UPLOADS_DIR=/app/uploads
       - PROCESSED_DIR=/app/processed
+      - OMP_NUM_THREADS=1
       #user: "1000:1000"
     ports:
       - "6969:8000"
diff --git a/main.py b/main.py
index 057f72f..9065391 100644
--- a/main.py
+++ b/main.py
@@ -8,14 +8,20 @@ import uuid
 import shlex
 import yaml
 import os
+import httpx
+import glob
 from contextlib import asynccontextmanager
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional
 import resource
 from threading import Semaphore
 from logging.handlers import RotatingFileHandler
-from urllib.parse import urlencode
+from urllib.parse import urljoin, urlparse
+import sys
+import re
+import importlib
+import collections.abc
 
 import ocrmypdf
 import pypdf
@@ -27,6 +33,7 @@ from fastapi import (Depends, FastAPI, File, Form, HTTPException, Request,
 from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
+from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
 from huey import SqliteHuey
 from pydantic import BaseModel, ConfigDict, field_serializer
 from sqlalchemy import (Column, DateTime, Integer, String, Text,
@@ -39,13 +46,48 @@ from typing import List as TypingList
 from starlette.middleware.sessions import SessionMiddleware
 from authlib.integrations.starlette_client import OAuth
 from dotenv import load_dotenv
+import wave
 
 load_dotenv()
 
+
+# --- Optional Dependency Handling for Piper TTS ---
+# PiperVoice is imported inside the guard as well: the rest of this module
+# checks `PiperVoice is None` to decide whether Piper features are available,
+# so an unconditional top-level import would crash startup when piper-tts is
+# not installed.
+try:
+    from piper import PiperVoice
+    from piper.synthesis import SynthesisConfig
+    # download helpers: some piper versions export download_voice, others expose ensure_voice_exists/find_voice
+    try:
+        # prefer the more explicit helpers if present
+        from piper.download import get_voices, ensure_voice_exists, find_voice, VoiceNotFoundError
+    except Exception:
+        # fall back to older API if available
+        try:
+            from piper.download import get_voices, download_voice, VoiceNotFoundError
+            ensure_voice_exists = None
+            find_voice = None
+        except Exception:
+            # partial import failed -> treat as piper-not-installed for download helpers
+            get_voices = None
+            download_voice = None
+            ensure_voice_exists = None
+            find_voice = None
+            VoiceNotFoundError = None
+except ImportError:
+    PiperVoice = None
+    
SynthesisConfig = None + get_voices = None + download_voice = None + ensure_voice_exists = None + find_voice = None + VoiceNotFoundError = None + + # Instantiate OAuth object (was referenced in code) oauth = OAuth() - # -------------------------------------------------------------------------------- # --- 1. CONFIGURATION & SECURITY HELPERS # -------------------------------------------------------------------------------- @@ -70,9 +108,9 @@ def ensure_path_is_safe(p: Path, allowed_bases: List[Path]): def _limit_resources_preexec(): """Set resource limits for child processes to prevent DoS attacks.""" try: - # 6000s CPU, 2GB address space, i dont know if thats too much tbh + # 6000s CPU, 4GB address space resource.setrlimit(resource.RLIMIT_CPU, (6000, 6000)) - resource.setrlimit(resource.RLIMIT_AS, (4 * 1024 * 1024 * 1024, 2 * 1024 * 1024 * 1024)) + resource.setrlimit(resource.RLIMIT_AS, (4 * 1024 * 1024 * 1024, 4 * 1024 * 1024 * 1024)) except Exception as e: # This may fail in some environments (e.g. Windows, some containers) logging.getLogger(__name__).warning(f"Could not set resource limits: {e}") @@ -100,6 +138,10 @@ class AppPaths(BaseModel): UPLOADS_DIR: Path = UPLOADS_BASE PROCESSED_DIR: Path = PROCESSED_BASE CHUNK_TMP_DIR: Path = CHUNK_TMP_BASE + TTS_MODELS_DIR: Path = BASE_DIR / "models" / "tts" + KOKORO_TTS_MODELS_DIR: Path = BASE_DIR / "models" / "tts" / "kokoro" + KOKORO_MODEL_FILE: Path = KOKORO_TTS_MODELS_DIR / "kokoro-v1.0.onnx" + KOKORO_VOICES_FILE: Path = KOKORO_TTS_MODELS_DIR / "voices-v1.0.bin" DATABASE_URL: str = f"sqlite:///{BASE_DIR / 'jobs.db'}" HUEY_DB_PATH: str = str(BASE_DIR / "huey.db") CONFIG_DIR: Path = BASE_DIR / "config" @@ -112,6 +154,9 @@ PATHS.UPLOADS_DIR.mkdir(exist_ok=True, parents=True) PATHS.PROCESSED_DIR.mkdir(exist_ok=True, parents=True) PATHS.CHUNK_TMP_DIR.mkdir(exist_ok=True, parents=True) PATHS.CONFIG_DIR.mkdir(exist_ok=True, parents=True) +PATHS.TTS_MODELS_DIR.mkdir(exist_ok=True, parents=True) +PATHS.KOKORO_TTS_MODELS_DIR.mkdir(exist_ok=True, parents=True) + def load_app_config(): """ @@ -123,14 +168,18 @@ def load_app_config(): # --- Primary Method: Attempt to load settings.yml --- with open(PATHS.SETTINGS_FILE, 'r', encoding='utf8') as f: cfg_raw = yaml.safe_load(f) or {} - - # This logic block is intentionally duplicated to maintain compatibility + defaults = { - "app_settings": {"max_file_size_mb": 100, "allowed_all_extensions": []}, + "app_settings": {"max_file_size_mb": 100, "allowed_all_extensions": [], "app_public_url": ""}, "transcription_settings": {"whisper": {"allowed_models": ["tiny", "base", "small"], "compute_type": "int8", "device": "cpu"}}, + "tts_settings": { + "piper": {"model_dir": str(PATHS.TTS_MODELS_DIR), "use_cuda": False, "synthesis_config": {"length_scale": 1.0, "noise_scale": 0.667, "noise_w": 0.8}}, + "kokoro": {"model_dir": str(PATHS.KOKORO_TTS_MODELS_DIR), "command_template": "kokoro-tts {input} {output} --model {model_path} --voices {voices_path} --lang {lang} --voice {model_name}"} + }, "conversion_tools": {}, "ocr_settings": {"ocrmypdf": {}}, - "auth_settings": {"oidc_client_id": "", "oidc_client_secret": "", "oidc_server_metadata_url": "", "admin_users": []} + "auth_settings": {"oidc_client_id": "", "oidc_client_secret": "", "oidc_server_metadata_url": "", "admin_users": []}, + "webhook_settings": {"enabled": False, "allow_chunked_api_uploads": False, "allowed_callback_urls": [], "callback_bearer_token": ""} } cfg = defaults.copy() cfg.update(cfg_raw) # Merge loaded settings into defaults @@ -152,13 +201,17 @@ 
def load_app_config(): with open(PATHS.DEFAULT_SETTINGS_FILE, 'r', encoding='utf8') as f: cfg_raw = yaml.safe_load(f) or {} - # The same processing logic is applied to the fallback file defaults = { - "app_settings": {"max_file_size_mb": 100, "allowed_all_extensions": []}, + "app_settings": {"max_file_size_mb": 100, "allowed_all_extensions": [], "app_public_url": ""}, "transcription_settings": {"whisper": {"allowed_models": ["tiny", "base", "small"], "compute_type": "int8", "device": "cpu"}}, + "tts_settings": { + "piper": {"model_dir": str(PATHS.TTS_MODELS_DIR), "use_cuda": False, "synthesis_config": {"length_scale": 1.0, "noise_scale": 0.667, "noise_w": 0.8}}, + "kokoro": {"model_dir": str(PATHS.KOKORO_TTS_MODELS_DIR), "command_template": "kokoro-tts {input} {output} --model {model_path} --voices {voices_path} --lang {lang} --voice {model_name}"} + }, "conversion_tools": {}, "ocr_settings": {"ocrmypdf": {}}, - "auth_settings": {"oidc_client_id": "", "oidc_client_secret": "", "oidc_server_metadata_url": "", "admin_users": []} + "auth_settings": {"oidc_client_id": "", "oidc_client_secret": "", "oidc_server_metadata_url": "", "admin_users": []}, + "webhook_settings": {"enabled": False, "allow_chunked_api_uploads": False, "allowed_callback_urls": [], "callback_bearer_token": ""} } cfg = defaults.copy() cfg.update(cfg_raw) # Merge loaded settings into defaults @@ -177,11 +230,16 @@ def load_app_config(): # --- Final Failsafe: Use hardcoded defaults --- logger.error(f"CRITICAL: Fallback file settings.default.yml also failed: {e_fallback}. Using hardcoded defaults.") APP_CONFIG = { - "app_settings": {"max_file_size_mb": 100, "max_file_size_bytes": 100 * 1024 * 1024, "allowed_all_extensions": set()}, + "app_settings": {"max_file_size_mb": 100, "max_file_size_bytes": 100 * 1024 * 1024, "allowed_all_extensions": set(), "app_public_url": ""}, "transcription_settings": {"whisper": {"allowed_models": ["tiny", "base", "small"], "compute_type": "int8", "device": "cpu"}}, + "tts_settings": { + "piper": {"model_dir": str(PATHS.TTS_MODELS_DIR), "use_cuda": False, "synthesis_config": {"length_scale": 1.0, "noise_scale": 0.667, "noise_w": 0.8}}, + "kokoro": {"model_dir": str(PATHS.KOKORO_TTS_MODELS_DIR), "command_template": "kokoro-tts {input} {output} --model {model_path} --voices {voices_path} --lang {lang} --voice {model_name}"} + }, "conversion_tools": {}, "ocr_settings": {"ocrmypdf": {}}, - "auth_settings": {"oidc_client_id": "", "oidc_client_secret": "", "oidc_server_metadata_url": "", "admin_users": []} + "auth_settings": {"oidc_client_id": "", "oidc_client_secret": "", "oidc_server_metadata_url": "", "admin_users": []}, + "webhook_settings": {"enabled": False, "allow_chunked_api_uploads": False, "allowed_callback_urls": [], "callback_bearer_token": ""} } @@ -219,6 +277,7 @@ class Job(Base): output_filesize = Column(Integer, nullable=True) result_preview = Column(Text, nullable=True) error_message = Column(Text, nullable=True) + callback_url = Column(String, nullable=True) created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), onupdate=lambda: datetime.now(timezone.utc)) @@ -236,6 +295,7 @@ class JobCreate(BaseModel): original_filename: str input_filepath: str input_filesize: int | None = None + callback_url: str | None = None processed_filepath: str | None = None class JobSchema(BaseModel): @@ -262,10 +322,13 @@ class FinalizeUploadPayload(BaseModel): total_chunks: int task_type: str model_size: str = "" + 
model_name: str = "" output_format: str = "" + callback_url: Optional[str] = None # For API chunked uploads + # -------------------------------------------------------------------------------- -# --- 3. CRUD OPERATIONS +# --- 3. CRUD OPERATIONS & WEBHOOKS # -------------------------------------------------------------------------------- def get_job(db: Session, job_id: str): return db.query(Job).filter(Job.id == job_id).first() @@ -312,11 +375,67 @@ def mark_job_as_completed(db: Session, job_id: str, output_filepath_str: str | N db.commit() return db_job +def send_webhook_notification(job_id: str, app_config: Dict[str, Any], base_url: str): + """Sends a notification to the callback URL if one is configured for the job.""" + webhook_config = app_config.get("webhook_settings", {}) + if not webhook_config.get("enabled", False): + return + + db = SessionLocal() + try: + job = get_job(db, job_id) + if not job or not job.callback_url: + return + + download_url = None + if job.status == "completed" and job.processed_filepath: + filename = Path(job.processed_filepath).name + public_url = app_config.get("app_settings", {}).get("app_public_url", base_url) + if not public_url: + logger.warning(f"app_public_url is not set. Cannot generate a full download URL for job {job_id}.") + download_url = f"/download/{filename}" # Relative URL as fallback + else: + download_url = urljoin(public_url, f"/download/{filename}") + + payload = { + "job_id": job.id, + "status": job.status, + "original_filename": job.original_filename, + "download_url": download_url, + "error_message": job.error_message, + "created_at": job.created_at.isoformat() + "Z", + "updated_at": job.updated_at.isoformat() + "Z", + } + + headers = {"Content-Type": "application/json", "User-Agent": "FileProcessor-Webhook/1.0"} + token = webhook_config.get("callback_bearer_token") + if token: + headers["Authorization"] = f"Bearer {token}" + + try: + with httpx.Client() as client: + response = client.post(job.callback_url, json=payload, headers=headers, timeout=15) + response.raise_for_status() + logger.info(f"Sent webhook notification for job {job_id} to {job.callback_url} (Status: {response.status_code})") + except httpx.RequestError as e: + logger.error(f"Failed to send webhook for job {job_id} to {job.callback_url}: {e}") + except httpx.HTTPStatusError as e: + logger.error(f"Webhook for job {job_id} received non-2xx response {e.response.status_code} from {job.callback_url}") + + except Exception as e: + logger.exception(f"An unexpected error occurred in send_webhook_notification for job {job_id}: {e}") + finally: + db.close() + + # -------------------------------------------------------------------------------- # --- 4. 
BACKGROUND TASK SETUP # -------------------------------------------------------------------------------- huey = SqliteHuey(filename=PATHS.HUEY_DB_PATH) WHISPER_MODELS_CACHE: Dict[str, WhisperModel] = {} +PIPER_VOICES_CACHE: Dict[str, "PiperVoice"] = {} +AVAILABLE_TTS_VOICES_CACHE: Dict[str, Any] | None = None + def get_whisper_model(model_size: str, whisper_settings: dict) -> WhisperModel: if model_size in WHISPER_MODELS_CACHE: @@ -331,6 +450,453 @@ def get_whisper_model(model_size: str, whisper_settings: dict) -> WhisperModel: logger.info(f"Model '{model_size}' loaded.") return model +def get_piper_voice(model_name: str, tts_settings: dict | None) -> "PiperVoice": + """ + Load (or download + load) a Piper voice in a robust way: + - Try Python API helpers (get_voices, ensure_voice_exists/find_voice, download_voice) + - On any failure, try CLI fallback (download_voice_cli) + - Attempt to locate model files after download (search subdirs) + - Try re-importing piper if bindings were previously unavailable + """ + # ----- Defensive normalization ----- + if tts_settings is None or not isinstance(tts_settings, dict): + logger.debug("get_piper_voice: normalizing tts_settings (was %r)", tts_settings) + tts_settings = {} + + model_dir_val = tts_settings.get("model_dir", None) + if model_dir_val is None: + model_dir = Path(str(PATHS.TTS_MODELS_DIR)) + else: + try: + model_dir = Path(model_dir_val) + except Exception: + logger.warning("Could not coerce tts_settings['model_dir']=%r to Path; using default.", model_dir_val) + model_dir = Path(str(PATHS.TTS_MODELS_DIR)) + model_dir.mkdir(parents=True, exist_ok=True) + + # If PiperVoice already cached, reuse + if model_name in PIPER_VOICES_CACHE: + logger.info("Reusing cached Piper voice '%s'.", model_name) + return PIPER_VOICES_CACHE[model_name] + + with _model_semaphore: + if model_name in PIPER_VOICES_CACHE: + return PIPER_VOICES_CACHE[model_name] + + # If Python bindings are missing, attempt CLI download first (and try re-import) + if PiperVoice is None: + logger.info("Piper Python bindings missing; attempting CLI download fallback for '%s' before failing import.", model_name) + cli_ok = False + try: + cli_ok = download_voice_cli(model_name, model_dir) + except Exception as e: + logger.warning("CLI download attempt raised: %s", e) + cli_ok = False + + if cli_ok: + # attempt to re-import piper package (maybe import issue was transient) + try: + importlib.invalidate_caches() + piper_mod = importlib.import_module("piper") + from piper import PiperVoice as _PiperVoice # noqa: F401 + from piper.synthesis import SynthesisConfig as _SynthesisConfig # noqa: F401 + globals().update({"PiperVoice": _PiperVoice, "SynthesisConfig": _SynthesisConfig}) + logger.info("Successfully re-imported piper after CLI download.") + except Exception: + logger.warning("Could not import piper after CLI download; bindings still unavailable.") + # If bindings still absent, we cannot load models; raise helpful error + if PiperVoice is None: + raise RuntimeError( + "Piper Python bindings are not installed or failed to import. " + "Tried CLI download fallback but python bindings are still unavailable. " + "Please install 'piper-tts' in the runtime used by this process." + ) + + # Now we have Piper bindings (or they were present to begin with). Attempt Python helpers. 
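+        # Resolution order, as a sketch (helper names are this module's own; the
+        # voice id "en_US-lessac-medium" is only an example):
+        #   1. ensure_voice_exists()/find_voice() from piper.download, when exported
+        #   2. the legacy download_voice() helper
+        #   3. the CLI fallback, roughly:
+        #        python -m piper.download_voices en_US-lessac-medium --data-dir models/tts
+        # Whichever route succeeds first supplies onnx_path/config_path below.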
+        onnx_path = None
+        config_path = None
+
+        # Prefer using get_voices to update the index if available
+        voices_info = None
+        try:
+            if get_voices:
+                try:
+                    voices_info = get_voices(str(model_dir), update_voices=True)
+                except TypeError:
+                    # some versions may not support update_voices kwarg
+                    voices_info = get_voices(str(model_dir))
+        except Exception as e:
+            logger.debug("get_voices failed or unavailable: %s", e)
+            voices_info = None
+
+        try:
+            # Preferred modern helpers
+            if ensure_voice_exists and find_voice:
+                try:
+                    ensure_voice_exists(model_name, [model_dir], model_dir, voices_info)
+                    onnx_path, config_path = find_voice(model_name, [model_dir])
+                except Exception as e:
+                    # Could be VoiceNotFoundError or other download error
+                    logger.warning("ensure/find voice failed for %s: %s", model_name, e)
+                    raise
+            elif download_voice:
+                # older API: call download helper directly
+                try:
+                    download_voice(model_name, model_dir)
+                    # attempt to locate files
+                    onnx_path = model_dir / f"{model_name}.onnx"
+                    config_path = model_dir / f"{model_name}.onnx.json"
+                except Exception:
+                    logger.warning("download_voice failed for %s", model_name)
+                    raise
+            else:
+                # No python download helper available
+                raise RuntimeError("No Python download helper available in installed piper package.")
+        except Exception as py_exc:
+            # Python helper route failed; try CLI fallback BEFORE giving up
+            logger.info("Python download route failed for '%s' (%s). Trying CLI fallback...", model_name, py_exc)
+            try:
+                cli_ok = download_voice_cli(model_name, model_dir)
+            except Exception as e:
+                logger.warning("CLI fallback attempt raised: %s", e)
+                cli_ok = False
+
+            if not cli_ok:
+                # If CLI also failed, re-raise the original python exception to preserve context
+                logger.error("Both Python download helpers and CLI fallback failed for '%s'.", model_name)
+                raise
+
+            # CLI succeeded (or at least returned success); _find_model_files already
+            # searches model_dir recursively, so a single pass also covers voices the
+            # CLI wrote into nested dirs or under slightly different names.
+            onnx_path, config_path = _find_model_files(model_name, model_dir)
+            if not (onnx_path and config_path):
+                logger.error("Model files still missing after CLI fallback for '%s'.", model_name)
+                raise RuntimeError(f"Piper voice files for '{model_name}' missing after CLI fallback.")
+            # continue to loading below
+
+        # Final safety check and last-resort search
+        if not (onnx_path and config_path):
+            onnx_path, config_path = _find_model_files(model_name, model_dir)
+
+        if not (onnx_path and config_path):
+            raise RuntimeError(f"Piper voice files for '{model_name}' are missing after attempts to download.")
+
+        # Load the PiperVoice
+        try:
+            use_cuda = bool(tts_settings.get("use_cuda", False))
+            voice = PiperVoice.load(str(onnx_path), config_path=str(config_path), use_cuda=use_cuda)
+            PIPER_VOICES_CACHE[model_name] = voice
+            logger.info("Loaded Piper voice '%s' from %s", model_name, onnx_path)
+            return voice
+        except Exception as e:
+            logger.exception("Failed to load Piper voice '%s' from files (%s, %s): %s", model_name, onnx_path, config_path, e)
+            raise
+
+
+def _find_model_files(model_name: str, model_dir: Path):
+    """
+    Try multiple strategies to find onnx and config files for a given model_name under model_dir.
+    Returns (onnx_path, config_path) or (None, None).
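+    For example (voice id illustrative): "en_US-lessac-medium" should resolve to the
+    pair en_US-lessac-medium.onnx / en_US-lessac-medium.onnx.json, found either
+    directly in model_dir or nested one level down after a CLI download.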
+ """ + # direct files in model_dir + onnx = model_dir / f"{model_name}.onnx" + cfg = model_dir / f"{model_name}.onnx.json" + if onnx.exists() and cfg.exists(): + return onnx, cfg + + # possible alternative names or nested directories: search recursively + matches_onnx = list(model_dir.rglob(f"{model_name}*.onnx")) + matches_cfg = list(model_dir.rglob(f"{model_name}*.onnx.json")) + if matches_onnx and matches_cfg: + # prefer same directory match + for o in matches_onnx: + for c in matches_cfg: + if o.parent == c.parent: + return o, c + # otherwise return first matches + return matches_onnx[0], matches_cfg[0] + + # last-resort: any onnx + any json in same subdir that contain model name token + for o in model_dir.rglob("*.onnx"): + if model_name in o.name: + # try find any matching json in same dir + cands = list(o.parent.glob("*.onnx.json")) + if cands: + return o, cands[0] + + return None, None + + +# --------------------------- +# CLI: list available voices +# --------------------------- +def list_voices_cli(timeout: int = 30, python_executables: Optional[List[str]] = None) -> List[str]: + """ + Run `python -m piper.download_voices` (no args) and parse output into a list of voice IDs. + Returns [] on failure. + """ + if python_executables is None: + python_executables = [sys.executable, "python3", "python"] + + # Regex: voice ids look like en_US-lessac-medium (letters/digits/._-) + voice_regex = re.compile(r'^([A-Za-z0-9_\-\.]+)') + + for py in python_executables: + cmd = [py, "-m", "piper.download_voices"] + try: + logger.debug("Trying Piper CLI list: %s", shlex.join(cmd)) + cp = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + timeout=timeout, + ) + out = cp.stdout.strip() + # If stdout empty, sometimes the script writes to stderr + if not out: + out = cp.stderr.strip() + + if not out: + logger.debug("Piper CLI listed nothing (empty output) for %s", py) + continue + + voices = [] + for line in out.splitlines(): + line = line.strip() + if not line: + continue + # Try to extract first token that matches voice id pattern + m = voice_regex.match(line) + if m: + v = m.group(1) + # basic sanity: avoid capturing words like 'Available' or headings + if re.search(r'\d', v) or '-' in v or '_' in v or '.' in v: + voices.append(v) + else: + # allow alphabetic tokens too (defensive) + voices.append(v) + else: + # Also handle lines like " - en_US-lessac-medium: description" + parts = re.split(r'[:\s]+', line) + if parts: + candidate = parts[0].lstrip('-').strip() + if candidate: + voices.append(candidate) + # Dedupe while preserving order + seen = set() + dedup = [] + for v in voices: + if v not in seen: + seen.add(v) + dedup.append(v) + logger.info("Piper CLI list returned %d voices via %s", len(dedup), py) + return dedup + except subprocess.CalledProcessError as e: + logger.debug("Piper CLI list (%s) non-zero exit. 
stdout=%s stderr=%s", py, e.stdout, e.stderr) + except FileNotFoundError: + logger.debug("Python executable not found: %s", py) + except subprocess.TimeoutExpired: + logger.warning("Piper CLI list timed out for %s", py) + except Exception as e: + logger.exception("Unexpected error running Piper CLI list with %s: %s", py, e) + + logger.error("All Piper CLI list attempts failed.") + return [] + +# --------------------------- +# CLI: download a voice +# --------------------------- +def download_voice_cli(model_name: str, model_dir: Path, python_executables: Optional[List[str]] = None, timeout: int = 300) -> bool: + """ + Try to download a Piper voice using CLI: + python -m piper.download_voices --data-dir + Returns True if the CLI ran and expected files exist afterwards (best effort). + """ + if python_executables is None: + python_executables = [sys.executable, "python3", "python"] + + model_dir = Path(model_dir) + model_dir.mkdir(parents=True, exist_ok=True) + + for py in python_executables: + cmd = [py, "-m", "piper.download_voices", model_name, "--data-dir", str(model_dir)] + try: + logger.info("Trying Piper CLI download: %s", shlex.join(cmd)) + cp = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + timeout=timeout, + ) + logger.debug("Piper CLI download stdout: %s", cp.stdout) + logger.debug("Piper CLI download stderr: %s", cp.stderr) + # Heuristic success check + onnx = model_dir / f"{model_name}.onnx" + cfg = model_dir / f"{model_name}.onnx.json" + if onnx.exists() and cfg.exists(): + logger.info("Piper CLI created expected files for %s", model_name) + return True + # Some versions might create nested dirs; treat non-error CLI execution as success (caller will re-check) + return True + except subprocess.CalledProcessError as e: + logger.warning("Piper CLI (%s) returned non-zero exit. stdout: %s; stderr: %s", py, e.stdout, e.stderr) + except FileNotFoundError: + logger.debug("Python executable %s not found.", py) + except subprocess.TimeoutExpired: + logger.warning("Piper CLI call timed out for python %s", py) + except Exception as e: + logger.exception("Unexpected error running Piper CLI download with %s: %s", py, e) + + logger.error("All Piper CLI attempts failed for model %s", model_name) + return False + +# --------------------------- +# Safe get_voices wrapper +# --------------------------- +def safe_get_voices(model_dir: Path) -> List[Dict]: + """ + Try to call the in-Python get_voices(..., update_voices=True) and return a list of dicts. + If that fails, fall back to list_voices_cli() and return a list of simple dicts: + [{"id": "en_US-lessac-medium", "name": "...", "local": False}, ...] + Keeps the shape flexible so your existing endpoint can use it with minimal changes. + """ + # Prefer Python API if available + try: + if get_voices: # get_voices imported earlier in your file + # Ensure up-to-date index (like CLI) + raw = get_voices(str(model_dir), update_voices=True) + # get_voices may already return the desired structure; normalise to a list of dicts + if isinstance(raw, dict): + # some versions return mapping id->meta + items = [] + for vid, meta in raw.items(): + d = {"id": vid} + if isinstance(meta, dict): + d.update(meta) + items.append(d) + return items + elif isinstance(raw, list): + return raw + else: + # unknown format -> fall back to CLI + logger.debug("get_voices returned unexpected type; falling back to CLI list.") + except Exception as e: + logger.warning("In-Python get_voices failed: %s. 
Falling back to CLI listing.", e) + + # CLI fallback: parse voice ids and create simple dicts + cli_list = list_voices_cli() + results = [{"id": vid, "name": vid, "local": False} for vid in cli_list] + return results + +def list_kokoro_voices_cli(timeout: int = 60) -> List[str]: + """ + Run `kokoro-tts --help-voices` and parse the output for available models. + Returns [] on failure. + """ + model_path = PATHS.KOKORO_MODEL_FILE + voices_path = PATHS.KOKORO_VOICES_FILE + if not (model_path.exists() and voices_path.exists()): + logger.warning("Cannot list Kokoro TTS voices because model/voices files are missing.") + return [] + + cmd = ["kokoro-tts", "--help-voices", "--model", str(model_path), "--voices", str(voices_path)] + try: + logger.debug("Trying Kokoro TTS CLI list: %s", shlex.join(cmd)) + cp = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + timeout=timeout, + ) + out = cp.stdout.strip() + if not out: + out = cp.stderr.strip() + + if not out: + logger.warning("Kokoro TTS CLI list returned no output.") + return [] + + voices = [] + voice_pattern = re.compile(r'^\s*\d+\.\s+([a-z]{2,3}_[a-zA-Z0-9]+)$') + for line in out.splitlines(): + line = line.strip() + match = voice_pattern.match(line) + if match: + voices.append(match.group(1)) + + logger.info("Kokoro TTS CLI list returned %d voices.", len(voices)) + return sorted(list(set(voices))) + except FileNotFoundError: + logger.info("Kokoro TTS ('kokoro-tts' command) not found in PATH. Kokoro TTS support disabled.") + return [] + except subprocess.CalledProcessError as e: + logger.error("Kokoro TTS CLI list command failed. stderr: %s", e.stderr[:1000]) + return [] + except subprocess.TimeoutExpired: + logger.warning("Kokoro TTS CLI list command timed out.") + return [] + except Exception as e: + logger.exception("Unexpected error running Kokoro TTS CLI list: %s", e) + return [] + +def list_kokoro_languages_cli(timeout: int = 60) -> List[str]: + """ + Run `kokoro-tts --help-languages` and parse the output for available languages. + Returns [] on failure. + """ + model_path = PATHS.KOKORO_MODEL_FILE + voices_path = PATHS.KOKORO_VOICES_FILE + if not (model_path.exists() and voices_path.exists()): + logger.warning("Cannot list Kokoro TTS languages because model/voices files are missing.") + return [] + + cmd = ["kokoro-tts", "--help-languages", "--model", str(model_path), "--voices", str(voices_path)] + try: + logger.debug("Trying Kokoro TTS language list: %s", shlex.join(cmd)) + cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True, timeout=timeout) + out = cp.stdout.strip() + if not out: + out = cp.stderr.strip() + + if not out: + logger.warning("Kokoro TTS language list returned no output.") + return [] + + languages = [] + lang_pattern = re.compile(r'^\s*([a-z]{2,3}(?:-[a-z]{2,3})?)$') + for line in out.splitlines(): + line = line.strip() + if line.lower().startswith("supported languages"): + continue + match = lang_pattern.match(line) + if match: + languages.append(match.group(1)) + + logger.info("Kokoro TTS language list returned %d languages.", len(languages)) + return sorted(list(set(languages))) + except FileNotFoundError: + logger.info("Kokoro TTS ('kokoro-tts' command) not found in PATH. Kokoro TTS support disabled.") + return [] + except subprocess.CalledProcessError as e: + logger.error("Kokoro TTS language list command failed. 
stderr: %s", e.stderr[:1000]) + return [] + except subprocess.TimeoutExpired: + logger.warning("Kokoro TTS language list command timed out.") + return [] + except Exception as e: + logger.exception("Unexpected error running Kokoro TTS language list: %s", e) + return [] + + def run_command(argv: TypingList[str], timeout: int = 300): try: res = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout, preexec_fn=_limit_resources_preexec) @@ -343,7 +909,11 @@ def run_command(argv: TypingList[str], timeout: int = 300): def validate_and_build_command(template_str: str, mapping: Dict[str, str]) -> TypingList[str]: fmt = Formatter() used = {fname for _, fname, _, _ in fmt.parse(template_str) if fname} - ALLOWED_VARS = {"input", "output", "output_dir", "output_ext", "quality", "speed", "preset", "device", "dpi", "samplerate", "bitdepth", "filter"} + ALLOWED_VARS = { + "input", "output", "output_dir", "output_ext", "quality", "speed", "preset", + "device", "dpi", "samplerate", "bitdepth", "filter", "model_name", + "model_path", "voices_path", "lang" + } bad = used - ALLOWED_VARS if bad: raise ValueError(f"Command template contains disallowed placeholders: {bad}") @@ -354,8 +924,10 @@ def validate_and_build_command(template_str: str, mapping: Dict[str, str]) -> Ty formatted = template_str.format(**safe_mapping) return shlex.split(formatted) +# --- TASK RUNNERS --- +# Each task now accepts app_config and base_url to facilitate webhook notifications @huey.task() -def run_transcription_task(job_id: str, input_path_str: str, output_path_str: str, model_size: str, whisper_settings: dict): +def run_transcription_task(job_id: str, input_path_str: str, output_path_str: str, model_size: str, whisper_settings: dict, app_config: dict, base_url: str): db = SessionLocal() input_path = Path(input_path_str) try: @@ -391,12 +963,96 @@ def run_transcription_task(job_id: str, input_path_str: str, output_path_str: st ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR, PATHS.CHUNK_TMP_DIR]) input_path.unlink(missing_ok=True) except Exception: - # swallow cleanup errors but log logger.exception("Failed to cleanup input file after transcription.") db.close() + send_webhook_notification(job_id, app_config, base_url) @huey.task() -def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr_settings: dict): +def run_tts_task(job_id: str, input_path_str: str, output_path_str: str, model_name: str, tts_settings: dict, app_config: dict, base_url: str): + db = SessionLocal() + input_path = Path(input_path_str) + try: + job = get_job(db, job_id) + if not job or job.status == 'cancelled': + return + + update_job_status(db, job_id, "processing") + + engine, actual_model_name = "piper", model_name + if '/' in model_name: + parts = model_name.split('/', 1) + engine = parts[0] + actual_model_name = parts[1] + + + logger.info(f"Starting TTS for job {job_id} using engine '{engine}' with model '{actual_model_name}'") + out_path = Path(output_path_str) + tmp_out = out_path.with_name(f"{out_path.stem}.tmp-{uuid.uuid4().hex}{out_path.suffix}") + + if engine == "piper": + piper_settings = tts_settings.get("piper", {}) + voice = get_piper_voice(actual_model_name, piper_settings) + + with open(input_path, 'r', encoding='utf-8') as f: + text_to_speak = f.read() + + synthesis_params = piper_settings.get("synthesis_config", {}) + synthesis_config = SynthesisConfig(**synthesis_params) if SynthesisConfig else None + + with wave.open(str(tmp_out), "wb") as wav_file: + 
wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(voice.config.sample_rate) + voice.synthesize_wav(text_to_speak, wav_file, synthesis_config) + + elif engine == "kokoro": + kokoro_settings = tts_settings.get("kokoro", {}) + command_template_str = kokoro_settings.get("command_template") + if not command_template_str: + raise ValueError("Kokoro TTS command_template is not defined in settings.") + + try: + lang, voice_name = actual_model_name.split('/', 1) + except ValueError: + raise ValueError(f"Invalid Kokoro model format. Expected 'lang/voice', but got '{actual_model_name}'.") + + mapping = { + "input": str(input_path), + "output": str(tmp_out), + "lang": lang, + "model_name": voice_name, + "model_path": str(PATHS.KOKORO_MODEL_FILE), + "voices_path": str(PATHS.KOKORO_VOICES_FILE), + } + + command = validate_and_build_command(command_template_str, mapping) + logger.info(f"Executing Kokoro TTS command: {' '.join(command)}") + run_command(command, timeout=kokoro_settings.get("timeout", 300)) + + if not tmp_out.exists(): + raise FileNotFoundError("Kokoro TTS command did not produce an output file.") + + else: + raise ValueError(f"Unsupported TTS engine: {engine}") + + tmp_out.replace(out_path) + mark_job_as_completed(db, job_id, output_filepath_str=output_path_str, preview="Successfully generated audio.") + logger.info(f"TTS for job {job_id} completed.") + + except Exception as e: + logger.exception(f"ERROR during TTS for job {job_id}") + update_job_status(db, job_id, "failed", error=f"TTS failed: {e}") + finally: + try: + ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR, PATHS.CHUNK_TMP_DIR]) + input_path.unlink(missing_ok=True) + except Exception: + logger.exception("Failed to cleanup input file after TTS.") + db.close() + send_webhook_notification(job_id, app_config, base_url) + +@huey.task() +def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr_settings: dict, app_config: dict, base_url: str): db = SessionLocal() input_path = Path(input_path_str) try: @@ -426,9 +1082,10 @@ def run_pdf_ocr_task(job_id: str, input_path_str: str, output_path_str: str, ocr except Exception: logger.exception("Failed to cleanup input file after PDF OCR.") db.close() + send_webhook_notification(job_id, app_config, base_url) @huey.task() -def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str): +def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str, app_config: dict, base_url: str): db = SessionLocal() input_path = Path(input_path_str) try: @@ -455,10 +1112,11 @@ def run_image_ocr_task(job_id: str, input_path_str: str, output_path_str: str): except Exception: logger.exception("Failed to cleanup input file after Image OCR.") db.close() + send_webhook_notification(job_id, app_config, base_url) @huey.task() -def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, tool: str, task_key: str, conversion_tools_config: dict): +def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, tool: str, task_key: str, conversion_tools_config: dict, app_config: dict, base_url: str): db = SessionLocal() input_path = Path(input_path_str) output_path = Path(output_path_str) @@ -476,7 +1134,6 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, current_input_path = input_path - # Pre-processing for specific tools if tool == "mozjpeg": temp_input_file = input_path.with_suffix('.temp.ppm') logger.info(f"Pre-converting for MozJPEG: {input_path} -> 
{temp_input_file}") @@ -489,7 +1146,6 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, update_job_status(db, job_id, "processing", progress=50) - # prepare temporary output and mapping temp_output_file = output_path.with_name(f"{output_path.stem}.tmp-{uuid.uuid4().hex}{output_path.suffix}") mapping = { "input": str(current_input_path), @@ -498,9 +1154,7 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, "output_ext": output_path.suffix.lstrip('.'), } - # tool specific mapping adjustments if tool.startswith("ghostscript"): - # task_key form: "device_setting" parts = task_key.split('_', 1) device = parts[0] if parts else "" setting = parts[1] if len(parts) > 1 else "" @@ -518,7 +1172,6 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, except: _, rate = task_key.split('_') depth = '' - rate = rate.replace('k', '000') if 'k' in rate else rate mapping.update({"samplerate": rate, "bitdepth": depth}) elif tool == "mozjpeg": @@ -534,7 +1187,7 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, command = validate_and_build_command(command_template_str, mapping) logger.info(f"Executing command: {' '.join(command)}") - result = run_command(command, timeout=tool_config.get("timeout", 300)) + run_command(command, timeout=tool_config.get("timeout", 300)) if temp_output_file and temp_output_file.exists(): temp_output_file.replace(output_path) @@ -546,7 +1199,7 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, update_job_status(db, job_id, "failed", error=f"Conversion failed: {e}") finally: try: - ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR]) + ensure_path_is_safe(input_path, [PATHS.UPLOADS_DIR, PATHS.CHUNK_TMP_DIR]) input_path.unlink(missing_ok=True) except Exception: logger.exception("Failed to cleanup main input file after conversion.") @@ -565,16 +1218,51 @@ def run_conversion_task(job_id: str, input_path_str: str, output_path_str: str, except Exception: logger.exception("Failed to cleanup temp output file after conversion.") db.close() + send_webhook_notification(job_id, app_config, base_url) # -------------------------------------------------------------------------------- # --- 5. FASTAPI APPLICATION # -------------------------------------------------------------------------------- +async def download_kokoro_models_if_missing(): + """Checks for Kokoro TTS model files and downloads them if they don't exist.""" + files_to_download = { + "model": {"path": PATHS.KOKORO_MODEL_FILE, "url": "https://github.com/nazdridoy/kokoro-tts/releases/download/v1.0.0/kokoro-v1.0.onnx"}, + "voices": {"path": PATHS.KOKORO_VOICES_FILE, "url": "https://github.com/nazdridoy/kokoro-tts/releases/download/v1.0.0/voices-v1.0.bin"} + } + async with httpx.AsyncClient() as client: + for name, details in files_to_download.items(): + path, url = details["path"], details["url"] + if not path.exists(): + logger.info(f"Kokoro TTS {name} file missing. 
Downloading from {url}...")
+                try:
+                    with path.open("wb") as f:
+                        async with client.stream("GET", url, follow_redirects=True, timeout=300) as response:
+                            response.raise_for_status()
+                            async for chunk in response.aiter_bytes():
+                                f.write(chunk)
+                    logger.info(f"Successfully downloaded Kokoro TTS {name} file to {path}.")
+                except Exception as e:
+                    logger.error(f"Failed to download Kokoro TTS {name} file: {e}")
+                    if path.exists(): path.unlink(missing_ok=True)
+            else:
+                logger.info(f"Found existing Kokoro TTS {name} file at {path}.")
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     logger.info("Application starting up...")
     Base.metadata.create_all(bind=engine)
     load_app_config()
-    ENV = os.environ.get('ENV', 'dev').lower() # probably reduntant because I load the .env at the start but whatever
+
+    # Download required models on startup
+    if shutil.which("kokoro-tts"):
+        await download_kokoro_models_if_missing()
+
+    if PiperVoice is None:
+        logger.warning("piper-tts is not installed. Piper TTS features will be disabled. Install with: pip install piper-tts")
+    if not shutil.which("kokoro-tts"):
+        logger.warning("kokoro-tts command not found in PATH. Kokoro TTS features will be disabled.")
+
+    ENV = os.environ.get('ENV', 'dev').lower()
     ALLOW_LOCAL_ONLY = os.environ.get('ALLOW_LOCAL_ONLY', 'false').lower() == 'true'
     if LOCAL_ONLY_MODE and ENV != 'dev' and not ALLOW_LOCAL_ONLY:
         raise RuntimeError('LOCAL_ONLY_MODE may only be enabled in dev or when ALLOW_LOCAL_ONLY=true is set.')
@@ -605,13 +1293,12 @@ if not SECRET_KEY:
     logger.warning('SECRET_KEY is not set. Generating a temporary key. Sessions will not persist across restarts.')
     SECRET_KEY = os.urandom(24).hex()
 
-# Should probably set https_only=True in production behind HTTPS i guess
 app.add_middleware(
     SessionMiddleware,
     secret_key=SECRET_KEY,
-    https_only=False,
+    https_only=False,  # Set to True if behind HTTPS proxy
    same_site='lax',
-    max_age=14 * 24 * 60 * 60 # 14 days in seconds
+    max_age=14 * 24 * 60 * 60  # 14 days
 )
 
@@ -620,11 +1307,31 @@ app.mount("/static", StaticFiles(directory=str(PATHS.BASE_DIR / "static")), name="static")
 templates = Jinja2Templates(directory=str(PATHS.BASE_DIR / "templates"))
 
 # --- AUTH & USER HELPERS ---
+# auto_error=False so a request without an Authorization header still reaches
+# the handler; with the default, FastAPI would reject it before the
+# LOCAL_ONLY_MODE shortcut in require_api_user could run.
+http_bearer = HTTPBearer(auto_error=False)
+
 def get_current_user(request: Request):
     if LOCAL_ONLY_MODE:
         return {'sub': 'local_user', 'email': 'local@user.com', 'name': 'Local User'}
     return request.session.get('user')
 
+async def require_api_user(request: Request, creds: Optional[HTTPAuthorizationCredentials] = Depends(http_bearer)):
+    """Dependency for API routes requiring OIDC bearer token authentication."""
+    if LOCAL_ONLY_MODE:
+        return {'sub': 'local_api_user', 'email': 'local@api.user.com', 'name': 'Local API User'}
+
+    if not creds:
+        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Not authenticated")
+    token = creds.credentials
+    try:
+        user = await oauth.oidc.userinfo(token={'access_token': token})
+        return dict(user)
+    except Exception as e:
+        logger.error(f"API token validation failed: {e}")
+        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid or expired token")
+
 def is_admin(request: Request) -> bool:
     if LOCAL_ONLY_MODE: return True
     user = get_current_user(request)
@@ -641,17 +1345,54 @@ def require_admin(request: Request):
     if not is_admin(request): raise HTTPException(status_code=403, detail="Administrator privileges required.")
     return True
 
-# --- CHUNKED UPLOADs ---
+# --- FILE SAVING UTILITY ---
+async def save_upload_file(upload_file: UploadFile, destination: Path) -> int:
+    """
+    Saves an uploaded file to a destination, handling size limits.
+    This function is used by both the simple API and the legacy direct-upload routes.
+    """
+    max_size = APP_CONFIG.get("app_settings", {}).get("max_file_size_bytes", 100 * 1024 * 1024)
+    tmp_path = destination.with_name(f"{destination.stem}.tmp-{uuid.uuid4().hex}{destination.suffix}")
+    size = 0
+    try:
+        with tmp_path.open("wb") as buffer:
+            while True:
+                chunk = await upload_file.read(1024 * 1024)
+                if not chunk:
+                    break
+                size += len(chunk)
+                if size > max_size:
+                    raise HTTPException(status_code=413, detail=f"File exceeds {max_size / 1024 / 1024} MB limit")
+                buffer.write(chunk)
+        tmp_path.replace(destination)
+        return size
+    except Exception as e:
+        try:
+            # Ensure temp file is cleaned up on error
+            ensure_path_is_safe(tmp_path, [PATHS.UPLOADS_DIR, PATHS.CHUNK_TMP_DIR])
+            tmp_path.unlink(missing_ok=True)
+        except Exception:
+            logger.exception("Failed to remove temp upload file after error.")
+        # Re-raise the original exception
+        raise e
+    finally:
+        await upload_file.close()
+
+def is_allowed_file(filename: str, allowed_extensions: set) -> bool:
+    if not allowed_extensions:  # If set is empty, allow all
+        return True
+    return Path(filename).suffix.lower() in allowed_extensions
+
+# --- CHUNKED UPLOADS (for UI) ---
 @app.post("/upload/chunk")
 async def upload_chunk(
     chunk: UploadFile = File(...),
     upload_id: str = Form(...),
     chunk_number: int = Form(...),
-    user: dict = Depends(require_user) # AUTHENTICATION
+    user: dict = Depends(require_user)
 ):
     safe_upload_id = secure_filename(upload_id)
-    temp_dir = PATHS.CHUNK_TMP_DIR / safe_upload_id
-    temp_dir = ensure_path_is_safe(temp_dir, [PATHS.CHUNK_TMP_DIR])
+    temp_dir = ensure_path_is_safe(PATHS.CHUNK_TMP_DIR / safe_upload_id, [PATHS.CHUNK_TMP_DIR])
     temp_dir.mkdir(exist_ok=True)
     chunk_path = temp_dir / f"{chunk_number}.chunk"
@@ -677,36 +1418,56 @@ async def _stitch_chunks(temp_dir: Path, final_path: Path, total_chunks: int):
     shutil.rmtree(temp_dir, ignore_errors=True)
 
 @app.post("/upload/finalize", status_code=status.HTTP_202_ACCEPTED)
-async def finalize_upload(payload: FinalizeUploadPayload, user: dict = Depends(require_user), db: Session = Depends(get_db)):
+async def finalize_upload(request: Request, payload: FinalizeUploadPayload, user: dict = Depends(require_user), db: Session = Depends(get_db)):
     safe_upload_id = secure_filename(payload.upload_id)
-    temp_dir = PATHS.CHUNK_TMP_DIR / safe_upload_id
-    temp_dir = ensure_path_is_safe(temp_dir, [PATHS.CHUNK_TMP_DIR])
+    temp_dir = ensure_path_is_safe(PATHS.CHUNK_TMP_DIR / safe_upload_id, [PATHS.CHUNK_TMP_DIR])
     if not temp_dir.is_dir():
         raise HTTPException(status_code=404, detail="Upload session not found or already finalized.")
 
+    # Callback URLs on chunked uploads are an API feature: honour the
+    # allow_chunked_api_uploads switch and the allowlist before accepting one.
+    webhook_config = APP_CONFIG.get("webhook_settings", {})
+    if payload.callback_url:
+        if not (webhook_config.get("enabled", False) and webhook_config.get("allow_chunked_api_uploads", False)):
+            raise HTTPException(status_code=403, detail="Chunked uploads with a callback_url are disabled on this server.")
+        if not is_allowed_callback_url(payload.callback_url, webhook_config.get("allowed_callback_urls", [])):
+            raise HTTPException(status_code=400, detail="Provided callback_url is not allowed.")
+
     job_id = uuid.uuid4().hex
     safe_filename = secure_filename(payload.original_filename)
     final_path = PATHS.UPLOADS_DIR / f"{Path(safe_filename).stem}_{job_id}{Path(safe_filename).suffix}"
     await _stitch_chunks(temp_dir, final_path, payload.total_chunks)
 
+    base_url = str(request.base_url)
     job_data = JobCreate(
         id=job_id, user_id=user['sub'], task_type=payload.task_type,
         original_filename=payload.original_filename, input_filepath=str(final_path),
-        input_filesize=final_path.stat().st_size
+        input_filesize=final_path.stat().st_size,
+        # Persist the callback so send_webhook_notification can fire for this job.
+        callback_url=payload.callback_url
     )
 
+    # --- Task Dispatching for UI chunked uploads ---
    if 
payload.task_type == "transcription": stem = Path(safe_filename).stem processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.txt" job_data.processed_filepath = str(processed_path) create_job(db=db, job=job_data) - run_transcription_task(job_id, str(final_path), str(processed_path), payload.model_size, APP_CONFIG.get("transcription_settings", {}).get("whisper", {})) + run_transcription_task(job_data.id, str(final_path), str(processed_path), payload.model_size, APP_CONFIG.get("transcription_settings", {}).get("whisper", {}), APP_CONFIG, base_url) + elif payload.task_type == "tts": + tts_config = APP_CONFIG.get("tts_settings", {}) + stem = Path(safe_filename).stem + processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.wav" + job_data.processed_filepath = str(processed_path) + create_job(db=db, job=job_data) + run_tts_task(job_data.id, str(final_path), str(processed_path), payload.model_name, tts_config, APP_CONFIG, base_url) elif payload.task_type == "ocr": stem, suffix = Path(safe_filename).stem, Path(safe_filename).suffix processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}{suffix}" job_data.processed_filepath = str(processed_path) create_job(db=db, job=job_data) - run_pdf_ocr_task(job_id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {})) + run_pdf_ocr_task(job_data.id, str(final_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url) elif payload.task_type == "conversion": try: tool, task_key = payload.output_format.split('_', 1) @@ -719,52 +1473,18 @@ async def finalize_upload(payload: FinalizeUploadPayload, user: dict = Depends(r processed_path = PATHS.PROCESSED_DIR / f"{original_stem}_{job_id}.{target_ext}" job_data.processed_filepath = str(processed_path) create_job(db=db, job=job_data) - run_conversion_task(job_id, str(final_path), str(processed_path), tool, task_key, APP_CONFIG.get("conversion_tools", {})) + run_conversion_task(job_data.id, str(final_path), str(processed_path), tool, task_key, APP_CONFIG.get("conversion_tools", {}), APP_CONFIG, base_url) else: final_path.unlink(missing_ok=True) raise HTTPException(status_code=400, detail="Invalid task type.") return {"job_id": job_id, "status": "pending"} -# --- OLD DIRECT-UPLOAD ROUTES (kept for compatibility) --- -# These use the same task functions but accept direct file uploads (no chunking). -async def save_upload_file_chunked(upload_file: UploadFile, destination: Path) -> int: - """ - Write upload to a tmp file in chunks, then atomically move to final destination. - Returns the final size of the file in bytes. 
- """ - max_size = APP_CONFIG.get("app_settings", {}).get("max_file_size_bytes", 100 * 1024 * 1024) - tmp = destination.with_name(f"{destination.stem}.tmp-{uuid.uuid4().hex}{destination.suffix}") - size = 0 - try: - with tmp.open("wb") as buffer: - while True: - chunk = await upload_file.read(1024 * 1024) - if not chunk: - break - size += len(chunk) - if size > max_size: - raise HTTPException(status_code=413, detail=f"File exceeds {max_size / 1024 / 1024} MB limit") - buffer.write(chunk) - tmp.replace(destination) - return size - except Exception: - try: - ensure_path_is_safe(tmp, [PATHS.UPLOADS_DIR, PATHS.CHUNK_TMP_DIR]) - tmp.unlink(missing_ok=True) - except Exception: - logger.exception("Failed to remove temp upload file after error.") - raise - -def is_allowed_file(filename: str, allowed_extensions: set) -> bool: - return Path(filename).suffix.lower() in allowed_extensions - +# --- LEGACY DIRECT-UPLOAD ROUTES (kept for compatibility) --- @app.post("/transcribe-audio", status_code=status.HTTP_202_ACCEPTED) async def submit_audio_transcription( - file: UploadFile = File(...), - model_size: str = Form("base"), - db: Session = Depends(get_db), - user: dict = Depends(require_user) + request: Request, file: UploadFile = File(...), model_size: str = Form("base"), + db: Session = Depends(get_db), user: dict = Depends(require_user) ): allowed_audio_exts = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus"} if not is_allowed_file(file.filename, allowed_audio_exts): @@ -774,105 +1494,282 @@ async def submit_audio_transcription( if model_size not in whisper_config.get("allowed_models", []): raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid model size: {model_size}.") - job_id = uuid.uuid4().hex - safe_basename = secure_filename(file.filename) + job_id, safe_basename = uuid.uuid4().hex, secure_filename(file.filename) stem, suffix = Path(safe_basename).stem, Path(safe_basename).suffix + upload_path = PATHS.UPLOADS_DIR / f"{stem}_{job_id}{suffix}" + processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.txt" + input_size = await save_upload_file(file, upload_path) + base_url = str(request.base_url) - audio_filename = f"{stem}_{job_id}{suffix}" - transcript_filename = f"{stem}_{job_id}.txt" - upload_path = PATHS.UPLOADS_DIR / audio_filename - processed_path = PATHS.PROCESSED_DIR / transcript_filename - - input_size = await save_upload_file_chunked(file, upload_path) - - job_data = JobCreate( - id=job_id, - user_id=user['sub'], - task_type="transcription", - original_filename=file.filename, - input_filepath=str(upload_path), - input_filesize=input_size, - processed_filepath=str(processed_path) - ) + job_data = JobCreate(id=job_id, user_id=user['sub'], task_type="transcription", original_filename=file.filename, + input_filepath=str(upload_path), input_filesize=input_size, processed_filepath=str(processed_path)) new_job = create_job(db=db, job=job_data) - - run_transcription_task(new_job.id, str(upload_path), str(processed_path), model_size, whisper_settings=whisper_config) - + run_transcription_task(new_job.id, str(upload_path), str(processed_path), model_size, whisper_settings=whisper_config, app_config=APP_CONFIG, base_url=base_url) return {"job_id": new_job.id, "status": new_job.status, "status_url": f"/job/{new_job.id}"} @app.post("/convert-file", status_code=status.HTTP_202_ACCEPTED) -async def submit_file_conversion(file: UploadFile = File(...), output_format: str = Form(...), db: Session = Depends(get_db), user: dict = Depends(require_user)): +async def 
submit_file_conversion(request: Request, file: UploadFile = File(...), output_format: str = Form(...), db: Session = Depends(get_db), user: dict = Depends(require_user)): allowed_exts = APP_CONFIG.get("app_settings", {}).get("allowed_all_extensions", set()) if not is_allowed_file(file.filename, allowed_exts): raise HTTPException(status_code=400, detail=f"File type '{Path(file.filename).suffix}' not allowed.") conversion_tools = APP_CONFIG.get("conversion_tools", {}) try: tool, task_key = output_format.split('_', 1) - if tool not in conversion_tools or task_key not in conversion_tools[tool].get("formats", {}): - # fallback: allow tasks that exist but may not be in formats map (some configs only have commands) - if tool not in conversion_tools: - raise ValueError() + if tool not in conversion_tools: raise ValueError() except ValueError: raise HTTPException(status_code=400, detail="Invalid output format selected.") - job_id = uuid.uuid4().hex - safe_basename = secure_filename(file.filename) + + job_id, safe_basename = uuid.uuid4().hex, secure_filename(file.filename) original_stem = Path(safe_basename).stem target_ext = task_key.split('_')[0] - if tool == "ghostscript_pdf": - target_ext = "pdf" - upload_filename = f"{original_stem}_{job_id}{Path(safe_basename).suffix}" - processed_filename = f"{original_stem}_{job_id}.{target_ext}" - upload_path = PATHS.UPLOADS_DIR / upload_filename - processed_path = PATHS.PROCESSED_DIR / processed_filename - input_size = await save_upload_file_chunked(file, upload_path) + if tool == "ghostscript_pdf": target_ext = "pdf" + upload_path = PATHS.UPLOADS_DIR / f"{original_stem}_{job_id}{Path(safe_basename).suffix}" + processed_path = PATHS.PROCESSED_DIR / f"{original_stem}_{job_id}.{target_ext}" + input_size = await save_upload_file(file, upload_path) + base_url = str(request.base_url) + job_data = JobCreate(id=job_id, user_id=user['sub'], task_type="conversion", original_filename=file.filename, - input_filepath=str(upload_path), - input_filesize=input_size, - processed_filepath=str(processed_path)) + input_filepath=str(upload_path), input_filesize=input_size, processed_filepath=str(processed_path)) new_job = create_job(db=db, job=job_data) - run_conversion_task(new_job.id, str(upload_path), str(processed_path), tool, task_key, conversion_tools) + run_conversion_task(new_job.id, str(upload_path), str(processed_path), tool, task_key, conversion_tools, APP_CONFIG, base_url) return {"job_id": new_job.id, "status": new_job.status, "status_url": f"/job/{new_job.id}"} @app.post("/ocr-pdf", status_code=status.HTTP_202_ACCEPTED) -async def submit_pdf_ocr(file: UploadFile = File(...), db: Session = Depends(get_db), user: dict = Depends(require_user)): +async def submit_pdf_ocr(request: Request, file: UploadFile = File(...), db: Session = Depends(get_db), user: dict = Depends(require_user)): if not is_allowed_file(file.filename, {".pdf"}): raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. 
Please upload a PDF.") - job_id = uuid.uuid4().hex - safe_basename = secure_filename(file.filename) + job_id, safe_basename = uuid.uuid4().hex, secure_filename(file.filename) unique_filename = f"{Path(safe_basename).stem}_{job_id}{Path(safe_basename).suffix}" upload_path = PATHS.UPLOADS_DIR / unique_filename processed_path = PATHS.PROCESSED_DIR / unique_filename - input_size = await save_upload_file_chunked(file, upload_path) + input_size = await save_upload_file(file, upload_path) + base_url = str(request.base_url) + job_data = JobCreate(id=job_id, user_id=user['sub'], task_type="ocr", original_filename=file.filename, - input_filepath=str(upload_path), - input_filesize=input_size, - processed_filepath=str(processed_path)) + input_filepath=str(upload_path), input_filesize=input_size, processed_filepath=str(processed_path)) new_job = create_job(db=db, job=job_data) ocr_settings = APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}) - run_pdf_ocr_task(new_job.id, str(upload_path), str(processed_path), ocr_settings) + run_pdf_ocr_task(new_job.id, str(upload_path), str(processed_path), ocr_settings, APP_CONFIG, base_url) return {"job_id": new_job.id, "status": new_job.status, "status_url": f"/job/{new_job.id}"} @app.post("/ocr-image", status_code=status.HTTP_202_ACCEPTED) -async def submit_image_ocr(file: UploadFile = File(...), db: Session = Depends(get_db), user: dict = Depends(require_user)): +async def submit_image_ocr(request: Request, file: UploadFile = File(...), db: Session = Depends(get_db), user: dict = Depends(require_user)): allowed_exts = {".png", ".jpg", ".jpeg", ".tiff", ".tif"} if not is_allowed_file(file.filename, allowed_exts): raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PNG, JPG, or TIFF.") - job_id = uuid.uuid4().hex - safe_basename = secure_filename(file.filename) + job_id, safe_basename = uuid.uuid4().hex, secure_filename(file.filename) file_ext = Path(safe_basename).suffix unique_filename = f"{Path(safe_basename).stem}_{job_id}{file_ext}" upload_path = PATHS.UPLOADS_DIR / unique_filename processed_path = PATHS.PROCESSED_DIR / f"{Path(safe_basename).stem}_{job_id}.txt" - input_size = await save_upload_file_chunked(file, upload_path) + input_size = await save_upload_file(file, upload_path) + base_url = str(request.base_url) + job_data = JobCreate(id=job_id, user_id=user['sub'], task_type="ocr-image", original_filename=file.filename, - input_filepath=str(upload_path), - input_filesize=input_size, - processed_filepath=str(processed_path)) + input_filepath=str(upload_path), input_filesize=input_size, processed_filepath=str(processed_path)) new_job = create_job(db=db, job=job_data) - run_image_ocr_task(new_job.id, str(upload_path), str(processed_path)) + run_image_ocr_task(new_job.id, str(upload_path), str(processed_path), APP_CONFIG, base_url) return {"job_id": new_job.id, "status": new_job.status, "status_url": f"/job/{new_job.id}"} -# --- Routes for auth and pages --- +# -------------------------------------------------------------------------------- +# --- API V1 ROUTES (for programmatic access) +# -------------------------------------------------------------------------------- +def is_allowed_callback_url(url: str, allowed: List[str]) -> bool: + if not allowed: + return False + try: + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + return False + for a in allowed: + ap = urlparse(a) + if ap.scheme and ap.netloc: + if parsed.scheme == ap.scheme and parsed.netloc == ap.netloc: + return True 
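+                    # Matching sketch (hostnames illustrative): an allowlist entry
+                    # "https://n8n.example.com" admits any callback on that exact
+                    # scheme+host, e.g. "https://n8n.example.com/webhook/abc", while
+                    # a bare entry without a scheme falls through to the legacy
+                    # startswith() prefix check below.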
+            else:
+                # support legacy prefix entries - keep fallback
+                if url.startswith(a):
+                    return True
+        return False
+    except Exception:
+        return False
+
+@app.get("/api/v1/tts-voices")
+async def get_tts_voices_list(user: dict = Depends(require_user)):
+    global AVAILABLE_TTS_VOICES_CACHE
+
+    kokoro_available = shutil.which("kokoro-tts") is not None
+    piper_available = PiperVoice is not None
+
+    if not piper_available and not kokoro_available:
+        return JSONResponse(content={"error": "TTS feature not configured on server (no TTS engines found)."}, status_code=501)
+
+    if AVAILABLE_TTS_VOICES_CACHE:
+        return AVAILABLE_TTS_VOICES_CACHE
+
+    all_voices = []
+    try:
+        if piper_available:
+            logger.info("Fetching available Piper voices list...")
+            piper_voices = safe_get_voices(PATHS.TTS_MODELS_DIR)
+            for voice in piper_voices:
+                voice['id'] = f"piper/{voice.get('id')}"
+                voice['name'] = f"Piper: {voice.get('name', voice.get('id'))}"
+            all_voices.extend(piper_voices)
+
+        if kokoro_available:
+            logger.info("Fetching available Kokoro TTS voices and languages...")
+            kokoro_voices = list_kokoro_voices_cli()
+            kokoro_langs = list_kokoro_languages_cli()
+            for lang in kokoro_langs:
+                for voice in kokoro_voices:
+                    all_voices.append({
+                        "id": f"kokoro/{lang}/{voice}",
+                        "name": f"Kokoro ({lang}): {voice}",
+                        "local": False
+                    })
+
+        AVAILABLE_TTS_VOICES_CACHE = sorted(all_voices, key=lambda x: x['name'])
+        return AVAILABLE_TTS_VOICES_CACHE
+    except Exception as e:
+        logger.exception("Could not fetch list of TTS voices.")
+        raise HTTPException(status_code=500, detail=f"Could not retrieve voices list: {e}")
+
+# --- Standard API endpoint (non-chunked) ---
+@app.post("/api/v1/process", status_code=status.HTTP_202_ACCEPTED, tags=["Webhook API"])
+async def api_process_file(
+    request: Request, file: UploadFile = File(...), task_type: str = Form(...), callback_url: str = Form(...),
+    model_size: Optional[str] = Form("base"), model_name: Optional[str] = Form(None),
+    output_format: Optional[str] = Form(None),
+    db: Session = Depends(get_db), user: dict = Depends(require_api_user)
+):
+    """
+    Programmatically submit a file for processing via a single HTTP request.
+    This is the recommended endpoint for services like n8n.
+    Requires bearer token authentication unless in LOCAL_ONLY_MODE.
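+
+    Illustrative example (the hostnames and token are placeholders for your
+    own deployment; httpx is already a project dependency):
+
+        import httpx
+        r = httpx.post(
+            "https://files.example.com/api/v1/process",
+            headers={"Authorization": "Bearer <api-token>"},
+            files={"file": open("scan.pdf", "rb")},
+            data={"task_type": "ocr", "callback_url": "https://n8n.example.com/webhook/ocr-done"},
+        )
+        r.raise_for_status()  # 202 -> {"job_id": "...", "status": "pending"}
+
+    The job result is delivered to callback_url once processing finishes.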
+ """ + webhook_config = APP_CONFIG.get("webhook_settings", {}) + if not webhook_config.get("enabled", False): + raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Webhook processing is disabled on the server.") + + if not is_allowed_callback_url(callback_url, webhook_config.get("allowed_callback_urls", [])): + logger.warning(f"Rejected webhook from user '{user.get('email')}' with disallowed callback URL: {callback_url}") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Provided callback_url is not in the list of allowed URLs.") + + job_id = uuid.uuid4().hex + safe_basename = secure_filename(file.filename) + stem, suffix = Path(safe_basename).stem, Path(safe_basename).suffix + upload_filename = f"{stem}_{job_id}{suffix}" + upload_path = PATHS.UPLOADS_DIR / upload_filename + + try: + input_size = await save_upload_file(file, upload_path) + except HTTPException as e: + raise e # Re-raise exceptions from save_upload_file (e.g., file too large) + except Exception as e: + logger.exception("Failed to save uploaded file for webhook processing.") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save file: {e}") + + base_url = str(request.base_url) + job_data_args = { + "id": job_id, "user_id": user['sub'], "original_filename": file.filename, + "input_filepath": str(upload_path), "input_filesize": input_size, + "callback_url": callback_url, "task_type": task_type, + } + + # --- API Task Dispatching Logic --- + if task_type == "transcription": + whisper_config = APP_CONFIG.get("transcription_settings", {}).get("whisper", {}) + if model_size not in whisper_config.get("allowed_models", []): + raise HTTPException(status_code=400, detail=f"Invalid model_size '{model_size}'") + processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.txt" + job_data_args["processed_filepath"] = str(processed_path) + create_job(db=db, job=JobCreate(**job_data_args)) + run_transcription_task(job_id, str(upload_path), str(processed_path), model_size, whisper_config, APP_CONFIG, base_url) + + elif task_type == "tts": + if not is_allowed_file(file.filename, {".txt"}): + raise HTTPException(status_code=400, detail="Invalid file type for TTS, requires .txt") + if not model_name: + raise HTTPException(status_code=400, detail="model_name is required for TTS task.") + tts_config = APP_CONFIG.get("tts_settings", {}) + processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.wav" + job_data_args["processed_filepath"] = str(processed_path) + create_job(db=db, job=JobCreate(**job_data_args)) + run_tts_task(job_id, str(upload_path), str(processed_path), model_name, tts_config, APP_CONFIG, base_url) + + elif task_type == "conversion": + if not output_format: + raise HTTPException(status_code=400, detail="output_format is required for conversion task.") + conversion_tools = APP_CONFIG.get("conversion_tools", {}) + try: + tool, task_key = output_format.split('_', 1) + if tool not in conversion_tools: raise ValueError("Invalid tool") + except ValueError: + raise HTTPException(status_code=400, detail="Invalid output_format selected.") + target_ext = task_key.split('_')[0] + if tool == "ghostscript_pdf": target_ext = "pdf" + processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.{target_ext}" + job_data_args["processed_filepath"] = str(processed_path) + create_job(db=db, job=JobCreate(**job_data_args)) + run_conversion_task(job_id, str(upload_path), str(processed_path), tool, task_key, conversion_tools, APP_CONFIG, base_url) + + elif task_type == "ocr": + if 
+            raise HTTPException(status_code=400, detail="Invalid file type for ocr, requires .pdf")
+        processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}{suffix}"
+        job_data_args["processed_filepath"] = str(processed_path)
+        create_job(db=db, job=JobCreate(**job_data_args))
+        run_pdf_ocr_task(job_id, str(upload_path), str(processed_path), APP_CONFIG.get("ocr_settings", {}).get("ocrmypdf", {}), APP_CONFIG, base_url)
+
+    elif task_type == "ocr-image":
+        if not is_allowed_file(file.filename, {".png", ".jpg", ".jpeg", ".tiff", ".tif"}):
+            raise HTTPException(status_code=400, detail="Invalid file type for ocr-image.")
+        processed_path = PATHS.PROCESSED_DIR / f"{stem}_{job_id}.txt"
+        job_data_args["processed_filepath"] = str(processed_path)
+        create_job(db=db, job=JobCreate(**job_data_args))
+        run_image_ocr_task(job_id, str(upload_path), str(processed_path), APP_CONFIG, base_url)
+
+    else:
+        upload_path.unlink(missing_ok=True)  # Cleanup orphaned file
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid task_type: '{task_type}'")
+
+    return {"job_id": job_id, "status": "pending"}
+
+
+# --- Chunked API endpoints (optional) ---
+@app.post("/api/v1/upload/chunk", tags=["Webhook API"])
+async def api_upload_chunk(
+    chunk: UploadFile = File(...), upload_id: str = Form(...), chunk_number: int = Form(...),
+    user: dict = Depends(require_api_user)
+):
+    """API endpoint for uploading a single file chunk."""
+    webhook_config = APP_CONFIG.get("webhook_settings", {})
+    if not webhook_config.get("enabled", False) or not webhook_config.get("allow_chunked_api_uploads", False):
+        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Chunked API uploads are disabled.")
+
+    return await upload_chunk(chunk, upload_id, chunk_number, user)
+
+@app.post("/api/v1/upload/finalize", status_code=status.HTTP_202_ACCEPTED, tags=["Webhook API"])
+async def api_finalize_upload(
+    request: Request, payload: FinalizeUploadPayload, user: dict = Depends(require_api_user), db: Session = Depends(get_db)
+):
+    """API endpoint to finalize a chunked upload and start a processing job."""
+    webhook_config = APP_CONFIG.get("webhook_settings", {})
+    if not webhook_config.get("enabled", False) or not webhook_config.get("allow_chunked_api_uploads", False):
+        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Chunked API uploads are disabled.")
+
+    # Validate callback URL if provided for a webhook job
+    if payload.callback_url and not is_allowed_callback_url(payload.callback_url, webhook_config.get("allowed_callback_urls", [])):
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Provided callback_url is not allowed.")
+
+    # Re-use the main finalization logic, but with API user context
+    return await finalize_upload(request, payload, user, db)
+
+
+# --------------------------------------------------------------------------------
+# --- AUTH & PAGE ROUTES
+# --------------------------------------------------------------------------------
 if not LOCAL_ONLY_MODE:
     @app.get('/login')
     async def login(request: Request):
         redirect_uri = request.url_for('auth')
@@ -885,7 +1782,6 @@ if not LOCAL_ONLY_MODE:
             token = await oauth.oidc.authorize_access_token(request)
             user = await oauth.oidc.userinfo(token=token)
             request.session['user'] = dict(user)
-            # Store id_token in session for logout
             request.session['id_token'] = token.get('id_token')
         except Exception as e:
             logger.error(f"Authentication failed: {e}")
@@ -895,29 +1791,18 @@ if not LOCAL_ONLY_MODE:
     @app.get("/logout")
     async def logout(request: Request):
        logout_endpoint = oauth.oidc.server_metadata.get("end_session_endpoint")
-        logger.info(f"OIDC end_session_endpoint: {logout_endpoint}")
-
-        # local-only logout if provider doesn't expose end_session_endpoint
         if not logout_endpoint:
             request.session.clear()
             logger.warning("OIDC 'end_session_endpoint' not found. Performing local-only logout.")
             return RedirectResponse(url="/", status_code=302)
-
-        # Prefer a single canonical / registered post-logout redirect URI from config
         post_logout_redirect_uri = str(request.url_for("get_index"))
-        logger.info(f"Post logout redirect URI: {post_logout_redirect_uri}")
-
         logout_url = f"{logout_endpoint}?post_logout_redirect_uri={post_logout_redirect_uri}"
-
-        logger.info(f"Redirecting to provider logout URL: {logout_url}")
-
         request.session.clear()
         return RedirectResponse(url=logout_url, status_code=302)
 
-#### TODO: Remove this weird forward authz endpoint, its needed if reverse proxy does foward auth
-
+# This is for reverse proxies that use forward auth
 @app.get("/api/authz/forward-auth")
 async def forward_auth(request: Request):
     redirect_uri = request.url_for('auth')
@@ -930,146 +1815,94 @@ async def get_index(request: Request):
     whisper_models = APP_CONFIG.get("transcription_settings", {}).get("whisper", {}).get("allowed_models", [])
     conversion_tools = APP_CONFIG.get("conversion_tools", {})
     return templates.TemplateResponse("index.html", {
-        "request": request,
-        "user": user,
-        "is_admin": admin_status,
+        "request": request, "user": user, "is_admin": admin_status,
         "whisper_models": sorted(list(whisper_models)),
-        "conversion_tools": conversion_tools,
-        "local_only_mode": LOCAL_ONLY_MODE
+        "conversion_tools": conversion_tools, "local_only_mode": LOCAL_ONLY_MODE
     })
 
 @app.get("/settings")
 async def get_settings_page(request: Request):
-    """
-    Displays the contents of the currently active configuration file.
-    It prioritizes settings.yml and falls back to settings.default.yml.
-    """
+    """Displays the contents of the currently active configuration file."""
     user = get_current_user(request)
     admin_status = is_admin(request)
-    current_config = {}
-    config_source = "none" # A helper variable to track which file was loaded
-
+    current_config, config_source = {}, "none"
     try:
-        # 1. Attempt to load the primary, user-provided settings.yml
         with open(PATHS.SETTINGS_FILE, 'r', encoding='utf8') as f:
             current_config = yaml.safe_load(f) or {}
         config_source = str(PATHS.SETTINGS_FILE.name)
-        logger.info(f"Displaying configuration from '{config_source}' on settings page.")
-
     except FileNotFoundError:
-        logger.warning(f"'{PATHS.SETTINGS_FILE.name}' not found. Attempting to display fallback configuration.")
         try:
-            # 2. If it's not found, fall back to the default settings file
             with open(PATHS.DEFAULT_SETTINGS_FILE, 'r', encoding='utf8') as f:
                 current_config = yaml.safe_load(f) or {}
             config_source = str(PATHS.DEFAULT_SETTINGS_FILE.name)
-            logger.info(f"Displaying configuration from fallback '{config_source}' on settings page.")
-
-        except Exception as e_fallback:
-            # 3. If even the default file fails, log the error and use an empty config
-            logger.exception(f"CRITICAL: Could not load fallback '{PATHS.DEFAULT_SETTINGS_FILE.name}' for settings page: {e_fallback}")
-            current_config = {} # Failsafe
+        except Exception as e:
+            logger.exception(f"CRITICAL: Could not load fallback config: {e}")
             config_source = "error"
-
-    except Exception as e_primary:
-        # Handles other errors with the primary settings.yml (e.g., parsing errors, permissions)
-        logger.exception(f"Could not load '{PATHS.SETTINGS_FILE.name}' for settings page: {e_primary}")
-        current_config = {} # Failsafe
+    except Exception as e:
+        logger.exception(f"Could not load primary config: {e}")
        config_source = "error"
 
     return templates.TemplateResponse(
         "settings.html",
-        {
-            "request": request,
-            "config": current_config,
-            "config_source": config_source, # You can use this in the template!
-            "user": user,
-            "is_admin": admin_status,
-            "local_only_mode": LOCAL_ONLY_MODE,
-        }
+        {"request": request, "config": current_config, "config_source": config_source,
+         "user": user, "is_admin": admin_status, "local_only_mode": LOCAL_ONLY_MODE}
     )
 
-
-import collections.abc
-
 def deep_merge(source: dict, destination: dict) -> dict:
-    """
-    Recursively merges the `source` dictionary into the `destination` dictionary.
-
-    Values from `source` will overwrite values in `destination`.
-    """
+    """Recursively merge `source` into `destination`; values from `source` win."""
     for key, value in source.items():
         if isinstance(value, collections.abc.Mapping):
-            # If the value is a dictionary, recurse
             node = destination.setdefault(key, {})
             deep_merge(value, node)
         else:
-            # Otherwise, overwrite the value
             destination[key] = value
     return destination
 
 @app.post("/settings/save")
 async def save_settings(
-    request: Request,
-    new_config_from_ui: Dict = Body(...),
-    admin: bool = Depends(require_admin)
+    request: Request, new_config_from_ui: Dict = Body(...), admin: bool = Depends(require_admin)
 ):
-    """
-    Safely updates settings.yml by merging UI changes with the existing file,
-    preserving any settings not managed by the UI.
-    """
+    """Safely updates settings.yml by merging UI changes with the existing file."""
     tmp_path = PATHS.SETTINGS_FILE.with_suffix(".tmp")
     user = get_current_user(request)
-
     try:
-        # Handle the special case where the user wants to revert to defaults
         if not new_config_from_ui:
             if PATHS.SETTINGS_FILE.exists():
                 PATHS.SETTINGS_FILE.unlink()
-                logger.info(f"Admin '{user.get('email')}' reverted to default settings by deleting settings.yml.")
+                logger.info(f"Admin '{user.get('email')}' reverted to default settings.")
             load_app_config()
             return JSONResponse({"message": "Settings reverted to default."})
 
-        # --- Read-Modify-Write Cycle ---
-
-        # 1. READ: Load the current configuration from settings.yml on disk.
-        #    If the file doesn't exist, start with an empty dictionary.
         try:
             with PATHS.SETTINGS_FILE.open("r", encoding="utf8") as f:
                 current_config_on_disk = yaml.safe_load(f) or {}
         except FileNotFoundError:
             current_config_on_disk = {}
 
-        # 2. MODIFY: Deep merge the changes from the UI into the config from the disk.
-        #    The UI config (`source`) overwrites keys in the disk config (`destination`).
         merged_config = deep_merge(source=new_config_from_ui, destination=current_config_on_disk)
 
-        # 3. WRITE: Save the fully merged configuration back to the file.
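+        # Write-to-temp-then-replace keeps the update atomic: a crash mid-write
+        # can never leave a truncated settings.yml behind.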
         with tmp_path.open("w", encoding="utf8") as f:
             yaml.safe_dump(merged_config, f, default_flow_style=False, sort_keys=False)
         tmp_path.replace(PATHS.SETTINGS_FILE)
-        logger.info(f"Admin '{user.get('email')}' successfully updated settings.yml.")
-
-        # Reload the app config to apply changes immediately
+        logger.info(f"Admin '{user.get('email')}' updated settings.yml.")
         load_app_config()
-
-        return JSONResponse({"message": "Settings saved successfully. The new configuration is now active."})
+        return JSONResponse({"message": "Settings saved successfully."})
 
     except Exception as e:
         logger.exception(f"Failed to update settings for admin '{user.get('email')}'")
-        if tmp_path.exists():
-            tmp_path.unlink()
+        if tmp_path.exists(): tmp_path.unlink()
         raise HTTPException(status_code=500, detail=f"Could not save settings.yml: {e}")
 
-# job management endpoints
-
+# --------------------------------------------------------------------------------
+# --- JOB MANAGEMENT & UTILITY ROUTES
+# --------------------------------------------------------------------------------
 @app.post("/settings/clear-history")
 async def clear_job_history(db: Session = Depends(get_db), user: dict = Depends(require_user)):
     try:
         num_deleted = db.query(Job).filter(Job.user_id == user['sub']).delete()
         db.commit()
-        logger.info(f"Cleared {num_deleted} jobs from history for user {user['sub']}.")
+        logger.info(f"Cleared {num_deleted} jobs for user {user['sub']}.")
         return {"deleted_count": num_deleted}
     except Exception:
         db.rollback()
@@ -1078,22 +1911,20 @@ async def clear_job_history(db: Session = Depends(get_db), user: dict = Depends(
 @app.post("/settings/delete-files")
 async def delete_processed_files(db: Session = Depends(get_db), user: dict = Depends(require_user)):
-    deleted_count = 0
-    errors = []
-    user_jobs = get_jobs(db, user_id=user['sub'])
-    for job in user_jobs:
+    deleted_count, errors = 0, []
+    for job in get_jobs(db, user_id=user['sub']):
         if job.processed_filepath:
             try:
                 p = ensure_path_is_safe(Path(job.processed_filepath), [PATHS.PROCESSED_DIR])
                 if p.is_file():
                     p.unlink()
                     deleted_count += 1
-            except Exception as e:
+            except Exception:
                 errors.append(Path(job.processed_filepath).name)
-                logger.exception(f"Could not delete processed file {Path(job.processed_filepath).name}")
+                logger.exception(f"Could not delete file {Path(job.processed_filepath).name}")
     if errors:
         raise HTTPException(status_code=500, detail=f"Could not delete some files: {', '.join(errors)}")
-    logger.info(f"Deleted {deleted_count} files from processed directory for user {user['sub']}.")
+    logger.info(f"Deleted {deleted_count} files for user {user['sub']}.")
     return {"deleted_count": deleted_count}
 
 @app.post("/job/{job_id}/cancel", status_code=status.HTTP_202_ACCEPTED)
@@ -1123,7 +1954,9 @@ async def download_file(filename: str, db: Session = Depends(get_db), user: dict
     file_path = ensure_path_is_safe(PATHS.PROCESSED_DIR / safe_filename, [PATHS.PROCESSED_DIR])
     if not file_path.is_file():
         raise HTTPException(status_code=404, detail="File not found.")
-    job = db.query(Job).filter(Job.processed_filepath == str(file_path), Job.user_id == user['sub']).first()
+    # API users can download files they own via the webhook URL; UI users need a session.
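+    # (Assuming every job row stores a user_id, an unauthenticated request yields
+    # job_owner_id=None, matches no job, and falls through to the 403 below.)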
+    job_owner_id = user.get('sub') if user else None
+    job = db.query(Job).filter(Job.processed_filepath == str(file_path), Job.user_id == job_owner_id).first()
     if not job:
         raise HTTPException(status_code=403, detail="You do not have permission to download this file.")
     download_filename = Path(job.original_filename).stem + Path(job.processed_filepath).suffix
@@ -1133,7 +1966,7 @@ async def download_file(filename: str, db: Session = Depends(get_db), user: dict
 async def health():
     try:
         with engine.connect() as conn:
-            conn.execute("SELECT 1")
+            conn.execution_options(isolation_level="AUTOCOMMIT").execute("SELECT 1")
     except Exception:
         logger.exception("Health check failed")
         return JSONResponse({"ok": False}, status_code=500)
diff --git a/requirements.txt b/requirements.txt
index be5f40d..3dfab3e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,8 @@ faster-whisper
 ocrmypdf
 pytesseract
 pypdf
+piper-tts
+kokoro-tts
 
 # Configuration & Utilities
 werkzeug
diff --git a/settings.default.yml b/settings.default.yml
index 2906c10..5d8fc16 100644
--- a/settings.default.yml
+++ b/settings.default.yml
@@ -1,13 +1,31 @@
 auth_settings:
   oidc_client_id: filewiz
-  oidc_client_secret:
-  oidc_server_metadata_url: https://accounts.test.de/oidc/.well-known/openid-configuration
-  oidc_userinfo_endpoint: https://accounts.test.de/oidc/me
-  oidc_end_session_endpoint: https://accounts.test.de/oidc/session/end
+  oidc_client_secret:
+  oidc_server_metadata_url: https://accounts.example.com/oidc/.well-known/openid-configuration
+  oidc_userinfo_endpoint: https://accounts.example.com/oidc/me
+  oidc_end_session_endpoint: https://accounts.example.com/oidc/session/end
   admin_users:
-    - admin@local.com
+    - user@example.com
+webhook_settings:
+  enabled: False
+  allow_chunked_api_uploads: False
+  allowed_callback_urls: []   # e.g. ["https://n8n.example.com"]
+  callback_bearer_token: ""
+tts_settings:
+  piper:
+    model_dir: "./models/tts"
+    use_cuda: False
+    synthesis_config:
+      length_scale: 1.0
+      noise_scale: 0.667
+      noise_w: 0.8
+  kokoro:
+    model_dir: "./models/tts/kokoro"
+    command_template: "kokoro-tts {input} {output} --model {model_path} --voices {voices_path} --lang {lang} --voice {model_name}"
+
 app_settings:
   max_file_size_mb: '2000'
+  # app_public_url: "http://localhost:8000"  # Uncomment and set to your public URL if downloads don't work correctly
   allowed_all_extensions:
     - .aac
     - .aiff
@@ -352,4 +370,4 @@ conversion_tools:
       formats:
         jpg_q85: JPEG (High Quality)
         jpg_q75: JPEG (Web Quality)
-        jpg_q60: JPEG (Aggressive Compression)
\ No newline at end of file
+        jpg_q60: JPEG (Aggressive Compression)
diff --git a/static/css/style.css b/static/css/style.css
index 5b63c61..8c0466e 100644
--- a/static/css/style.css
+++ b/static/css/style.css
@@ -278,7 +278,7 @@ input[type="file"] {
 
 .actions-group {
     display: grid;
-    grid-template-columns: repeat(3, 1fr); /* 3 columns for wide screens */
+    grid-template-columns: repeat(4, 1fr); /* 4 columns for wide screens */
     gap: 1.5rem;
     margin-top: 1.5rem;
 }
diff --git a/static/js/script.js b/static/js/script.js
index 7d23215..7b8da3a 100644
--- a/static/js/script.js
+++ b/static/js/script.js
@@ -2,6 +2,18 @@ document.addEventListener('DOMContentLoaded', () => {
     // --- Constants ---
     const CHUNK_SIZE = 5 * 1024 * 1024; // 5 MB chunks
 
+    // Allow server to provide API prefix (e.g. "/api/v1") via window.APP_CONFIG.api_base
+    const API_BASE = (window.APP_CONFIG && window.APP_CONFIG.api_base) ? window.APP_CONFIG.api_base.replace(/\/$/, '') : '';
+
+    function apiUrl(path) {
+        // path may start with or without a leading slash
+        if (!path) return API_BASE || '/';
+        if (path.startsWith('/')) {
+            return `${API_BASE}${path}`;
+        }
+        return `${API_BASE}/${path}`;
+    }
+
     // --- User Locale and Timezone Detection ---
     const USER_LOCALE = navigator.language || 'en-US';
     const USER_TIMEZONE = Intl.DateTimeFormat().resolvedOptions().timeZone;
@@ -24,9 +36,11 @@ document.addEventListener('DOMContentLoaded', () => {
     const mainFileName = document.getElementById('main-file-name');
     const mainOutputFormatSelect = document.getElementById('main-output-format-select');
     const mainModelSizeSelect = document.getElementById('main-model-size-select');
+    const mainTtsModelSelect = document.getElementById('main-tts-model-select');
     const startConversionBtn = document.getElementById('start-conversion-btn');
     const startOcrBtn = document.getElementById('start-ocr-btn');
     const startTranscriptionBtn = document.getElementById('start-transcription-btn');
+    const startTtsBtn = document.getElementById('start-tts-btn');
 
     const jobListBody = document.getElementById('job-list-body');
 
@@ -36,37 +50,50 @@ document.addEventListener('DOMContentLoaded', () => {
     const dialogFileCount = document.getElementById('dialog-file-count');
     const dialogInitialView = document.getElementById('dialog-initial-actions');
     const dialogConvertView = document.getElementById('dialog-convert-view');
+    const dialogTtsView = document.getElementById('dialog-tts-view');
     const dialogConvertBtn = document.getElementById('dialog-action-convert');
     const dialogOcrBtn = document.getElementById('dialog-action-ocr');
     const dialogTranscribeBtn = document.getElementById('dialog-action-transcribe');
+    const dialogTtsBtn = document.getElementById('dialog-action-tts');
     const dialogCancelBtn = document.getElementById('dialog-action-cancel');
     const dialogStartConversionBtn = document.getElementById('dialog-start-conversion');
+    const dialogStartTtsBtn = document.getElementById('dialog-start-tts');
     const dialogBackBtn = document.getElementById('dialog-back');
+    const dialogBackTtsBtn = document.getElementById('dialog-back-tts');
     const dialogOutputFormatSelect = document.getElementById('dialog-output-format-select');
+    const dialogTtsModelSelect = document.getElementById('dialog-tts-model-select');
 
     // --- State Variables ---
     let conversionChoices = null;
-    let modelChoices = null; // For the model dropdown instance
+    let transcriptionChoices = null;
+    let ttsChoices = null;
     let dialogConversionChoices = null;
+    let dialogTtsChoices = null;
+    let ttsModelsCache = []; // Cache for formatted TTS models list
     const activePolls = new Map();
     let stagedFiles = null;
 
     // --- Authentication-aware Fetch Wrapper ---
-    /**
-     * A wrapper around the native fetch API that handles 401 Unauthorized responses.
-     * If a 401 is received, it assumes the session has expired and redirects to the login page.
-     * @param {string} url - The URL to fetch.
-     * @param {object} options - The options for the fetch request.
-     * @returns {Promise} - A promise that resolves to the fetch Response.
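+    // authFetch: fetch() wrapper that resolves bare paths through apiUrl(),
+    // sends credentials, defaults Accept to JSON, and treats a 401 response
+    // as an expired session.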
-     */
-    async function authFetch(url, options) {
+    async function authFetch(url, options = {}) {
+        // Normalize URL through apiUrl() if a bare endpoint is provided
+        if (typeof url === 'string' && url.startsWith('/')) {
+            url = apiUrl(url);
+        }
+
+        // Add default options: include credentials and accept JSON by default
+        options = Object.assign({}, options);
+        if (!Object.prototype.hasOwnProperty.call(options, 'credentials')) {
+            options.credentials = 'include';
+        }
+        options.headers = options.headers || {};
+        if (!options.headers.Accept) options.headers.Accept = 'application/json';
+
         const response = await fetch(url, options);
         if (response.status === 401) {
-            // Use a simple alert for now. A more sophisticated modal could be used.
             alert('Your session has expired. You will be redirected to the login page.');
-            window.location.href = '/login';
-            // Throw an error to stop the promise chain of the calling function
+            window.location.href = '/login'; // page route, deliberately not routed through apiUrl()
             throw new Error('Session expired');
         }
         return response;
@@ -141,7 +168,8 @@ document.addEventListener('DOMContentLoaded', () => {
                 body: JSON.stringify(finalizePayload),
             });
             if (!finalizeResponse.ok) {
-                const errorData = await finalizeResponse.json();
+                let errorData = {};
+                try { errorData = await finalizeResponse.json(); } catch (e) {}
                 throw new Error(errorData.detail || 'Finalization failed');
             }
             const result = await finalizeResponse.json();
@@ -203,13 +231,23 @@ document.addEventListener('DOMContentLoaded', () => {
             }
             options.output_format = selectedFormat;
         } else if (taskType === 'transcription') {
-            options.model_size = mainModelSizeSelect.value;
+            const selectedModel = transcriptionChoices.getValue(true);
+            options.model_size = selectedModel;
+        } else if (taskType === 'tts') {
+            const selectedModel = ttsChoices.getValue(true);
+            if (!selectedModel) {
+                alert('Please select a voice model.');
+                return;
+            }
+            options.model_name = selectedModel;
         }
 
+        // Disable buttons during upload process
         startConversionBtn.disabled = true;
         startOcrBtn.disabled = true;
         startTranscriptionBtn.disabled = true;
+        startTtsBtn.disabled = true;
 
         const uploadPromises = files.map(file => uploadFileInChunks(file, taskType, options));
         await Promise.allSettled(uploadPromises);
@@ -220,6 +258,7 @@ document.addEventListener('DOMContentLoaded', () => {
         startConversionBtn.disabled = false;
         startOcrBtn.disabled = false;
         startTranscriptionBtn.disabled = false;
+        startTtsBtn.disabled = false;
     }
 
@@ -251,17 +290,25 @@ document.addEventListener('DOMContentLoaded', () => {
     function showActionDialog() {
         dialogFileCount.textContent = stagedFiles.length;
-        dialogOutputFormatSelect.innerHTML = mainOutputFormatSelect.innerHTML; // Use main select as template
+
+        // Setup Conversion Dropdown
+        dialogOutputFormatSelect.innerHTML = mainOutputFormatSelect.innerHTML;
         if (dialogConversionChoices) dialogConversionChoices.destroy();
         dialogConversionChoices = new Choices(dialogOutputFormatSelect, {
-            searchEnabled: true,
-            itemSelectText: 'Select',
-            shouldSort: false,
-            placeholder: true,
-            placeholderValue: 'Select a format...',
+            searchEnabled: true, itemSelectText: 'Select', shouldSort: false, placeholder: true, placeholderValue: 'Select a format...',
         });
+
+        // Setup TTS Dropdown
+        if (dialogTtsChoices) dialogTtsChoices.destroy();
+        dialogTtsChoices = new Choices(dialogTtsModelSelect, {
+            searchEnabled: true, itemSelectText: 'Select', shouldSort: false, placeholder: true, placeholderValue: 'Select a voice...',
+        });
+        dialogTtsChoices.setChoices(ttsModelsCache, 'value', 'label', true);
+
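+        // Note: the dropdown is filled from ttsModelsCache, which loadTtsModels()
+        // populates asynchronously at page load; if that fetch has not resolved
+        // yet, the voice list is simply empty when the dialog opens.
+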
         dialogInitialView.style.display = 'grid';
         dialogConvertView.style.display = 'none';
+        dialogTtsView.style.display = 'none';
         actionDialog.classList.add('visible');
     }
 
@@ -269,21 +316,34 @@ document.addEventListener('DOMContentLoaded', () => {
         actionDialog.classList.remove('visible');
         stagedFiles = null;
         if (dialogConversionChoices) {
-            dialogConversionChoices.hideDropdown();
             dialogConversionChoices.destroy();
             dialogConversionChoices = null;
         }
+        if (dialogTtsChoices) {
+            dialogTtsChoices.destroy();
+            dialogTtsChoices = null;
+        }
     }
-
+
+    // --- Dialog Button Listeners ---
     dialogConvertBtn.addEventListener('click', () => {
         dialogInitialView.style.display = 'none';
         dialogConvertView.style.display = 'block';
     });
+    dialogTtsBtn.addEventListener('click', () => {
+        dialogInitialView.style.display = 'none';
+        dialogTtsView.style.display = 'block';
+    });
     dialogBackBtn.addEventListener('click', () => {
         dialogInitialView.style.display = 'grid';
         dialogConvertView.style.display = 'none';
     });
+    dialogBackTtsBtn.addEventListener('click', () => {
+        dialogInitialView.style.display = 'grid';
+        dialogTtsView.style.display = 'none';
+    });
     dialogStartConversionBtn.addEventListener('click', () => handleDialogAction('conversion'));
+    dialogStartTtsBtn.addEventListener('click', () => handleDialogAction('tts'));
     dialogOcrBtn.addEventListener('click', () => handleDialogAction('ocr'));
     dialogTranscribeBtn.addEventListener('click', () => handleDialogAction('transcription'));
     dialogCancelBtn.addEventListener('click', closeActionDialog);
@@ -300,23 +360,81 @@ document.addEventListener('DOMContentLoaded', () => {
             options.output_format = selectedFormat;
         } else if (action === 'transcription') {
             options.model_size = mainModelSizeSelect.value;
+        } else if (action === 'tts') {
+            const selectedModel = dialogTtsChoices.getValue(true);
+            if (!selectedModel) {
+                alert('Please select a voice model.');
+                return;
+            }
+            options.model_name = selectedModel;
         }
 
         Array.from(stagedFiles).forEach(file => uploadFileInChunks(file, action, options));
         closeActionDialog();
     }
 
-    /**
-     * Initializes all Choices.js dropdowns on the page.
-     */
+    // -----------------------
+    // TTS models loader (robust)
+    // -----------------------
+    async function loadTtsModels() {
+        try {
+            const response = await authFetch('/api/v1/tts-voices');
+            if (!response.ok) throw new Error('Failed to fetch TTS voices.');
+            const voicesData = await response.json();
+
+            // voicesData might be an object map { id: meta } or an array [{ id, name, language, ... }]
+            const voicesArray = [];
+            if (Array.isArray(voicesData)) {
+                for (const v of voicesData) {
+                    // Accept either { id, name, language } or { voice_id, title, locale }
+                    const id = v.id || v.voice_id || v.voice || v.name || null;
+                    const name = v.name || v.title || v.display_name || id || 'Unknown';
+                    const lang = (v.language && (v.language.name_native || v.language.name)) || v.locale || (id ? id.split(/[_-]/)[0] : 'Unknown');
+                    if (id) voicesArray.push({ id, name, lang });
+                }
+            } else if (voicesData && typeof voicesData === 'object') {
+                for (const key in voicesData) {
+                    if (!Object.prototype.hasOwnProperty.call(voicesData, key)) continue;
+                    const v = voicesData[key];
+                    const id = v.id || key;
+                    const name = v.name || v.title || v.display_name || id;
+                    const lang = (v.language && (v.language.name_native || v.language.name)) || v.locale || (id ? id.split(/[_-]/)[0] : 'Unknown');
+                    voicesArray.push({ id, name, lang });
+                }
+            } else {
+                throw new Error('Unexpected voices payload');
+            }
+
+            // Group by language
+            const groups = {};
+            for (const v of voicesArray) {
+                const langLabel = v.lang || 'Unknown';
+                if (!groups[langLabel]) {
+                    groups[langLabel] = { label: langLabel, id: langLabel, disabled: false, choices: [] };
+                }
+                groups[langLabel].choices.push({
+                    value: v.id,
+                    label: `${v.name}`
+                });
+            }
+            ttsModelsCache = Object.values(groups).sort((a, b) => a.label.localeCompare(b.label));
+            // If ttsChoices exists, update it; otherwise the initializer will set choices
+            if (ttsChoices) {
+                ttsChoices.setChoices(ttsModelsCache, 'value', 'label', true);
+            }
+        } catch (error) {
+            console.error("Couldn't load TTS voices:", error);
+            if (error.message !== 'Session expired') {
+                if (ttsChoices) {
+                    ttsChoices.setChoices([{ value: '', label: 'Error loading voices', disabled: true }], 'value', 'label');
+                }
+            }
+        }
+    }
+
     function initializeSelectors() {
-        // --- Conversion Dropdown ---
         if (conversionChoices) conversionChoices.destroy();
         conversionChoices = new Choices(mainOutputFormatSelect, {
-            searchEnabled: true,
-            itemSelectText: 'Select',
-            shouldSort: false,
-            placeholder: true,
-            placeholderValue: 'Select a format...',
+            searchEnabled: true, itemSelectText: 'Select', shouldSort: false, placeholder: true, placeholderValue: 'Select a format...',
         });
         const tools = window.APP_CONFIG.conversionTools || {};
         const choicesArray = [];
@@ -324,22 +442,22 @@ document.addEventListener('DOMContentLoaded', () => {
             const tool = tools[toolKey];
             const group = { label: tool.name, id: toolKey, disabled: false, choices: [] };
             for (const formatKey in tool.formats) {
-                group.choices.push({
-                    value: `${toolKey}_${formatKey}`,
-                    label: `${tool.name} - ${formatKey.toUpperCase()} (${tool.formats[formatKey]})`
-                });
+                group.choices.push({ value: `${toolKey}_${formatKey}`, label: `${tool.name} - ${formatKey.toUpperCase()} (${tool.formats[formatKey]})` });
             }
             choicesArray.push(group);
         }
         conversionChoices.setChoices(choicesArray, 'value', 'label', true);
 
-        // --- Model Size Dropdown ---
-        if (modelChoices) modelChoices.destroy();
-        modelChoices = new Choices(mainModelSizeSelect, {
-            searchEnabled: false, // Disables the search box
-            shouldSort: false, // Keeps the original