#!/usr/bin/env python3 # routers/ocr_router.py # Name: PGŽ Sport OCR router (lightweight) # Version: 1.0.0 # Authors: Damir Radulić / # Date: 2026-05-05 # Description: FastAPI APIRouter exposing POST /api/ocr/upload and # GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract # (pdf2image for PDF), extracts vendor / OIB / invoice_no / # date / amount via simple regex, persists into # pgz_sport.invoice_uploads when possible. Designed to # degrade gracefully if pytesseract / pdf2image are not # installed (returns ocr_status='ocr_unavailable'). from __future__ import annotations import os import re import io import hashlib import json import traceback from pathlib import Path from datetime import datetime from typing import Optional, Tuple, Dict, Any, List from fastapi import APIRouter, UploadFile, File, HTTPException from fastapi.responses import JSONResponse import psycopg2 import psycopg2.extras # ── Optional OCR deps ──────────────────────────────────────────────────────── _TESS_OK = False _PDF2IMG_OK = False _PIL_OK = False try: import pytesseract # type: ignore _TESS_OK = True except Exception: pytesseract = None # type: ignore try: from pdf2image import convert_from_bytes # type: ignore _PDF2IMG_OK = True except Exception: convert_from_bytes = None # type: ignore try: from PIL import Image # type: ignore _PIL_OK = True except Exception: Image = None # type: ignore # ── Config ─────────────────────────────────────────────────────────────────── DB = dict( host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password=os.environ["DB_PASSWORD"], ) UPLOAD_DIR = Path("/opt/pgz-sport/uploads/ocr") UPLOAD_DIR.mkdir(parents=True, exist_ok=True) ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"} ALLOWED_MIME = { "application/pdf", "image/jpeg", "image/jpg", "image/png", } MAX_BYTES = 25 * 1024 * 1024 # 25 MB TEXT_CAP = 8 * 1024 # 8 KB cap for response text payload router = APIRouter(prefix="/api/ocr", tags=["ocr"]) # ── DB helpers ─────────────────────────────────────────────────────────────── def _db(): c = psycopg2.connect(**DB) c.autocommit = True return c def _table_columns(schema: str, table: str) -> List[str]: try: with _db() as c, c.cursor() as cur: cur.execute( """ SELECT column_name FROM information_schema.columns WHERE table_schema = %s AND table_name = %s """, (schema, table), ) return [r[0] for r in cur.fetchall()] except Exception: return [] # ── Regex extractors ───────────────────────────────────────────────────────── RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b") RE_OIB_BARE = re.compile(r"\b(\d{11})\b") RE_INVOICE = re.compile( r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$" ) RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b") RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b") # Amount candidates (1.234,56 or 1234,56 or 1234.56 or 1,234.56), at least 2 digits RE_AMOUNT = re.compile( r"(? Optional[float]: s = raw.strip().replace(" ", "") # If both . and , present, assume , decimal if last separator is , if "," in s and "." in s: if s.rfind(",") > s.rfind("."): s = s.replace(".", "").replace(",", ".") else: s = s.replace(",", "") elif "," in s: # 1.234,56 or 1234,56 → swap s = s.replace(".", "").replace(",", ".") try: return float(s) except Exception: return None def _first_nonempty_line(text: str) -> Optional[str]: for ln in (text or "").splitlines(): v = ln.strip() if v: return v[:200] return None def _parse_date(text: str) -> Optional[str]: m = RE_DATE_YMD.search(text or "") if m: try: return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3))).date().isoformat() except Exception: pass m = RE_DATE_DMY.search(text or "") if m: try: return datetime(int(m.group(3)), int(m.group(2)), int(m.group(1))).date().isoformat() except Exception: pass return None def _parse_oib(text: str) -> Optional[str]: m = RE_OIB_HR.search(text or "") if m: return m.group(1) m = RE_OIB_BARE.search(text or "") if m: return m.group(1) return None def _parse_invoice_no(text: str) -> Optional[str]: m = RE_INVOICE.search(text or "") if not m: return None line = m.group(0).strip() # Try to grab the right-most token that looks like an invoice id cand = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line) if cand: # Drop pure words like "Račun"/"Invoice" for c in reversed(cand): if any(ch.isdigit() for ch in c): return c[:64] return line[:120] def _parse_amount(text: str) -> Optional[float]: if not text: return None best: Optional[float] = None for m in RE_AMOUNT.finditer(text): v = _norm_amount(m.group(1)) if v is None: continue if best is None or v > best: best = v return best def _extract_fields(text: str) -> Dict[str, Any]: return { "vendor": _first_nonempty_line(text), "oib": _parse_oib(text), "invoice_no": _parse_invoice_no(text), "date": _parse_date(text), "amount": _parse_amount(text), } # ── OCR engine ─────────────────────────────────────────────────────────────── def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]: if not (_TESS_OK and _PIL_OK): return None, None try: img = Image.open(io.BytesIO(data)) img.load() text = pytesseract.image_to_string(img, lang=os.getenv("OCR_LANG", "hrv+eng")) # Confidence (best-effort) conf = None try: d = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang=os.getenv("OCR_LANG", "hrv+eng")) confs = [int(c) for c in d.get("conf", []) if str(c).lstrip("-").isdigit() and int(c) >= 0] if confs: conf = round(sum(confs) / len(confs), 2) except Exception: pass return text, conf except Exception: return None, None def _ocr_pdf_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]: if not (_TESS_OK and _PDF2IMG_OK): return None, None try: pages = convert_from_bytes(data, dpi=200, fmt="png") except Exception: return None, None if not pages: return None, None out: List[str] = [] confs: List[float] = [] for p in pages[:8]: # cap to 8 pages try: out.append(pytesseract.image_to_string(p, lang=os.getenv("OCR_LANG", "hrv+eng"))) try: d = pytesseract.image_to_data(p, output_type=pytesseract.Output.DICT, lang=os.getenv("OCR_LANG", "hrv+eng")) cs = [int(c) for c in d.get("conf", []) if str(c).lstrip("-").isdigit() and int(c) >= 0] if cs: confs.append(sum(cs) / len(cs)) except Exception: pass except Exception: continue text = "\n\f\n".join(out) if out else None conf = round(sum(confs) / len(confs), 2) if confs else None return text, conf # ── Persistence ────────────────────────────────────────────────────────────── def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]: """Insert into pgz_sport.invoice_uploads — only writes columns that exist.""" cols = set(_table_columns("pgz_sport", "invoice_uploads")) if not cols: return None # Map our payload keys to potential DB columns candidates: Dict[str, Any] = { "file_name": payload.get("file_name"), "file_path": payload.get("file_path"), "file_size": payload.get("file_size"), "mime": payload.get("mime"), "sha256": payload.get("sha256"), "ocr_status": payload.get("ocr_status"), "ocr_engine": payload.get("ocr_engine"), "ocr_text": payload.get("ocr_text_full"), "ocr_confidence": payload.get("ocr_confidence"), "ai_invoice_no": (payload.get("extracted") or {}).get("invoice_no"), "ai_invoice_date": (payload.get("extracted") or {}).get("date"), "ai_vendor_name": (payload.get("extracted") or {}).get("vendor"), "ai_vendor_oib": (payload.get("extracted") or {}).get("oib"), "ai_amount_gross": (payload.get("extracted") or {}).get("amount"), "ai_engine": payload.get("ai_engine") or "regex-v1", "ai_extracted": json.dumps(payload.get("extracted") or {}), } insert_cols: List[str] = [] insert_vals: List[Any] = [] for k, v in candidates.items(): if k in cols and v is not None: insert_cols.append(k) insert_vals.append(v) if not insert_cols: return None sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format( c=", ".join(insert_cols), p=", ".join(["%s"] * len(insert_cols)), ) try: with _db() as c, c.cursor() as cur: cur.execute(sql, insert_vals) row = cur.fetchone() return int(row[0]) if row else None except Exception as e: print(f"[ocr_router] insert failed: {e}") return None # ── Endpoints ──────────────────────────────────────────────────────────────── @router.get("/health") def health(): return { "ok": True, "tesseract_available": bool(_TESS_OK and _PIL_OK), "pdf2image_available": bool(_PDF2IMG_OK), "upload_dir": str(UPLOAD_DIR), } @router.post("/upload") async def upload(file: UploadFile = File(...)): if not file or not file.filename: raise HTTPException(400, "no file") # Validate extension/mime ext = Path(file.filename).suffix.lower() if ext not in ALLOWED_EXT: raise HTTPException(400, f"extension not allowed: {ext}") # Read full body (bounded) data = await file.read() if not data: raise HTTPException(400, "empty file") if len(data) > MAX_BYTES: raise HTTPException(413, f"file too large: {len(data)} > {MAX_BYTES}") sha = hashlib.sha256(data).hexdigest() save_name = f"{sha}{ext}" abs_path = UPLOAD_DIR / save_name if not abs_path.exists(): try: abs_path.write_bytes(data) except Exception as e: raise HTTPException(500, f"could not persist file: {e}") rel_path = f"uploads/ocr/{save_name}" # Run OCR ocr_text: Optional[str] = None ocr_conf: Optional[float] = None ocr_engine = "tesseract" if ext == ".pdf": if not (_TESS_OK and _PDF2IMG_OK and _PIL_OK): ocr_status = "ocr_unavailable" else: ocr_text, ocr_conf = _ocr_pdf_bytes(data) ocr_status = "ocr_done" if ocr_text else "ocr_failed" else: if not (_TESS_OK and _PIL_OK): ocr_status = "ocr_unavailable" else: ocr_text, ocr_conf = _ocr_image_bytes(data) ocr_status = "ocr_done" if ocr_text else "ocr_failed" extracted = _extract_fields(ocr_text or "") # Truncated text for response text_resp = (ocr_text or "") if len(text_resp) > TEXT_CAP: text_resp = text_resp[:TEXT_CAP] payload: Dict[str, Any] = { "file_name": file.filename, "file_path": rel_path, "file_size": len(data), "mime": file.content_type or "application/octet-stream", "sha256": sha, "ocr_status": ocr_status, "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None, "ocr_text_full": ocr_text, "ocr_confidence": ocr_conf, "extracted": extracted, "ai_engine": "regex-v1", } inserted_id = _maybe_insert_upload(payload) return JSONResponse( { "ok": True, "id": inserted_id, "file_path": rel_path, "file_name": file.filename, "file_size": len(data), "mime": payload["mime"], "sha256": sha, "ocr_status": ocr_status, "ocr_confidence": ocr_conf, "ocr_text": text_resp if ocr_text else None, "extracted": extracted, } )