# pgz-sport/routers/ocr_router.py

#!/usr/bin/env python3
from __future__ import annotations
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# routers/ocr_router.py
# Name:        PGŽ Sport OCR router (lightweight)
# Version:     1.0.0
# Authors:     Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date:        2026-05-05
# Description: FastAPI APIRouter exposing POST /api/ocr/upload and
#              GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract
#              (pdf2image for PDF), extracts vendor / OIB / invoice_no /
#              date / amount via simple regex, persists into
#              pgz_sport.invoice_uploads when possible. Designed to
#              degrade gracefully if pytesseract / pdf2image are not
#              installed (returns ocr_status='ocr_unavailable').


import os
import re
import io
import hashlib
import json
import traceback
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple, Dict, Any, List

from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse

import psycopg2
import psycopg2.extras

# ── Optional OCR deps ────────────────────────────────────────────────────────
_TESS_OK = False
_PDF2IMG_OK = False
_PIL_OK = False
try:
    import pytesseract  # type: ignore
    _TESS_OK = True
except Exception:
    pytesseract = None  # type: ignore

try:
    from pdf2image import convert_from_bytes  # type: ignore
    _PDF2IMG_OK = True
except Exception:
    convert_from_bytes = None  # type: ignore

try:
    from PIL import Image  # type: ignore
    _PIL_OK = True
except Exception:
    Image = None  # type: ignore

# ── Config ───────────────────────────────────────────────────────────────────
# Connection parameters passed verbatim to psycopg2.connect() by _db().
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    # Read at import time; a missing DB_PASSWORD raises KeyError and the
    # module fails to import.
    password=os.environ["DB_PASSWORD"],
)

# Directory where uploaded files are stored; created at import time if missing.
UPLOAD_DIR = Path("/opt/pgz-sport/uploads/ocr")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Accepted file name suffixes (compared lower-cased by the upload handler).
ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"}
# NOTE(review): the upload handler in this file validates only the extension;
# this set is not referenced by the code visible here — confirm whether MIME
# validation is intended.
ALLOWED_MIME = {
    "application/pdf",
    "image/jpeg",
    "image/jpg",
    "image/png",
}
MAX_BYTES = 25 * 1024 * 1024  # 25 MB
# The upload handler applies this cap with len()/slicing on a str, i.e. as a
# character count rather than an encoded byte count.
TEXT_CAP = 8 * 1024           # 8 KB cap for response text payload

# All routes declared below are mounted under /api/ocr.
router = APIRouter(prefix="/api/ocr", tags=["ocr"])


# ── DB helpers ───────────────────────────────────────────────────────────────
def _db():
    """Open a fresh psycopg2 connection from ``DB`` with autocommit enabled.

    A new connection is created on every call; the caller receives it as-is
    and is responsible for whatever happens to it afterwards.
    """
    connection = psycopg2.connect(**DB)
    # Each statement is committed immediately; no explicit commit is needed.
    connection.autocommit = True
    return connection


def _table_columns(schema: str, table: str) -> List[str]:
    """Return the column names of ``schema.table`` from information_schema.

    Best-effort lookup: if the connection or the query fails, the error is
    printed and an empty list is returned, so callers can treat "no columns"
    as "table not usable".

    Args:
        schema: Schema name to look up.
        table: Table name to look up.

    Returns:
        Column names in the order returned by the query, or ``[]`` when the
        table has no columns / does not exist / the lookup failed.
    """
    try:
        conn = _db()
    except Exception as e:
        print(f"[ocr_router] column lookup failed: {e}")
        return []
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT column_name FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                """,
                (schema, table),
            )
            return [r[0] for r in cur.fetchall()]
    except Exception as e:
        print(f"[ocr_router] column lookup failed: {e}")
        return []
    finally:
        # psycopg2's ``with connection:`` only ends the transaction; it does
        # not close the connection, so close it explicitly here.
        conn.close()


# ── Regex extractors ─────────────────────────────────────────────────────────
# "HR" (case-sensitive) followed by optional whitespace and exactly 11 digits;
# group 1 is the digit run.
RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b")
# Any stand-alone run of exactly 11 digits; used as a fallback by _parse_oib.
RE_OIB_BARE = re.compile(r"\b(\d{11})\b")
# A whole line (case-insensitive, multiline) containing the word "Račun",
# "Racun" or "Invoice"; at most 80 non-newline characters may follow the
# keyword before the end of that line.
RE_INVOICE = re.compile(
    r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$"
)
# dd.mm.yyyy or dd/mm/yyyy — groups are (day, month, year).
RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b")
# yyyy-mm-dd — groups are (year, month, day).
RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b")
# Amount candidates (1.234,56 or 1234,56 or 1234.56 or 1,234.56), always with
# exactly two decimal digits. In the comma-decimal grouped form the thousands
# separator may be "." or any whitespace character. The lookarounds reject
# matches glued to a preceding word character, "." or ",", or to a following
# word character.
RE_AMOUNT = re.compile(
    r"(?<![\w.,])"
    r"(\d{1,3}(?:[.\s]\d{3})+,\d{2}|\d+,\d{2}|\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})"
    r"(?![\w])"
)


def _norm_amount(raw: str) -> Optional[float]:
    s = raw.strip().replace(" ", "")
    # If both . and , present, assume , decimal if last separator is ,
    if "," in s and "." in s:
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif "," in s:
        # 1.234,56 or 1234,56 → swap
        s = s.replace(".", "").replace(",", ".")
    try:
        return float(s)
    except Exception:
        return None


def _first_nonempty_line(text: str) -> Optional[str]:
    for ln in (text or "").splitlines():
        v = ln.strip()
        if v:
            return v[:200]
    return None


def _parse_date(text: str) -> Optional[str]:
    """Return the first valid calendar date found in *text* as ISO ``YYYY-MM-DD``.

    ISO-style dates (yyyy-mm-dd) are preferred; if none is valid, d.m.y dates
    (dd.mm.yyyy or dd/mm/yyyy) are tried. Every candidate of a format is
    examined in order, so a digit pattern that is not a real date
    (e.g. "99.99.2024") does not prevent a later valid date from being found.

    Args:
        text: OCR text to scan; ``None``/empty is treated as no text.

    Returns:
        ISO date string, or ``None`` when no valid date is present.
    """
    haystack = text or ""
    # (pattern, regex group numbers for year, month, day)
    formats = (
        (RE_DATE_YMD, (1, 2, 3)),
        (RE_DATE_DMY, (3, 2, 1)),
    )
    for pattern, (y, mo, d) in formats:
        for m in pattern.finditer(haystack):
            try:
                parsed = datetime(int(m.group(y)), int(m.group(mo)), int(m.group(d)))
            except ValueError:
                # Looks like a date but is out of range — try the next one.
                continue
            return parsed.date().isoformat()
    return None


def _parse_oib(text: str) -> Optional[str]:
    """Return the first 11-digit identifier found in *text*, or ``None``.

    An "HR"-prefixed number takes priority; otherwise the first bare
    11-digit run is used. Only the digits are returned.
    """
    haystack = text or ""
    for pattern in (RE_OIB_HR, RE_OIB_BARE):
        hit = pattern.search(haystack)
        if hit is not None:
            return hit.group(1)
    return None


def _parse_invoice_no(text: str) -> Optional[str]:
    """Extract an invoice identifier from the first "Račun"/"Invoice" line.

    Returns the right-most token on that line that is built from upper-case
    letters, digits and ``- / _ .`` and contains at least one digit (capped
    at 64 characters). If no such token exists, the line itself is returned
    (capped at 120 characters). ``None`` when no matching line is found.
    """
    hit = RE_INVOICE.search(text or "")
    if hit is None:
        return None

    line = hit.group(0).strip()
    tokens = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line)
    # Scan from the right and skip tokens without digits (plain words).
    candidate = next(
        (tok for tok in reversed(tokens) if any(ch.isdigit() for ch in tok)),
        None,
    )
    if candidate is not None:
        return candidate[:64]
    return line[:120]


def _parse_amount(text: str) -> Optional[float]:
    """Return the largest amount-looking number in *text*, or ``None``.

    Every RE_AMOUNT match is normalised with _norm_amount; matches that fail
    to normalise are ignored.
    """
    if not text:
        return None
    parsed = (_norm_amount(hit.group(1)) for hit in RE_AMOUNT.finditer(text))
    return max((value for value in parsed if value is not None), default=None)


def _extract_fields(text: str) -> Dict[str, Any]:
    """Run every field extractor over *text* and collect the results.

    Returns a dict with the keys ``vendor``, ``oib``, ``invoice_no``,
    ``date`` and ``amount``; a field that could not be found is ``None``.
    """
    extractors = (
        ("vendor", _first_nonempty_line),
        ("oib", _parse_oib),
        ("invoice_no", _parse_invoice_no),
        ("date", _parse_date),
        ("amount", _parse_amount),
    )
    return {field: extract(text) for field, extract in extractors}


# ── OCR engine ───────────────────────────────────────────────────────────────
def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """OCR a single raster image given as raw bytes.

    Returns ``(text, confidence)``. ``(None, None)`` is returned when
    pytesseract/PIL are unavailable or when decoding/recognition fails.
    The confidence is the mean of the non-negative per-item ``conf`` values
    reported by Tesseract, rounded to two decimals, or ``None`` if it could
    not be computed.
    """
    if not _TESS_OK or not _PIL_OK:
        return None, None

    lang = os.getenv("OCR_LANG", "hrv+eng")
    try:
        image = Image.open(io.BytesIO(data))
        image.load()
        text = pytesseract.image_to_string(image, lang=lang)
    except Exception:
        return None, None

    # Confidence is best-effort: a failure here must not discard the text.
    confidence = None
    try:
        report = pytesseract.image_to_data(
            image, output_type=pytesseract.Output.DICT, lang=lang
        )
        scores = [
            int(c)
            for c in report.get("conf", [])
            if str(c).lstrip("-").isdigit() and int(c) >= 0
        ]
        if scores:
            confidence = round(sum(scores) / len(scores), 2)
    except Exception:
        pass
    return text, confidence


def _ocr_pdf_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """OCR the first pages of a PDF given as raw bytes.

    At most 8 pages are rasterised (200 dpi) and recognised. Page texts are
    joined with a form-feed separator line.

    Returns:
        ``(text, confidence)``. ``(None, None)`` when pytesseract/pdf2image
        are unavailable, the PDF cannot be rasterised, or it has no pages.
        ``text`` is ``None`` if recognition failed on every page;
        ``confidence`` is the mean of the per-page mean confidences rounded
        to two decimals, or ``None`` if none could be computed.
    """
    if not (_TESS_OK and _PDF2IMG_OK):
        return None, None

    max_pages = 8
    lang = os.getenv("OCR_LANG", "hrv+eng")
    try:
        # Only render the pages that will actually be OCR'd; rendering every
        # page of a large PDF at 200 dpi just to discard most of them wastes
        # time and memory on untrusted input.
        pages = convert_from_bytes(
            data, dpi=200, fmt="png", first_page=1, last_page=max_pages
        )
    except Exception:
        return None, None
    if not pages:
        return None, None

    out: List[str] = []
    confs: List[float] = []
    for p in pages[:max_pages]:
        try:
            out.append(pytesseract.image_to_string(p, lang=lang))
        except Exception:
            # Skip pages Tesseract cannot read; keep what the others yield.
            continue
        try:
            d = pytesseract.image_to_data(
                p, output_type=pytesseract.Output.DICT, lang=lang
            )
            cs = [
                int(c)
                for c in d.get("conf", [])
                if str(c).lstrip("-").isdigit() and int(c) >= 0
            ]
            if cs:
                confs.append(sum(cs) / len(cs))
        except Exception:
            # Confidence is best-effort; the page text is already recorded.
            pass

    text = "\n\f\n".join(out) if out else None
    conf = round(sum(confs) / len(confs), 2) if confs else None
    return text, conf


# ── Persistence ──────────────────────────────────────────────────────────────
def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]:
    """Insert into pgz_sport.invoice_uploads — only writes columns that exist.

    The table's columns are looked up first; payload values are mapped to
    candidate column names and only those that exist in the table and are
    not ``None`` are written.

    Args:
        payload: Upload description as built by the upload endpoint
            (file metadata, OCR result and the ``extracted`` field dict).

    Returns:
        The new row's ``id``, or ``None`` when the table has no known
        columns, nothing could be mapped, or the insert failed (the error
        is printed, never raised).
    """
    cols = set(_table_columns("pgz_sport", "invoice_uploads"))
    if not cols:
        return None

    extracted = payload.get("extracted") or {}

    # Map our payload keys to potential DB columns
    candidates: Dict[str, Any] = {
        "file_name": payload.get("file_name"),
        "file_path": payload.get("file_path"),
        "file_size": payload.get("file_size"),
        "mime": payload.get("mime"),
        "sha256": payload.get("sha256"),
        "ocr_status": payload.get("ocr_status"),
        "ocr_engine": payload.get("ocr_engine"),
        "ocr_text": payload.get("ocr_text_full"),
        "ocr_confidence": payload.get("ocr_confidence"),
        "ai_invoice_no": extracted.get("invoice_no"),
        "ai_invoice_date": extracted.get("date"),
        "ai_vendor_name": extracted.get("vendor"),
        "ai_vendor_oib": extracted.get("oib"),
        "ai_amount_gross": extracted.get("amount"),
        "ai_engine": payload.get("ai_engine") or "regex-v1",
        "ai_extracted": json.dumps(extracted),
    }

    present = [(k, v) for k, v in candidates.items() if k in cols and v is not None]
    if not present:
        return None
    insert_cols = [k for k, _ in present]
    insert_vals = [v for _, v in present]

    # Column names come from the fixed keys of ``candidates`` above, never
    # from request data, so interpolating them is safe; values are bound.
    sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format(
        c=", ".join(insert_cols),
        p=", ".join(["%s"] * len(insert_cols)),
    )
    try:
        conn = _db()
        try:
            # _db() enables autocommit, so the INSERT is committed as soon
            # as it executes.
            with conn.cursor() as cur:
                cur.execute(sql, insert_vals)
                row = cur.fetchone()
                return int(row[0]) if row else None
        finally:
            # psycopg2's ``with connection:`` does not close the connection;
            # close it explicitly so uploads do not leak connections.
            conn.close()
    except Exception as e:
        print(f"[ocr_router] insert failed: {e}")
        return None


# ── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/health")
def health():
    """Report which optional OCR dependencies were importable.

    ``tesseract_available`` requires both pytesseract and PIL;
    ``pdf2image_available`` reflects pdf2image alone. The configured upload
    directory is included for diagnostics.
    """
    can_ocr_images = bool(_TESS_OK and _PIL_OK)
    can_rasterise_pdf = bool(_PDF2IMG_OK)
    return dict(
        ok=True,
        tesseract_available=can_ocr_images,
        pdf2image_available=can_rasterise_pdf,
        upload_dir=str(UPLOAD_DIR),
    )


@router.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Accept one PDF/JPG/PNG upload, OCR it and return the extracted fields.

    Steps: validate the file-name extension, read the body (bounded by
    MAX_BYTES), store it under UPLOAD_DIR named by its SHA-256, run OCR,
    extract fields with the regex helpers, try to persist a row, and answer
    with a JSON summary. Blocking work (file write, OCR, DB insert) runs in
    the thread pool so the event loop stays responsive.

    Raises:
        HTTPException: 400 for a missing file name, a disallowed extension
            or an empty body; 413 when the body exceeds MAX_BYTES; 500 when
            the file cannot be written to UPLOAD_DIR.
    """
    # Ships with FastAPI, which this module already depends on.
    from fastapi.concurrency import run_in_threadpool

    if not file or not file.filename:
        raise HTTPException(400, "no file")

    # Validate the extension. The client-declared MIME type is recorded in
    # the payload below but is not validated here.
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXT:
        raise HTTPException(400, f"extension not allowed: {ext}")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading an arbitrarily large body into memory.
    data = await file.read(MAX_BYTES + 1)
    if not data:
        raise HTTPException(400, "empty file")
    if len(data) > MAX_BYTES:
        raise HTTPException(413, f"file too large: more than {MAX_BYTES} bytes")

    # Content-addressed storage: identical uploads share one file on disk.
    sha = hashlib.sha256(data).hexdigest()
    save_name = f"{sha}{ext}"
    abs_path = UPLOAD_DIR / save_name
    if not abs_path.exists():
        try:
            await run_in_threadpool(abs_path.write_bytes, data)
        except OSError as e:
            raise HTTPException(500, f"could not persist file: {e}") from e

    rel_path = f"uploads/ocr/{save_name}"

    # Pick the OCR routine and its dependency check by file type.
    if ext == ".pdf":
        ocr_available = _TESS_OK and _PDF2IMG_OK and _PIL_OK
        ocr_fn = _ocr_pdf_bytes
    else:
        ocr_available = _TESS_OK and _PIL_OK
        ocr_fn = _ocr_image_bytes

    # Run OCR
    ocr_text: Optional[str] = None
    ocr_conf: Optional[float] = None
    ocr_engine = "tesseract"
    if not ocr_available:
        ocr_status = "ocr_unavailable"
    else:
        # Tesseract can take seconds per page; keep it off the event loop.
        ocr_text, ocr_conf = await run_in_threadpool(ocr_fn, data)
        ocr_status = "ocr_done" if ocr_text else "ocr_failed"

    extracted = _extract_fields(ocr_text or "")

    # Truncated text for response (the full text still goes to the DB row).
    text_resp = (ocr_text or "")[:TEXT_CAP]

    payload: Dict[str, Any] = {
        "file_name": file.filename,
        "file_path": rel_path,
        "file_size": len(data),
        "mime": file.content_type or "application/octet-stream",
        "sha256": sha,
        "ocr_status": ocr_status,
        "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None,
        "ocr_text_full": ocr_text,
        "ocr_confidence": ocr_conf,
        "extracted": extracted,
        "ai_engine": "regex-v1",
    }

    # Persistence is best-effort: ``id`` is None when nothing was stored.
    inserted_id = await run_in_threadpool(_maybe_insert_upload, payload)

    return JSONResponse(
        {
            "ok": True,
            "id": inserted_id,
            "file_path": rel_path,
            "file_name": file.filename,
            "file_size": len(data),
            "mime": payload["mime"],
            "sha256": sha,
            "ocr_status": ocr_status,
            "ocr_confidence": ocr_conf,
            "ocr_text": text_resp if ocr_text else None,
            "extracted": extracted,
        }
    )