Merge agent1-ocr: OCR u ERP/CRM

2026-05-05 18:34:46 +02:00
parent 9b0ed43b92 f488623920
commit c4640ca3af
4 changed files with 615 additions and 1 deletions
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+# routers/ocr_router.py
+# Name:        PGŽ Sport OCR router (lightweight)
+# Version:     1.0.0
+# Authors:     Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
+# Date:        2026-05-05
+# Description: FastAPI APIRouter exposing POST /api/ocr/upload and
+#              GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract
+#              (pdf2image for PDF), extracts vendor / OIB / invoice_no /
+#              date / amount via simple regex, persists into
+#              pgz_sport.invoice_uploads when possible. Designed to
+#              degrade gracefully if pytesseract / pdf2image are not
+#              installed (returns ocr_status='ocr_unavailable').
+
+from __future__ import annotations
+
+import os
+import re
+import io
+import hashlib
+import json
+import traceback
+from pathlib import Path
+from datetime import datetime
+from typing import Optional, Tuple, Dict, Any, List
+
+from fastapi import APIRouter, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+
+import psycopg2
+import psycopg2.extras
+
+# ── Optional OCR deps ────────────────────────────────────────────────────────
+_TESS_OK = False
+_PDF2IMG_OK = False
+_PIL_OK = False
+try:
+    import pytesseract  # type: ignore
+    _TESS_OK = True
+except Exception:
+    pytesseract = None  # type: ignore
+
+try:
+    from pdf2image import convert_from_bytes  # type: ignore
+    _PDF2IMG_OK = True
+except Exception:
+    convert_from_bytes = None  # type: ignore
+
+try:
+    from PIL import Image  # type: ignore
+    _PIL_OK = True
+except Exception:
+    Image = None  # type: ignore
+
+# ── Config ───────────────────────────────────────────────────────────────────
+# Keyword arguments handed to psycopg2.connect() by _db().
+# Every value can be overridden through an OCR_DB_* environment variable; the
+# literals are only the legacy fallbacks that keep existing deployments
+# working unchanged.
+# SECURITY(review): a real password is committed here as the fallback value.
+# Rotate it, supply OCR_DB_PASSWORD from the environment / secret store, and
+# then remove the literal from source control.
+DB = dict(
+    host=os.getenv("OCR_DB_HOST", "10.10.0.2"),
+    port=int(os.getenv("OCR_DB_PORT", "6432")),
+    dbname=os.getenv("OCR_DB_NAME", "rinet_v3"),
+    user=os.getenv("OCR_DB_USER", "rinet"),
+    password=os.getenv("OCR_DB_PASSWORD", "R1net2026!SecureDB#v7"),
+)
+
+# Directory that receives the uploaded files. It is created at import time
+# (missing parents included) so the upload handler can write into it directly.
+UPLOAD_DIR = Path("/opt/pgz-sport") / "uploads" / "ocr"
+UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+
+# File-name suffixes the upload endpoint accepts (compared in lower case).
+ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"}
+# MIME types matching the accepted suffixes.
+# NOTE(review): not referenced anywhere in the visible part of this file.
+ALLOWED_MIME = {"application/pdf", "image/jpeg", "image/jpg", "image/png"}
+
+# Largest accepted upload: 25 MB.
+MAX_BYTES = 25 * 1024 ** 2
+# Upper bound (8 KB worth of characters) for OCR text echoed in the response.
+TEXT_CAP = 8 * 1024
+
+router = APIRouter(prefix="/api/ocr", tags=["ocr"])
+
+
+# ── DB helpers ───────────────────────────────────────────────────────────────
+def _db():
+    """Open a new psycopg2 connection from ``DB`` with autocommit enabled.
+
+    Every call creates a fresh connection; the caller is responsible for it.
+    """
+    conn = psycopg2.connect(**DB)
+    conn.autocommit = True
+    return conn
+
+
+def _table_columns(schema: str, table: str) -> List[str]:
+    """Return the column names of ``schema.table`` from information_schema.
+
+    Best-effort lookup: any failure (connecting, querying) yields an empty
+    list, so callers cannot tell "lookup failed" from "table has no columns".
+
+    Args:
+        schema: schema name, passed as a bound query parameter.
+        table: table name, passed as a bound query parameter.
+
+    Returns:
+        Column names in the order the database returned them, or ``[]``.
+    """
+    # Local import keeps this block self-contained.
+    from contextlib import closing
+
+    try:
+        # psycopg2's ``with connection:`` only ends the transaction and leaves
+        # the connection open; closing() guarantees it is released here
+        # instead of whenever the object happens to be garbage-collected.
+        with closing(_db()) as conn, conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT column_name FROM information_schema.columns
+                WHERE table_schema = %s AND table_name = %s
+                """,
+                (schema, table),
+            )
+            return [row[0] for row in cur.fetchall()]
+    except Exception:
+        return []
+
+
+# ── Regex extractors ─────────────────────────────────────────────────────────
+# "HR" (case-sensitive), optional whitespace, then exactly 11 digits as a
+# standalone token; the digits are captured in group 1.
+RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b")
+# Any standalone run of exactly 11 digits; captured in group 1.
+RE_OIB_BARE = re.compile(r"\b(\d{11})\b")
+# A whole line (case-insensitive, multiline mode) that contains the word
+# "Račun", "Racun" or "Invoice" with at most 80 characters after it.
+RE_INVOICE = re.compile(
+    r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$"
+)
+# DD.MM.YYYY or DD/MM/YYYY with two-digit day and month; groups are
+# (day, month, year).
+RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b")
+# YYYY-MM-DD; groups are (year, month, day).
+RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b")
+# Amount candidates (1.234,56 or 1234,56 or 1234.56 or 1,234.56), always with
+# exactly two decimal digits. The match must not be preceded by a word
+# character, "." or ",", and must not be followed by a word character.
+# In the comma-decimal grouped form the thousands separator may be "." or any
+# whitespace character (\s).
+RE_AMOUNT = re.compile(
+    r"(?<![\w.,])"
+    r"(\d{1,3}(?:[.\s]\d{3})+,\d{2}|\d+,\d{2}|\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})"
+    r"(?![\w])"
+)
+
+
+def _norm_amount(raw: str) -> Optional[float]:
+    """Convert a textual amount in comma- or dot-decimal notation to a float.
+
+    When a comma is present, whichever of "," and "." occurs last is taken as
+    the decimal separator and the other one is removed as a thousands
+    separator. Strings without a comma are passed to ``float`` as they are.
+
+    Args:
+        raw: amount text such as "1.234,56", "1 234,56", "1,234.56", "12.50".
+
+    Returns:
+        The parsed value, or None when the cleaned text is not a valid float.
+    """
+    # RE_AMOUNT accepts any whitespace as a thousands separator, so horizontal
+    # blanks other than U+0020 (tab, no-break and thin spaces) have to go too;
+    # otherwise float() rejects the amount and it is silently dropped.
+    # Line breaks are deliberately kept so a number that spans two lines is
+    # still rejected, exactly as before.
+    blanks = {ord(ch): None for ch in " \t\u00a0\u2007\u2009\u202f"}
+    s = raw.strip().translate(blanks)
+    if "," in s:
+        # rfind() gives -1 for a missing ".", so a lone comma is the decimal.
+        if s.rfind(",") > s.rfind("."):
+            s = s.replace(".", "").replace(",", ".")
+        else:
+            s = s.replace(",", "")
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+
+def _first_nonempty_line(text: str) -> Optional[str]:
+    """Return the first line of *text* that is not blank.
+
+    The line is stripped of surrounding whitespace and cut to 200 characters.
+    None is returned when *text* is empty/None or holds only blank lines.
+    """
+    stripped_lines = (line.strip() for line in (text or "").splitlines())
+    first = next((line for line in stripped_lines if line), None)
+    return None if first is None else first[:200]
+
+
+def _parse_date(text: str) -> Optional[str]:
+    """Find the first valid calendar date in *text* and return it as ISO text.
+
+    YYYY-MM-DD candidates are examined first, then DD.MM.YYYY / DD/MM/YYYY
+    ones. Within each layout every match is tried in order, so a single
+    impossible value (for example "2026-13-45") no longer hides a valid date
+    that appears later in the text.
+
+    Args:
+        text: OCR text; None or "" is treated as no text.
+
+    Returns:
+        The date formatted as "YYYY-MM-DD", or None when no candidate forms a
+        real calendar date.
+    """
+    haystack = text or ""
+    # (pattern, regex group numbers holding year, month, day)
+    layouts = (
+        (RE_DATE_YMD, (1, 2, 3)),
+        (RE_DATE_DMY, (3, 2, 1)),
+    )
+    for pattern, (g_year, g_month, g_day) in layouts:
+        for m in pattern.finditer(haystack):
+            try:
+                parsed = datetime(
+                    int(m.group(g_year)), int(m.group(g_month)), int(m.group(g_day))
+                )
+            except ValueError:
+                # Digits matched the layout but are not a real date.
+                continue
+            return parsed.date().isoformat()
+    return None
+
+
+def _parse_oib(text: str) -> Optional[str]:
+    """Extract an 11-digit OIB from *text*.
+
+    "HR"-prefixed numbers are considered first, bare 11-digit tokens second.
+    Within the first group that has any match, a candidate whose check digit
+    is valid (ISO 7064, MOD 11,10) wins, so an unrelated 11-digit number that
+    merely appears earlier is skipped. If no candidate in that group passes
+    the check, the group's first match is returned, which is the previous
+    behaviour and tolerates OCR misreads.
+
+    Args:
+        text: OCR text; None or "" is treated as no text.
+
+    Returns:
+        The 11 digits as a string, or None when no candidate is found.
+    """
+
+    def _check_digit_ok(oib: str) -> bool:
+        # ISO 7064 MOD 11,10 over the first ten digits; the eleventh digit is
+        # the control digit.
+        acc = 10
+        for ch in oib[:10]:
+            acc = (acc + int(ch)) % 10 or 10
+            acc = (acc * 2) % 11
+        return (11 - acc) % 10 == int(oib[10])
+
+    haystack = text or ""
+    for pattern in (RE_OIB_HR, RE_OIB_BARE):
+        found = [m.group(1) for m in pattern.finditer(haystack)]
+        if found:
+            return next((c for c in found if _check_digit_ok(c)), found[0])
+    return None
+
+
+def _parse_invoice_no(text: str) -> Optional[str]:
+    """Pull an invoice identifier out of the first line matched by RE_INVOICE.
+
+    The matched line is scanned for tokens made of upper-case letters, digits
+    and ``- / _ .``; the right-most token containing a digit is returned, cut
+    to 64 characters. When no such token exists the stripped line itself is
+    returned, cut to 120 characters. None means no line matched.
+    """
+    hit = RE_INVOICE.search(text or "")
+    if hit is None:
+        return None
+    line = hit.group(0).strip()
+    tokens = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line)
+    # Walk from the right; tokens without any digit are skipped.
+    with_digit = next(
+        (tok for tok in reversed(tokens) if any(map(str.isdigit, tok))),
+        None,
+    )
+    if with_digit is not None:
+        return with_digit[:64]
+    return line[:120]
+
+
+def _parse_amount(text: str) -> Optional[float]:
+    """Return the largest amount found in *text*.
+
+    Every RE_AMOUNT match is normalised with _norm_amount; values that fail
+    to normalise are ignored. None is returned for empty text or when no
+    usable amount is present.
+    """
+    if not text:
+        return None
+    values = (_norm_amount(m.group(1)) for m in RE_AMOUNT.finditer(text))
+    return max((v for v in values if v is not None), default=None)
+
+
+def _extract_fields(text: str) -> Dict[str, Any]:
+    """Run every field extractor over *text* and collect the results.
+
+    The returned dict always has the keys vendor, oib, invoice_no, date and
+    amount (in that order); a value is None when its extractor found nothing.
+    """
+    extractors = (
+        ("vendor", _first_nonempty_line),
+        ("oib", _parse_oib),
+        ("invoice_no", _parse_invoice_no),
+        ("date", _parse_date),
+        ("amount", _parse_amount),
+    )
+    return {field: extract(text) for field, extract in extractors}
+
+
+# ── OCR engine ───────────────────────────────────────────────────────────────
+def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
+    """OCR a single encoded image with Tesseract.
+
+    Args:
+        data: raw bytes of an image file that PIL can open.
+
+    Returns:
+        ``(text, confidence)``. ``text`` is None when pytesseract/PIL are not
+        available or when opening/recognising the image fails. ``confidence``
+        is the mean of the non-negative confidences reported by Tesseract,
+        rounded to two decimals, or None when it could not be determined.
+    """
+    if not (_TESS_OK and _PIL_OK):
+        return None, None
+    # Read the language setting once so both Tesseract calls agree.
+    lang = os.getenv("OCR_LANG", "hrv+eng")
+    try:
+        # The context manager releases the image's resources once OCR is done.
+        with Image.open(io.BytesIO(data)) as img:
+            img.load()
+            text = pytesseract.image_to_string(img, lang=lang)
+            conf: Optional[float] = None
+            # Confidence is best-effort: a failure here must not discard the
+            # text that was already recognised.
+            try:
+                table = pytesseract.image_to_data(
+                    img, output_type=pytesseract.Output.DICT, lang=lang
+                )
+                scores: List[float] = []
+                for raw in table.get("conf", []):
+                    # Values may arrive as ints, floats or strings such as
+                    # "96.43"; an isdigit()-style filter drops the float
+                    # forms entirely, so parse with float() instead.
+                    try:
+                        score = float(raw)
+                    except (TypeError, ValueError):
+                        continue
+                    # Negative values are excluded, as in the original filter.
+                    if score >= 0:
+                        scores.append(score)
+                if scores:
+                    conf = round(sum(scores) / len(scores), 2)
+            except Exception:
+                pass
+            return text, conf
+    except Exception:
+        return None, None
+
+
+def _ocr_pdf_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
+    """Rasterise the first pages of a PDF and OCR them with Tesseract.
+
+    Args:
+        data: raw bytes of a PDF document.
+
+    Returns:
+        ``(text, confidence)``. ``text`` joins the per-page texts with
+        ``"\\n\\f\\n"`` and is None when the OCR dependencies are missing, the
+        PDF cannot be rendered, or no page could be recognised.
+        ``confidence`` is the mean of the per-page mean confidences, rounded
+        to two decimals, or None when no page reported any.
+    """
+    if not (_TESS_OK and _PDF2IMG_OK):
+        return None, None
+    max_pages = 8
+    lang = os.getenv("OCR_LANG", "hrv+eng")
+    try:
+        # Ask pdf2image for the capped page range only. Rendering the whole
+        # document and slicing afterwards wastes time and memory on long PDFs.
+        pages = convert_from_bytes(
+            data, dpi=200, fmt="png", first_page=1, last_page=max_pages
+        )
+    except Exception:
+        return None, None
+    if not pages:
+        return None, None
+
+    out: List[str] = []
+    confs: List[float] = []
+    for page in pages[:max_pages]:
+        try:
+            out.append(pytesseract.image_to_string(page, lang=lang))
+        except Exception:
+            # One unreadable page must not abort the remaining pages.
+            continue
+        # Confidence is best-effort and never invalidates the page text.
+        try:
+            table = pytesseract.image_to_data(
+                page, output_type=pytesseract.Output.DICT, lang=lang
+            )
+            scores: List[float] = []
+            for raw in table.get("conf", []):
+                # Accept int, float and float-formatted string values alike.
+                try:
+                    score = float(raw)
+                except (TypeError, ValueError):
+                    continue
+                if score >= 0:
+                    scores.append(score)
+            if scores:
+                confs.append(sum(scores) / len(scores))
+        except Exception:
+            pass
+
+    text = "\n\f\n".join(out) if out else None
+    conf = round(sum(confs) / len(confs), 2) if confs else None
+    return text, conf
+
+
+# ── Persistence ──────────────────────────────────────────────────────────────
+def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]:
+    """Insert one row into pgz_sport.invoice_uploads and return its id.
+
+    Only columns that exist in the table right now and whose value is not
+    None are written, so the function keeps working while the table schema
+    differs from the full candidate list.
+
+    Args:
+        payload: upload description; the keys read are file_name, file_path,
+            file_size, mime, sha256, ocr_status, ocr_engine, ocr_text_full,
+            ocr_confidence, extracted (a dict) and ai_engine.
+
+    Returns:
+        The ``id`` returned by the INSERT, or None when the table's columns
+        cannot be read, none of the candidate columns exist, or the insert
+        fails (the failure is printed, not raised).
+    """
+    # Local import keeps this block self-contained.
+    from contextlib import closing
+
+    cols = set(_table_columns("pgz_sport", "invoice_uploads"))
+    if not cols:
+        return None
+
+    extracted = payload.get("extracted") or {}
+    # Candidate DB column -> value taken from the payload.
+    candidates: Dict[str, Any] = {
+        "file_name": payload.get("file_name"),
+        "file_path": payload.get("file_path"),
+        "file_size": payload.get("file_size"),
+        "mime": payload.get("mime"),
+        "sha256": payload.get("sha256"),
+        "ocr_status": payload.get("ocr_status"),
+        "ocr_engine": payload.get("ocr_engine"),
+        "ocr_text": payload.get("ocr_text_full"),
+        "ocr_confidence": payload.get("ocr_confidence"),
+        "ai_invoice_no": extracted.get("invoice_no"),
+        "ai_invoice_date": extracted.get("date"),
+        "ai_vendor_name": extracted.get("vendor"),
+        "ai_vendor_oib": extracted.get("oib"),
+        "ai_amount_gross": extracted.get("amount"),
+        "ai_engine": payload.get("ai_engine") or "regex-v1",
+        "ai_extracted": json.dumps(extracted),
+    }
+
+    writable = {k: v for k, v in candidates.items() if k in cols and v is not None}
+    if not writable:
+        return None
+
+    # Column names come from the fixed keys above (never from the request),
+    # so formatting them into the statement is safe; values stay bound.
+    sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format(
+        c=", ".join(writable),
+        p=", ".join(["%s"] * len(writable)),
+    )
+    try:
+        # psycopg2's ``with connection:`` does not close the connection;
+        # closing() releases it as soon as the insert is finished.
+        with closing(_db()) as conn, conn.cursor() as cur:
+            cur.execute(sql, list(writable.values()))
+            row = cur.fetchone()
+            return int(row[0]) if row else None
+    except Exception as e:
+        print(f"[ocr_router] insert failed: {e}")
+        return None
+
+
+# ── Endpoints ────────────────────────────────────────────────────────────────
+@router.get("/health")
+def health():
+    """Report which optional OCR dependencies were importable.
+
+    ``tesseract_available`` needs both pytesseract and PIL;
+    ``pdf2image_available`` reflects pdf2image alone. ``upload_dir`` is the
+    configured storage directory as a string.
+    """
+    image_ocr_ready = bool(_TESS_OK and _PIL_OK)
+    return dict(
+        ok=True,
+        tesseract_available=image_ocr_ready,
+        pdf2image_available=bool(_PDF2IMG_OK),
+        upload_dir=str(UPLOAD_DIR),
+    )
+
+
+@router.post("/upload")
+async def upload(file: UploadFile = File(...)):
+    """Store an uploaded PDF/JPG/PNG, OCR it and return the extracted fields.
+
+    Steps: validate the file-name extension, read at most MAX_BYTES + 1 bytes,
+    save the file under its SHA-256 name in UPLOAD_DIR, run OCR when the
+    required libraries are present, extract fields from the text, try to
+    record the upload in the database, and answer with a JSON summary.
+
+    Blocking work (disk write, OCR, database insert) runs in the thread pool
+    so that one slow document does not stall the event loop for every other
+    request.
+
+    Raises:
+        HTTPException 400: no file / file name, extension not allowed, or
+            empty body.
+        HTTPException 413: body larger than MAX_BYTES.
+        HTTPException 500: the file could not be written to UPLOAD_DIR.
+    """
+    # Local import keeps this block self-contained; FastAPI is already a
+    # dependency of this module.
+    from fastapi.concurrency import run_in_threadpool
+
+    if not file or not file.filename:
+        raise HTTPException(400, "no file")
+
+    # Only the extension is validated; the client-supplied MIME type is
+    # recorded further down but not enforced.
+    ext = Path(file.filename).suffix.lower()
+    if ext not in ALLOWED_EXT:
+        raise HTTPException(400, f"extension not allowed: {ext}")
+
+    # Bounded read: one byte beyond the limit is enough to detect an oversized
+    # upload without pulling an arbitrarily large body into memory.
+    data = await file.read(MAX_BYTES + 1)
+    if not data:
+        raise HTTPException(400, "empty file")
+    if len(data) > MAX_BYTES:
+        raise HTTPException(413, f"file too large: limit is {MAX_BYTES} bytes")
+
+    # Content-addressed storage: identical uploads share a single file.
+    sha = hashlib.sha256(data).hexdigest()
+    save_name = f"{sha}{ext}"
+    abs_path = UPLOAD_DIR / save_name
+    if not abs_path.exists():
+        try:
+            await run_in_threadpool(abs_path.write_bytes, data)
+        except Exception as e:
+            raise HTTPException(500, f"could not persist file: {e}")
+
+    rel_path = f"uploads/ocr/{save_name}"
+
+    # Run OCR
+    ocr_text: Optional[str] = None
+    ocr_conf: Optional[float] = None
+    ocr_engine = "tesseract"
+    if ext == ".pdf":
+        engine_ready = _TESS_OK and _PDF2IMG_OK and _PIL_OK
+        run_ocr = _ocr_pdf_bytes
+    else:
+        engine_ready = _TESS_OK and _PIL_OK
+        run_ocr = _ocr_image_bytes
+
+    if not engine_ready:
+        ocr_status = "ocr_unavailable"
+    else:
+        ocr_text, ocr_conf = await run_in_threadpool(run_ocr, data)
+        ocr_status = "ocr_done" if ocr_text else "ocr_failed"
+
+    extracted = _extract_fields(ocr_text or "")
+
+    # Truncated text for the response; the full text still goes to the DB.
+    text_resp = (ocr_text or "")[:TEXT_CAP]
+
+    payload: Dict[str, Any] = {
+        "file_name": file.filename,
+        "file_path": rel_path,
+        "file_size": len(data),
+        "mime": file.content_type or "application/octet-stream",
+        "sha256": sha,
+        "ocr_status": ocr_status,
+        "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None,
+        "ocr_text_full": ocr_text,
+        "ocr_confidence": ocr_conf,
+        "extracted": extracted,
+        "ai_engine": "regex-v1",
+    }
+
+    # Persistence is best-effort: a None id means nothing was stored.
+    inserted_id = await run_in_threadpool(_maybe_insert_upload, payload)
+
+    return JSONResponse(
+        {
+            "ok": True,
+            "id": inserted_id,
+            "file_path": rel_path,
+            "file_name": file.filename,
+            "file_size": len(data),
+            "mime": payload["mime"],
+            "sha256": sha,
+            "ocr_status": ocr_status,
+            "ocr_confidence": ocr_conf,
+            "ocr_text": text_resp if ocr_text else None,
+            "extracted": extracted,
+        }
+    )