From f4886239201d9f2cfe38272347139c85de3cce16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Raduli=C4=87?= Date: Tue, 5 May 2026 18:28:22 +0200 Subject: [PATCH] =?UTF-8?q?Task=201:=20OCR=20u=20ERP/CRM=20=E2=80=94=20/ap?= =?UTF-8?q?i/ocr/upload=20+=20tab=20Ra=C4=8Duni=20(OCR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - routers/ocr_router.py: POST /api/ocr/upload (Tesseract+pdf2image, regex field extraction) - pgz_sport_api.py: mount ocr_router with try/except guard - static/erp_full.html: nova tab "πŸ“· OCR" + panel - static/crm_v2.html: OCR upload modal/tab Co-Authored-By: Claude Opus 4.7 (1M context) --- pgz_sport_api.py | 7 + routers/ocr_router.py | 403 ++++++++++++++++++++++++++++++++++++++++++ static/crm_v2.html | 80 +++++++++ static/erp_full.html | 126 ++++++++++++- 4 files changed, 615 insertions(+), 1 deletion(-) create mode 100644 routers/ocr_router.py diff --git a/pgz_sport_api.py b/pgz_sport_api.py index bb0c21b..77a6a33 100644 --- a/pgz_sport_api.py +++ b/pgz_sport_api.py @@ -1695,6 +1695,13 @@ try: except Exception as e: print(f'[ERP/OCR] router fail: {e}') +try: + from routers.ocr_router import router as ocr_router + app.include_router(ocr_router) + print('[startup] ocr_router mounted') +except Exception as e: + print(f'[startup] ocr_router skipped: {e}') + try: from erp.putni_nalozi import router as erp_putni_router app.include_router(erp_putni_router) diff --git a/routers/ocr_router.py b/routers/ocr_router.py new file mode 100644 index 0000000..1e94c77 --- /dev/null +++ b/routers/ocr_router.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +# routers/ocr_router.py +# Name: PGΕ½ Sport OCR router (lightweight) +# Version: 1.0.0 +# Authors: Damir RaduliΔ‡ / +# Date: 2026-05-05 +# Description: FastAPI APIRouter exposing POST /api/ocr/upload and +# GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract +# (pdf2image for PDF), extracts vendor / OIB / invoice_no / +# date / amount via simple regex, persists into +# pgz_sport.invoice_uploads when possible. Designed to +# degrade gracefully if pytesseract / pdf2image are not +# installed (returns ocr_status='ocr_unavailable'). + +from __future__ import annotations + +import os +import re +import io +import hashlib +import json +import traceback +from pathlib import Path +from datetime import datetime +from typing import Optional, Tuple, Dict, Any, List + +from fastapi import APIRouter, UploadFile, File, HTTPException +from fastapi.responses import JSONResponse + +import psycopg2 +import psycopg2.extras + +# ── Optional OCR deps ──────────────────────────────────────────────────────── +_TESS_OK = False +_PDF2IMG_OK = False +_PIL_OK = False +try: + import pytesseract # type: ignore + _TESS_OK = True +except Exception: + pytesseract = None # type: ignore + +try: + from pdf2image import convert_from_bytes # type: ignore + _PDF2IMG_OK = True +except Exception: + convert_from_bytes = None # type: ignore + +try: + from PIL import Image # type: ignore + _PIL_OK = True +except Exception: + Image = None # type: ignore + +# ── Config ─────────────────────────────────────────────────────────────────── +DB = dict( + host="10.10.0.2", + port=6432, + dbname="rinet_v3", + user="rinet", + password="R1net2026!SecureDB#v7", +) + +UPLOAD_DIR = Path("/opt/pgz-sport/uploads/ocr") +UPLOAD_DIR.mkdir(parents=True, exist_ok=True) + +ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"} +ALLOWED_MIME = { + "application/pdf", + "image/jpeg", + "image/jpg", + "image/png", +} +MAX_BYTES = 25 * 1024 * 1024 # 25 MB +TEXT_CAP = 8 * 1024 # 8 KB cap for response text payload + +router = APIRouter(prefix="/api/ocr", tags=["ocr"]) + + +# ── DB helpers ─────────────────────────────────────────────────────────────── +def _db(): + c = psycopg2.connect(**DB) + c.autocommit = True + return c + + +def _table_columns(schema: str, table: str) -> List[str]: + try: + with _db() as c, c.cursor() as cur: + cur.execute( + """ + SELECT column_name FROM information_schema.columns + WHERE table_schema = %s AND table_name = %s + """, + (schema, table), + ) + return [r[0] for r in cur.fetchall()] + except Exception: + return [] + + +# ── Regex extractors ───────────────────────────────────────────────────────── +RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b") +RE_OIB_BARE = re.compile(r"\b(\d{11})\b") +RE_INVOICE = re.compile( + r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$" +) +RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b") +RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b") +# Amount candidates (1.234,56 or 1234,56 or 1234.56 or 1,234.56), at least 2 digits +RE_AMOUNT = re.compile( + r"(? Optional[float]: + s = raw.strip().replace(" ", "") + # If both . and , present, assume , decimal if last separator is , + if "," in s and "." in s: + if s.rfind(",") > s.rfind("."): + s = s.replace(".", "").replace(",", ".") + else: + s = s.replace(",", "") + elif "," in s: + # 1.234,56 or 1234,56 β†’ swap + s = s.replace(".", "").replace(",", ".") + try: + return float(s) + except Exception: + return None + + +def _first_nonempty_line(text: str) -> Optional[str]: + for ln in (text or "").splitlines(): + v = ln.strip() + if v: + return v[:200] + return None + + +def _parse_date(text: str) -> Optional[str]: + m = RE_DATE_YMD.search(text or "") + if m: + try: + return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3))).date().isoformat() + except Exception: + pass + m = RE_DATE_DMY.search(text or "") + if m: + try: + return datetime(int(m.group(3)), int(m.group(2)), int(m.group(1))).date().isoformat() + except Exception: + pass + return None + + +def _parse_oib(text: str) -> Optional[str]: + m = RE_OIB_HR.search(text or "") + if m: + return m.group(1) + m = RE_OIB_BARE.search(text or "") + if m: + return m.group(1) + return None + + +def _parse_invoice_no(text: str) -> Optional[str]: + m = RE_INVOICE.search(text or "") + if not m: + return None + line = m.group(0).strip() + # Try to grab the right-most token that looks like an invoice id + cand = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line) + if cand: + # Drop pure words like "Račun"/"Invoice" + for c in reversed(cand): + if any(ch.isdigit() for ch in c): + return c[:64] + return line[:120] + + +def _parse_amount(text: str) -> Optional[float]: + if not text: + return None + best: Optional[float] = None + for m in RE_AMOUNT.finditer(text): + v = _norm_amount(m.group(1)) + if v is None: + continue + if best is None or v > best: + best = v + return best + + +def _extract_fields(text: str) -> Dict[str, Any]: + return { + "vendor": _first_nonempty_line(text), + "oib": _parse_oib(text), + "invoice_no": _parse_invoice_no(text), + "date": _parse_date(text), + "amount": _parse_amount(text), + } + + +# ── OCR engine ─────────────────────────────────────────────────────────────── +def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]: + if not (_TESS_OK and _PIL_OK): + return None, None + try: + img = Image.open(io.BytesIO(data)) + img.load() + text = pytesseract.image_to_string(img, lang=os.getenv("OCR_LANG", "hrv+eng")) + # Confidence (best-effort) + conf = None + try: + d = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, + lang=os.getenv("OCR_LANG", "hrv+eng")) + confs = [int(c) for c in d.get("conf", []) if str(c).lstrip("-").isdigit() and int(c) >= 0] + if confs: + conf = round(sum(confs) / len(confs), 2) + except Exception: + pass + return text, conf + except Exception: + return None, None + + +def _ocr_pdf_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]: + if not (_TESS_OK and _PDF2IMG_OK): + return None, None + try: + pages = convert_from_bytes(data, dpi=200, fmt="png") + except Exception: + return None, None + if not pages: + return None, None + out: List[str] = [] + confs: List[float] = [] + for p in pages[:8]: # cap to 8 pages + try: + out.append(pytesseract.image_to_string(p, lang=os.getenv("OCR_LANG", "hrv+eng"))) + try: + d = pytesseract.image_to_data(p, output_type=pytesseract.Output.DICT, + lang=os.getenv("OCR_LANG", "hrv+eng")) + cs = [int(c) for c in d.get("conf", []) if str(c).lstrip("-").isdigit() and int(c) >= 0] + if cs: + confs.append(sum(cs) / len(cs)) + except Exception: + pass + except Exception: + continue + text = "\n\f\n".join(out) if out else None + conf = round(sum(confs) / len(confs), 2) if confs else None + return text, conf + + +# ── Persistence ────────────────────────────────────────────────────────────── +def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]: + """Insert into pgz_sport.invoice_uploads β€” only writes columns that exist.""" + cols = set(_table_columns("pgz_sport", "invoice_uploads")) + if not cols: + return None + + # Map our payload keys to potential DB columns + candidates: Dict[str, Any] = { + "file_name": payload.get("file_name"), + "file_path": payload.get("file_path"), + "file_size": payload.get("file_size"), + "mime": payload.get("mime"), + "sha256": payload.get("sha256"), + "ocr_status": payload.get("ocr_status"), + "ocr_engine": payload.get("ocr_engine"), + "ocr_text": payload.get("ocr_text_full"), + "ocr_confidence": payload.get("ocr_confidence"), + "ai_invoice_no": (payload.get("extracted") or {}).get("invoice_no"), + "ai_invoice_date": (payload.get("extracted") or {}).get("date"), + "ai_vendor_name": (payload.get("extracted") or {}).get("vendor"), + "ai_vendor_oib": (payload.get("extracted") or {}).get("oib"), + "ai_amount_gross": (payload.get("extracted") or {}).get("amount"), + "ai_engine": payload.get("ai_engine") or "regex-v1", + "ai_extracted": json.dumps(payload.get("extracted") or {}), + } + + insert_cols: List[str] = [] + insert_vals: List[Any] = [] + for k, v in candidates.items(): + if k in cols and v is not None: + insert_cols.append(k) + insert_vals.append(v) + + if not insert_cols: + return None + + sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format( + c=", ".join(insert_cols), + p=", ".join(["%s"] * len(insert_cols)), + ) + try: + with _db() as c, c.cursor() as cur: + cur.execute(sql, insert_vals) + row = cur.fetchone() + return int(row[0]) if row else None + except Exception as e: + print(f"[ocr_router] insert failed: {e}") + return None + + +# ── Endpoints ──────────────────────────────────────────────────────────────── +@router.get("/health") +def health(): + return { + "ok": True, + "tesseract_available": bool(_TESS_OK and _PIL_OK), + "pdf2image_available": bool(_PDF2IMG_OK), + "upload_dir": str(UPLOAD_DIR), + } + + +@router.post("/upload") +async def upload(file: UploadFile = File(...)): + if not file or not file.filename: + raise HTTPException(400, "no file") + + # Validate extension/mime + ext = Path(file.filename).suffix.lower() + if ext not in ALLOWED_EXT: + raise HTTPException(400, f"extension not allowed: {ext}") + + # Read full body (bounded) + data = await file.read() + if not data: + raise HTTPException(400, "empty file") + if len(data) > MAX_BYTES: + raise HTTPException(413, f"file too large: {len(data)} > {MAX_BYTES}") + + sha = hashlib.sha256(data).hexdigest() + save_name = f"{sha}{ext}" + abs_path = UPLOAD_DIR / save_name + if not abs_path.exists(): + try: + abs_path.write_bytes(data) + except Exception as e: + raise HTTPException(500, f"could not persist file: {e}") + + rel_path = f"uploads/ocr/{save_name}" + + # Run OCR + ocr_text: Optional[str] = None + ocr_conf: Optional[float] = None + ocr_engine = "tesseract" + if ext == ".pdf": + if not (_TESS_OK and _PDF2IMG_OK and _PIL_OK): + ocr_status = "ocr_unavailable" + else: + ocr_text, ocr_conf = _ocr_pdf_bytes(data) + ocr_status = "ocr_done" if ocr_text else "ocr_failed" + else: + if not (_TESS_OK and _PIL_OK): + ocr_status = "ocr_unavailable" + else: + ocr_text, ocr_conf = _ocr_image_bytes(data) + ocr_status = "ocr_done" if ocr_text else "ocr_failed" + + extracted = _extract_fields(ocr_text or "") + + # Truncated text for response + text_resp = (ocr_text or "") + if len(text_resp) > TEXT_CAP: + text_resp = text_resp[:TEXT_CAP] + + payload: Dict[str, Any] = { + "file_name": file.filename, + "file_path": rel_path, + "file_size": len(data), + "mime": file.content_type or "application/octet-stream", + "sha256": sha, + "ocr_status": ocr_status, + "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None, + "ocr_text_full": ocr_text, + "ocr_confidence": ocr_conf, + "extracted": extracted, + "ai_engine": "regex-v1", + } + + inserted_id = _maybe_insert_upload(payload) + + return JSONResponse( + { + "ok": True, + "id": inserted_id, + "file_path": rel_path, + "file_name": file.filename, + "file_size": len(data), + "mime": payload["mime"], + "sha256": sha, + "ocr_status": ocr_status, + "ocr_confidence": ocr_conf, + "ocr_text": text_resp if ocr_text else None, + "extracted": extracted, + } + ) diff --git a/static/crm_v2.html b/static/crm_v2.html index bb0b704..1776464 100644 --- a/static/crm_v2.html +++ b/static/crm_v2.html @@ -484,6 +484,33 @@ footer { height:36px; background:var(--bg2); border-top:1px solid var(--rim); + + + + +