2e022a7dcc
BUGS FIXED:
1. _serve_spa_fallback() returned index.html instead of sport2.html
→ User clicked /analitika /sufinanciranje etc and got wrong UI (DABI title)
→ Should serve sport2.html (PGZ SPORT - Platforma) with Analiza/Mreza/Link tabs
2. 9 router files had "from __future__" NOT at top of file
→ SyntaxError on import → routers SKIPPED → intermittent API failures
→ Affected: ocr.py, ocr_router.py, putni_nalozi.py, obrasci_router.py,
clan_panel_router.py, audit_seal_router.py, erp_full_router.py,
notif_router.py, seal.py
ROOT CAUSE:
A prior dehardcode batch (Master Zakon #1 sweep) inserted env-loading
imports BEFORE "from __future__ import annotations" — Python requires
__future__ imports to precede every other statement (only comments, blank lines and the module docstring may come first).
FIX:
- _serve_spa_fallback() candidates list: sport2.html first
- Moved __future__ to top (preserving shebang + encoding + comments) in all 9
VERIFIED:
- 0 failed routers (was 7+)
- Analiza API: 10/10 success ~60-87ms
- Summary API: 5/5 success ~40ms
- sport.rinet.one/ → PGZ SPORT - Platforma (Analiza+Mreza tabs)
- All 9 SPA fallback routes serve sport2.html
Damir uploaded a screenshot showing the Analiza tab working (2,049 igraca,
82 klubova) but described the behavior as intermittent — the root cause was
router import failures that left some API endpoints missing or unreliable. Fixed.
407 lines
14 KiB
Python
407 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
from dotenv import load_dotenv
|
|
load_dotenv('/opt/rinet-gpu/.env.master')
|
|
# auto-added by patch_scrapers_with_dotenv.sh
|
|
# routers/ocr_router.py
|
|
# Name: PGŽ Sport OCR router (lightweight)
|
|
# Version: 1.0.0
|
|
# Authors: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
|
# Date: 2026-05-05
|
|
# Description: FastAPI APIRouter exposing POST /api/ocr/upload and
|
|
# GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract
|
|
# (pdf2image for PDF), extracts vendor / OIB / invoice_no /
|
|
# date / amount via simple regex, persists into
|
|
# pgz_sport.invoice_uploads when possible. Designed to
|
|
# degrade gracefully if pytesseract / pdf2image are not
|
|
# installed (returns ocr_status='ocr_unavailable').
|
|
|
|
|
|
import os
|
|
import re
|
|
import io
|
|
import hashlib
|
|
import json
|
|
import traceback
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional, Tuple, Dict, Any, List
|
|
|
|
from fastapi import APIRouter, UploadFile, File, HTTPException
|
|
from fastapi.responses import JSONResponse
|
|
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# ── Optional OCR deps ────────────────────────────────────────────────────────
|
|
# Each optional dependency is probed on its own. A failed import leaves the
# imported name bound to None and the matching availability flag False, so
# this module still loads when the OCR stack is not installed.
try:
    import pytesseract  # type: ignore
except Exception:
    pytesseract = None  # type: ignore
_TESS_OK = pytesseract is not None

try:
    from pdf2image import convert_from_bytes  # type: ignore
except Exception:
    convert_from_bytes = None  # type: ignore
_PDF2IMG_OK = convert_from_bytes is not None

try:
    from PIL import Image  # type: ignore
except Exception:
    Image = None  # type: ignore
_PIL_OK = Image is not None
|
|
|
|
# ── Config ───────────────────────────────────────────────────────────────────
|
|
# Keyword arguments handed to psycopg2.connect(). Only the password comes from
# the environment; a missing DB_PASSWORD raises KeyError while this module is
# being imported.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# Directory where uploaded files are stored; created at import time if absent.
UPLOAD_DIR = Path("/opt/pgz-sport/uploads/ocr")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Accepted file-name suffixes (compared lower-cased).
ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"}
# MIME types matching the accepted suffixes.
ALLOWED_MIME = {
    "application/pdf",
    "image/jpeg",
    "image/jpg",
    "image/png",
}
MAX_BYTES = 25 * 1024 ** 2  # 25 MB
TEXT_CAP = 8 * 1024  # 8 KB cap for response text payload

router = APIRouter(prefix="/api/ocr", tags=["ocr"])
|
|
|
|
|
|
# ── DB helpers ───────────────────────────────────────────────────────────────
|
|
def _db():
    """Open a new psycopg2 connection from ``DB`` with autocommit switched on."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
|
|
|
|
|
def _table_columns(schema: str, table: str) -> List[str]:
    """Return the column names of ``schema.table`` from information_schema.

    Args:
        schema: Schema name to look up.
        table: Table name to look up.

    Returns:
        The column names in the order the query yields them, or an empty list
        when the lookup fails for any reason (connection error, query error),
        so callers can treat the table as unavailable instead of crashing.
    """
    try:
        conn = _db()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    SELECT column_name FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    """,
                    (schema, table),
                )
                return [row[0] for row in cur.fetchall()]
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection has to be closed explicitly to be released.
            conn.close()
    except Exception:
        # Best-effort lookup: any failure is reported as "no columns".
        return []
|
|
|
|
|
|
# ── Regex extractors ─────────────────────────────────────────────────────────
|
|
# OIB candidates: 11 consecutive digits, either prefixed with "HR" or bare.
RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b")
RE_OIB_BARE = re.compile(r"\b(\d{11})\b")
# A whole line mentioning "Račun"/"Racun"/"Invoice" (case-insensitive) with at
# most 80 characters following the keyword.
RE_INVOICE = re.compile(
    r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$"
)
# Dates as DD.MM.YYYY / DD/MM/YYYY and as YYYY-MM-DD.
RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b")
RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b")
# Amount candidates (1.234,56 or 1 234,56 or 1234,56 or 1234.56 or 1,234.56),
# always with two decimal digits. Thousands groups may be separated by a dot,
# a space or a (narrow) no-break space, but never by a line break: a number
# that ends one line must not fuse with the number that starts the next one.
RE_AMOUNT = re.compile(
    r"(?<![\w.,])"
    r"(\d{1,3}(?:[. \u00a0\u202f]\d{3})+,\d{2}|\d+,\d{2}|\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})"
    r"(?![\w])"
)
|
|
|
|
|
|
def _norm_amount(raw: str) -> Optional[float]:
    """Convert a textual amount such as ``1.234,56`` or ``1,234.56`` to a float.

    Args:
        raw: Amount text; surrounding whitespace is ignored.

    Returns:
        The parsed value, or None when the text is not a parsable number.
    """
    s = raw.strip()
    # Drop space-like thousands separators. Extracted text may use a no-break
    # or thin space instead of a plain one. Line breaks are deliberately left
    # in place so a candidate spanning two lines still fails to parse.
    for sep in (" ", "\u00a0", "\u202f", "\u2009"):
        s = s.replace(sep, "")

    last_comma = s.rfind(",")
    last_dot = s.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # Both separators present: whichever comes last is the decimal mark.
        if last_comma > last_dot:
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif last_comma != -1:
        # Comma only (1234,56): the comma is the decimal mark.
        s = s.replace(",", ".")

    try:
        return float(s)
    except ValueError:
        return None
|
|
|
|
|
|
def _first_nonempty_line(text: str) -> Optional[str]:
    """Return the first line of ``text`` that is not blank, stripped and cut to 200 chars.

    Returns None when ``text`` is empty/None or contains only blank lines.
    """
    stripped_lines = (line.strip() for line in (text or "").splitlines())
    first = next((line for line in stripped_lines if line), None)
    if first is None:
        return None
    return first[:200]
|
|
|
|
|
|
def _parse_date(text: str) -> Optional[str]:
    """Find the first valid calendar date in ``text`` and return it as ISO ``YYYY-MM-DD``.

    ``YYYY-MM-DD`` candidates are preferred over ``DD.MM.YYYY`` / ``DD/MM/YYYY``
    ones. Within each format every candidate is tried in order, so a candidate
    that is not a real date (e.g. ``99.99.2024``) does not hide a later valid
    one.

    Returns:
        The ISO date string, or None when no valid date is found.
    """
    haystack = text or ""
    # Each entry: (pattern, group numbers holding year, month, day).
    layouts = (
        (RE_DATE_YMD, (1, 2, 3)),
        (RE_DATE_DMY, (3, 2, 1)),
    )
    for pattern, (g_year, g_month, g_day) in layouts:
        for m in pattern.finditer(haystack):
            try:
                return datetime(
                    int(m.group(g_year)), int(m.group(g_month)), int(m.group(g_day))
                ).date().isoformat()
            except ValueError:
                # Digits matched the layout but do not form a real date.
                continue
    return None
|
|
|
|
|
|
def _parse_oib(text: str) -> Optional[str]:
    """Return the first 11-digit OIB candidate in ``text``.

    A number written with the ``HR`` prefix wins over a bare 11-digit number.
    Returns None when neither pattern matches.
    """
    haystack = text or ""
    for pattern in (RE_OIB_HR, RE_OIB_BARE):
        hit = pattern.search(haystack)
        if hit is not None:
            return hit.group(1)
    return None
|
|
|
|
|
|
def _parse_invoice_no(text: str) -> Optional[str]:
    """Extract an invoice identifier from the first line matching ``RE_INVOICE``.

    Returns the right-most token on that line that contains a digit (cut to
    64 chars). When no token qualifies, the stripped line itself is returned,
    cut to 120 chars. Returns None when no line matches.
    """
    hit = RE_INVOICE.search(text or "")
    if hit is None:
        return None

    line = hit.group(0).strip()
    # Candidate ids: upper-case letters/digits, optionally joined by - / _ .
    tokens = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line)
    # Scanning right-to-left and requiring a digit skips plain words such as
    # "Račun"/"Invoice".
    with_digit = next(
        (tok for tok in reversed(tokens) if any(ch.isdigit() for ch in tok)),
        None,
    )
    if with_digit is not None:
        return with_digit[:64]
    return line[:120]
|
|
|
|
|
|
def _parse_amount(text: str) -> Optional[float]:
    """Return the largest parsable amount found in ``text``.

    Every ``RE_AMOUNT`` match is normalised with ``_norm_amount``; matches that
    fail to parse are ignored. Returns None for empty text or when nothing
    parses.
    """
    if not text:
        return None
    parsed = (_norm_amount(m.group(1)) for m in RE_AMOUNT.finditer(text))
    return max((value for value in parsed if value is not None), default=None)
|
|
|
|
|
|
def _extract_fields(text: str) -> Dict[str, Any]:
    """Run every field extractor over ``text`` and collect the results.

    Returns a dict with the keys ``vendor``, ``oib``, ``invoice_no``, ``date``
    and ``amount``; each value is whatever the matching helper returned
    (None when nothing was found).
    """
    extractors = (
        ("vendor", _first_nonempty_line),
        ("oib", _parse_oib),
        ("invoice_no", _parse_invoice_no),
        ("date", _parse_date),
        ("amount", _parse_amount),
    )
    return {key: extract(text) for key, extract in extractors}
|
|
|
|
|
|
# ── OCR engine ───────────────────────────────────────────────────────────────
|
|
# Only the first pages of a PDF are rendered and sent through OCR.
_OCR_MAX_PDF_PAGES = 8


def _mean_confidence(conf_values: Any) -> Optional[float]:
    """Average the usable entries of a Tesseract ``conf`` column.

    Entries may be ints, floats or numeric strings (integer or decimal).
    Negative values and anything non-numeric are skipped.

    Returns:
        The unrounded mean, or None when no usable entry remains.
    """
    usable: List[float] = []
    for raw in conf_values or []:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        if value >= 0:
            usable.append(value)
    if not usable:
        return None
    return sum(usable) / len(usable)


def _confidence_of(image: Any, lang: str) -> Optional[float]:
    """Best-effort mean word confidence for one image; None when unavailable."""
    try:
        details = pytesseract.image_to_data(
            image, output_type=pytesseract.Output.DICT, lang=lang
        )
        return _mean_confidence(details.get("conf", []))
    except Exception:
        # Confidence is optional metadata: never fail the OCR run because of it.
        return None


def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """OCR a single raster image given as raw bytes.

    Args:
        data: Encoded image bytes (opened with PIL).

    Returns:
        ``(text, confidence)`` where confidence is the mean word confidence
        rounded to 2 decimals or None. ``(None, None)`` when pytesseract/PIL
        are unavailable or OCR fails.
    """
    if not (_TESS_OK and _PIL_OK):
        return None, None
    lang = os.getenv("OCR_LANG", "hrv+eng")
    try:
        # The context manager releases the decoded image once OCR is done.
        with Image.open(io.BytesIO(data)) as img:
            img.load()
            text = pytesseract.image_to_string(img, lang=lang)
            conf = _confidence_of(img, lang)
    except Exception as exc:
        print(f"[ocr_router] image OCR failed: {exc}")
        return None, None
    return text, (round(conf, 2) if conf is not None else None)


def _ocr_pdf_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """OCR the first pages of a PDF given as raw bytes.

    Pages are rendered at 200 dpi via pdf2image; at most
    ``_OCR_MAX_PDF_PAGES`` pages are rendered and recognised. Pages whose OCR
    fails are skipped.

    Returns:
        ``(text, confidence)`` where text is the page texts joined by
        ``"\\n\\f\\n"`` and confidence is the mean of the per-page means
        rounded to 2 decimals (None when unavailable). ``(None, None)`` when
        the OCR stack is unavailable, rendering fails or the PDF has no pages.
    """
    if not (_TESS_OK and _PDF2IMG_OK):
        return None, None
    try:
        # Limit rendering itself, not just OCR, so a very long PDF is not
        # rasterised in full before the page cap is applied.
        pages = convert_from_bytes(
            data, dpi=200, fmt="png", last_page=_OCR_MAX_PDF_PAGES
        )
    except Exception as exc:
        print(f"[ocr_router] pdf rendering failed: {exc}")
        return None, None
    if not pages:
        return None, None

    lang = os.getenv("OCR_LANG", "hrv+eng")
    texts: List[str] = []
    confs: List[float] = []
    for page_no, page in enumerate(pages[:_OCR_MAX_PDF_PAGES], start=1):
        try:
            texts.append(pytesseract.image_to_string(page, lang=lang))
        except Exception as exc:
            print(f"[ocr_router] pdf OCR failed on page {page_no}: {exc}")
            continue
        page_conf = _confidence_of(page, lang)
        if page_conf is not None:
            confs.append(page_conf)

    text = "\n\f\n".join(texts) if texts else None
    conf = round(sum(confs) / len(confs), 2) if confs else None
    return text, conf
|
|
|
|
|
|
# ── Persistence ──────────────────────────────────────────────────────────────
|
|
def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]:
    """Insert into pgz_sport.invoice_uploads — only writes columns that exist.

    Args:
        payload: Upload description; the keys read here are ``file_name``,
            ``file_path``, ``file_size``, ``mime``, ``sha256``, ``ocr_status``,
            ``ocr_engine``, ``ocr_text_full``, ``ocr_confidence``,
            ``extracted`` (dict of parsed fields) and ``ai_engine``.

    Returns:
        The ``id`` returned by the INSERT, or None when the table has no
        visible columns, nothing can be written, or the insert fails.
    """
    cols = set(_table_columns("pgz_sport", "invoice_uploads"))
    if not cols:
        return None

    extracted = payload.get("extracted") or {}

    # Map our payload keys to potential DB columns
    candidates: Dict[str, Any] = {
        "file_name": payload.get("file_name"),
        "file_path": payload.get("file_path"),
        "file_size": payload.get("file_size"),
        "mime": payload.get("mime"),
        "sha256": payload.get("sha256"),
        "ocr_status": payload.get("ocr_status"),
        "ocr_engine": payload.get("ocr_engine"),
        "ocr_text": payload.get("ocr_text_full"),
        "ocr_confidence": payload.get("ocr_confidence"),
        "ai_invoice_no": extracted.get("invoice_no"),
        "ai_invoice_date": extracted.get("date"),
        "ai_vendor_name": extracted.get("vendor"),
        "ai_vendor_oib": extracted.get("oib"),
        "ai_amount_gross": extracted.get("amount"),
        "ai_engine": payload.get("ai_engine") or "regex-v1",
        "ai_extracted": json.dumps(extracted),
    }

    # Keep only columns that exist in the table and have a value to write.
    present = {k: v for k, v in candidates.items() if k in cols and v is not None}
    if not present:
        return None
    insert_cols: List[str] = list(present)
    insert_vals: List[Any] = list(present.values())

    # Column names come from the fixed ``candidates`` keys above, never from
    # request data; values are passed as bound parameters.
    sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format(
        c=", ".join(insert_cols),
        p=", ".join(["%s"] * len(insert_cols)),
    )
    try:
        conn = _db()
        try:
            with conn.cursor() as cur:
                cur.execute(sql, insert_vals)
                row = cur.fetchone()
                return int(row[0]) if row else None
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection has to be closed explicitly to be released.
            conn.close()
    except Exception as e:
        print(f"[ocr_router] insert failed: {e}")
        return None
|
|
|
|
|
|
# ── Endpoints ────────────────────────────────────────────────────────────────
|
|
@router.get("/health")
def health():
    """Report which optional OCR dependencies loaded and where uploads are stored."""
    tesseract_ready = bool(_TESS_OK and _PIL_OK)
    pdf_ready = bool(_PDF2IMG_OK)
    return {
        "ok": True,
        "tesseract_available": tesseract_ready,
        "pdf2image_available": pdf_ready,
        "upload_dir": str(UPLOAD_DIR),
    }
|
|
|
|
|
|
@router.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Store an uploaded PDF/JPG/PNG, OCR it and return the extracted fields.

    Steps: validate the file extension, read the body (capped at MAX_BYTES),
    save it under UPLOAD_DIR named by its SHA-256, run OCR, extract fields
    with the regex helpers, and try to record the upload in the database.

    Raises:
        HTTPException: 400 for a missing file, disallowed extension or empty
            body; 413 when the body exceeds MAX_BYTES; 500 when the file
            cannot be written.

    Returns:
        JSONResponse with file metadata, ``ocr_status`` (``ocr_done``,
        ``ocr_failed`` or ``ocr_unavailable``), the OCR text cut to TEXT_CAP
        characters, the extracted fields and the inserted row id (or None).
    """
    import asyncio  # local import: only this handler needs the event loop

    if not file or not file.filename:
        raise HTTPException(400, "no file")

    # Validate extension.
    # NOTE(review): ALLOWED_MIME is not consulted in this handler — confirm
    # whether the content type should be validated as well.
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXT:
        raise HTTPException(400, f"extension not allowed: {ext}")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    data = await file.read(MAX_BYTES + 1)
    if not data:
        raise HTTPException(400, "empty file")
    if len(data) > MAX_BYTES:
        raise HTTPException(413, f"file too large: limit is {MAX_BYTES} bytes")

    # Content-addressed storage: identical uploads share one file on disk.
    sha = hashlib.sha256(data).hexdigest()
    save_name = f"{sha}{ext}"
    abs_path = UPLOAD_DIR / save_name
    if not abs_path.exists():
        try:
            abs_path.write_bytes(data)
        except Exception as e:
            raise HTTPException(500, f"could not persist file: {e}") from e

    rel_path = f"uploads/ocr/{save_name}"

    # Run OCR
    ocr_text: Optional[str] = None
    ocr_conf: Optional[float] = None
    ocr_engine = "tesseract"
    if ext == ".pdf":
        engine_ready = _TESS_OK and _PDF2IMG_OK and _PIL_OK
        ocr_fn = _ocr_pdf_bytes
    else:
        engine_ready = _TESS_OK and _PIL_OK
        ocr_fn = _ocr_image_bytes

    loop = asyncio.get_running_loop()
    if not engine_ready:
        ocr_status = "ocr_unavailable"
    else:
        # OCR is synchronous and slow; run it in a worker thread so the event
        # loop keeps serving other requests meanwhile.
        ocr_text, ocr_conf = await loop.run_in_executor(None, ocr_fn, data)
        ocr_status = "ocr_done" if ocr_text else "ocr_failed"

    extracted = _extract_fields(ocr_text or "")

    # Truncated text for response
    text_resp = (ocr_text or "")[:TEXT_CAP]

    payload: Dict[str, Any] = {
        "file_name": file.filename,
        "file_path": rel_path,
        "file_size": len(data),
        "mime": file.content_type or "application/octet-stream",
        "sha256": sha,
        "ocr_status": ocr_status,
        "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None,
        "ocr_text_full": ocr_text,
        "ocr_confidence": ocr_conf,
        "extracted": extracted,
        "ai_engine": "regex-v1",
    }

    # The insert uses a blocking database driver; keep it off the event loop too.
    inserted_id = await loop.run_in_executor(None, _maybe_insert_upload, payload)

    return JSONResponse(
        {
            "ok": True,
            "id": inserted_id,
            "file_path": rel_path,
            "file_name": file.filename,
            "file_size": len(data),
            "mime": payload["mime"],
            "sha256": sha,
            "ocr_status": ocr_status,
            "ocr_confidence": ocr_conf,
            "ocr_text": text_resp if ocr_text else None,
            "extracted": extracted,
        }
    )
|