Merge agent1-ocr: OCR u ERP/CRM

This commit is contained in:
Damir Radulić
2026-05-05 18:34:46 +02:00
4 changed files with 615 additions and 1 deletions
+403
View File
@@ -0,0 +1,403 @@
#!/usr/bin/env python3
# routers/ocr_router.py
# Name: PGŽ Sport OCR router (lightweight)
# Version: 1.0.0
# Authors: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date: 2026-05-05
# Description: FastAPI APIRouter exposing POST /api/ocr/upload and
# GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract
# (pdf2image for PDF), extracts vendor / OIB / invoice_no /
# date / amount via simple regex, persists into
# pgz_sport.invoice_uploads when possible. Designed to
# degrade gracefully if pytesseract / pdf2image are not
# installed (returns ocr_status='ocr_unavailable').
from __future__ import annotations
import os
import re
import io
import hashlib
import json
import traceback
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple, Dict, Any, List
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
import psycopg2
import psycopg2.extras
# ── Optional OCR deps ────────────────────────────────────────────────────────
# Each optional dependency is imported defensively: if the import fails for
# any reason the name is bound to None, and the matching availability flag is
# derived from whether the name ended up bound to something usable.
try:
    import pytesseract  # type: ignore
except Exception:
    pytesseract = None  # type: ignore
_TESS_OK = pytesseract is not None

try:
    from pdf2image import convert_from_bytes  # type: ignore
except Exception:
    convert_from_bytes = None  # type: ignore
_PDF2IMG_OK = convert_from_bytes is not None

try:
    from PIL import Image  # type: ignore
except Exception:
    Image = None  # type: ignore
_PIL_OK = Image is not None
# ── Config ───────────────────────────────────────────────────────────────────
# Database connection settings, passed verbatim to psycopg2.connect().
#
# NOTE(security): these values used to be hard-coded only. Each one can now
# be overridden through the environment (OCR_DB_HOST, OCR_DB_PORT,
# OCR_DB_NAME, OCR_DB_USER, OCR_DB_PASSWORD). The literals below are kept
# solely as backward-compatible fallbacks so existing deployments keep
# working unchanged.
# TODO(security): the fallback password has been committed to version
# control -- rotate it and remove the literal once OCR_DB_PASSWORD is set
# in every environment.
DB = dict(
    host=os.getenv("OCR_DB_HOST", "10.10.0.2"),
    port=int(os.getenv("OCR_DB_PORT", "6432")),
    dbname=os.getenv("OCR_DB_NAME", "rinet_v3"),
    user=os.getenv("OCR_DB_USER", "rinet"),
    password=os.getenv("OCR_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# Directory where uploaded originals are stored; created at import time.
UPLOAD_DIR = Path("/opt/pgz-sport/uploads/ocr")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Accepted file extensions (lower-case, including the leading dot).
ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"}

# Accepted content types for uploads.
ALLOWED_MIME = {
    "application/pdf",
    "image/jpeg",
    "image/jpg",
    "image/png",
}

MAX_BYTES = 25 * 1024 * 1024  # 25 MB
TEXT_CAP = 8 * 1024  # 8 KB cap for response text payload

router = APIRouter(prefix="/api/ocr", tags=["ocr"])
# ── DB helpers ───────────────────────────────────────────────────────────────
def _db():
    """Open a new psycopg2 connection using the module-level ``DB`` settings.

    Autocommit is switched on before the connection is handed back, so
    statements take effect without an explicit ``commit()``.

    Returns:
        A freshly opened psycopg2 connection.
    """
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def _table_columns(schema: str, table: str) -> List[str]:
try:
with _db() as c, c.cursor() as cur:
cur.execute(
"""
SELECT column_name FROM information_schema.columns
WHERE table_schema = %s AND table_name = %s
""",
(schema, table),
)
return [r[0] for r in cur.fetchall()]
except Exception:
return []
# ── Regex extractors ─────────────────────────────────────────────────────────
# "HR" (upper-case only) followed by optional whitespace and exactly 11
# digits; group 1 is the digit run.
RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b")
# Any standalone run of exactly 11 digits; group 1 is the digit run.
RE_OIB_BARE = re.compile(r"\b(\d{11})\b")
# A whole line containing the word "Račun"/"Racun"/"Invoice"
# (case-insensitive, multi-line mode) with at most 80 characters after the
# keyword up to the end of the line.
RE_INVOICE = re.compile(
r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$"
)
# dd.mm.yyyy or dd/mm/yyyy with two-digit day and month; groups are
# (day, month, year). The two separators are matched independently.
RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b")
# yyyy-mm-dd; groups are (year, month, day).
RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b")
# Amount candidates, always with exactly two decimal digits. Alternatives in
# order: 1.234,56 (dot or whitespace thousands separator, comma decimal),
# 1234,56, 1,234.56 and 1234.56. Group 1 is the raw amount text.
RE_AMOUNT = re.compile(
# Must not be preceded by a word character, '.' or ','.
r"(?<![\w.,])"
r"(\d{1,3}(?:[.\s]\d{3})+,\d{2}|\d+,\d{2}|\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})"
# Must not be followed by a word character.
r"(?![\w])"
)
def _norm_amount(raw: str) -> Optional[float]:
s = raw.strip().replace(" ", "")
# If both . and , present, assume , decimal if last separator is ,
if "," in s and "." in s:
if s.rfind(",") > s.rfind("."):
s = s.replace(".", "").replace(",", ".")
else:
s = s.replace(",", "")
elif "," in s:
# 1.234,56 or 1234,56 → swap
s = s.replace(".", "").replace(",", ".")
try:
return float(s)
except Exception:
return None
def _first_nonempty_line(text: str) -> Optional[str]:
for ln in (text or "").splitlines():
v = ln.strip()
if v:
return v[:200]
return None
def _parse_date(text: str) -> Optional[str]:
    """Find the first valid calendar date in ``text`` and return it as ISO.

    ISO-style ``yyyy-mm-dd`` candidates are tried first, then
    ``dd.mm.yyyy`` / ``dd/mm/yyyy`` candidates. Within each style every
    match is tried in order, so a nonsensical first hit (for example
    ``99.99.2026``) no longer hides a valid date further down the text.

    Args:
        text: OCR text to scan; ``None`` is treated as empty.

    Returns:
        The date as ``YYYY-MM-DD``, or ``None`` when no valid date is found.
    """
    haystack = text or ""
    # Each entry: (pattern, group numbers holding year, month, day).
    layouts = (
        (RE_DATE_YMD, (1, 2, 3)),
        (RE_DATE_DMY, (3, 2, 1)),
    )
    for pattern, (g_year, g_month, g_day) in layouts:
        for m in pattern.finditer(haystack):
            try:
                parsed = datetime(
                    int(m.group(g_year)),
                    int(m.group(g_month)),
                    int(m.group(g_day)),
                )
            except ValueError:
                # Digits matched the layout but are not a real date.
                continue
            return parsed.date().isoformat()
    return None
def _oib_checksum_ok(oib: str) -> bool:
    """Return True if ``oib`` is 11 digits with a valid ISO 7064 MOD 11,10 check digit."""
    if len(oib) != 11 or not oib.isdecimal():
        return False
    acc = 10
    for ch in oib[:10]:
        acc = (acc + int(ch)) % 10 or 10
        acc = (acc * 2) % 11
    # A computed control value of 10 is written as 0.
    return (11 - acc) % 10 == int(oib[10])


def _parse_oib(text: str) -> Optional[str]:
    """Extract an OIB (11-digit identifier) from OCR text.

    Candidates are collected in priority order: first the ones written with
    an ``HR`` prefix, then any bare 11-digit run. The first candidate whose
    check digit is valid wins, which filters out unrelated 11-digit numbers.
    If none passes the check (for example because OCR misread a digit), the
    first candidate is returned unchanged, matching the previous behaviour.

    Args:
        text: OCR text to scan; ``None`` is treated as empty.

    Returns:
        The 11-digit string, or ``None`` when the text has no candidate.
    """
    haystack = text or ""
    candidates = [m.group(1) for m in RE_OIB_HR.finditer(haystack)]
    candidates.extend(m.group(1) for m in RE_OIB_BARE.finditer(haystack))
    for candidate in candidates:
        if _oib_checksum_ok(candidate):
            return candidate
    return candidates[0] if candidates else None
def _parse_invoice_no(text: str) -> Optional[str]:
    """Extract an invoice number from the lines mentioning "Račun"/"Invoice".

    Every matching line is inspected in order. On each line the right-most
    token that contains at least one digit is taken as the invoice number,
    with trailing punctuation removed. Scanning all lines means a bare
    heading such as ``RAČUN`` no longer shadows a later
    ``Račun br. 2026-00123`` line.

    Args:
        text: OCR text to scan; ``None`` is treated as empty.

    Returns:
        The invoice token (at most 64 characters). If no matching line
        yields a digit-bearing token, the first matching line itself (at
        most 120 characters). ``None`` when no line matches at all.
    """
    first_line: Optional[str] = None
    for m in RE_INVOICE.finditer(text or ""):
        line = m.group(0).strip()
        if first_line is None:
            first_line = line
        tokens = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line)
        for token in reversed(tokens):
            # Sentence punctuation glued to the number ("123/2026.") is not
            # part of the identifier.
            token = token.rstrip("-/_.")
            # Skip pure words like "RAČUN"/"INVOICE".
            if any(ch.isdigit() for ch in token):
                return token[:64]
    return first_line[:120] if first_line is not None else None
def _parse_amount(text: str) -> Optional[float]:
    """Return the largest amount found in ``text``, or ``None`` if there is none.

    Every ``RE_AMOUNT`` match is normalised with ``_norm_amount``; matches
    that cannot be converted are ignored.
    """
    if not text:
        return None
    parsed = (_norm_amount(hit.group(1)) for hit in RE_AMOUNT.finditer(text))
    return max((value for value in parsed if value is not None), default=None)
def _extract_fields(text: str) -> Dict[str, Any]:
    """Run every field extractor over ``text`` and collect the results.

    Returns:
        A dict with the keys ``vendor``, ``oib``, ``invoice_no``, ``date``
        and ``amount``; each value is whatever the matching extractor
        returned (``None`` when nothing was found).
    """
    extractors = (
        ("vendor", _first_nonempty_line),
        ("oib", _parse_oib),
        ("invoice_no", _parse_invoice_no),
        ("date", _parse_date),
        ("amount", _parse_amount),
    )
    return {field: extract(text) for field, extract in extractors}
# ── OCR engine ───────────────────────────────────────────────────────────────
def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """Run Tesseract over an in-memory image.

    Args:
        data: Raw bytes of an image file readable by PIL.

    Returns:
        ``(text, confidence)``. ``text`` is the recognised text and
        ``confidence`` the mean of the non-negative per-entry confidence
        values reported by Tesseract, rounded to two decimals (``None`` when
        it could not be determined). ``(None, None)`` when pytesseract/PIL
        are unavailable or when opening/recognising the image fails.
    """
    if not (_TESS_OK and _PIL_OK):
        return None, None
    lang = os.getenv("OCR_LANG", "hrv+eng")
    try:
        # The context manager releases the image's resources when done.
        with Image.open(io.BytesIO(data)) as img:
            img.load()
            text = pytesseract.image_to_string(img, lang=lang)
            # Confidence is best-effort: failing to compute it must not
            # discard the recognised text.
            conf: Optional[float] = None
            try:
                d = pytesseract.image_to_data(
                    img, output_type=pytesseract.Output.DICT, lang=lang
                )
                scores: List[float] = []
                for raw in d.get("conf", []):
                    # Values may arrive as ints, floats or strings such as
                    # "95.3"; parse via float() so fractional values are
                    # not silently dropped.
                    try:
                        value = float(raw)
                    except (TypeError, ValueError):
                        continue
                    # Negative values carry no confidence and are skipped.
                    if value >= 0:
                        scores.append(value)
                if scores:
                    conf = round(sum(scores) / len(scores), 2)
            except Exception:
                conf = None
        return text, conf
    except Exception:
        # Deliberately broad: any decoding/OCR failure degrades to
        # "no result" so the endpoint can report ocr_failed.
        return None, None
def _ocr_pdf_bytes(data: bytes, max_pages: int = 8) -> Tuple[Optional[str], Optional[float]]:
    """Rasterise a PDF and run Tesseract over its first pages.

    Args:
        data: Raw bytes of the PDF file.
        max_pages: Maximum number of leading pages to process (default 8).

    Returns:
        ``(text, confidence)``. ``text`` is the per-page text joined with a
        form-feed separator; ``confidence`` is the mean of the per-page mean
        confidences, rounded to two decimals (``None`` if unavailable).
        ``(None, None)`` when pytesseract/pdf2image are unavailable, the PDF
        cannot be rasterised, or it has no pages. ``text`` is ``None`` when
        every page failed.
    """
    if not (_TESS_OK and _PDF2IMG_OK):
        return None, None
    lang = os.getenv("OCR_LANG", "hrv+eng")
    try:
        # last_page stops rasterisation at the cap, so a very long PDF is
        # not rendered in full only to be discarded afterwards.
        pages = convert_from_bytes(data, dpi=200, fmt="png", last_page=max_pages)
    except Exception:
        return None, None
    if not pages:
        return None, None
    out: List[str] = []
    confs: List[float] = []
    for page in pages[:max_pages]:
        try:
            out.append(pytesseract.image_to_string(page, lang=lang))
        except Exception:
            # A page that cannot be recognised is skipped; others still count.
            continue
        # Confidence is best-effort and never invalidates the page's text.
        try:
            d = pytesseract.image_to_data(
                page, output_type=pytesseract.Output.DICT, lang=lang
            )
            scores: List[float] = []
            for raw in d.get("conf", []):
                # Values may arrive as ints, floats or strings such as
                # "95.3"; parse via float() so they are not dropped.
                try:
                    value = float(raw)
                except (TypeError, ValueError):
                    continue
                # Negative values carry no confidence and are skipped.
                if value >= 0:
                    scores.append(value)
            if scores:
                confs.append(sum(scores) / len(scores))
        except Exception:
            pass
    text = "\n\f\n".join(out) if out else None
    conf = round(sum(confs) / len(confs), 2) if confs else None
    return text, conf
# ── Persistence ──────────────────────────────────────────────────────────────
def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]:
    """Insert one row into ``pgz_sport.invoice_uploads``, best-effort.

    Only columns that actually exist in the table and whose value is not
    ``None`` are written, so the function adapts to the deployed schema.

    Args:
        payload: Upload description as built by the ``upload`` endpoint
            (file metadata, OCR result and the ``extracted`` field dict).

    Returns:
        The ``id`` of the inserted row, or ``None`` when the table is not
        visible, nothing could be written, or the insert failed.
    """
    cols = set(_table_columns("pgz_sport", "invoice_uploads"))
    if not cols:
        return None
    extracted = payload.get("extracted") or {}
    # Map our payload keys to potential DB columns.
    candidates: Dict[str, Any] = {
        "file_name": payload.get("file_name"),
        "file_path": payload.get("file_path"),
        "file_size": payload.get("file_size"),
        "mime": payload.get("mime"),
        "sha256": payload.get("sha256"),
        "ocr_status": payload.get("ocr_status"),
        "ocr_engine": payload.get("ocr_engine"),
        "ocr_text": payload.get("ocr_text_full"),
        "ocr_confidence": payload.get("ocr_confidence"),
        "ai_invoice_no": extracted.get("invoice_no"),
        "ai_invoice_date": extracted.get("date"),
        "ai_vendor_name": extracted.get("vendor"),
        "ai_vendor_oib": extracted.get("oib"),
        "ai_amount_gross": extracted.get("amount"),
        "ai_engine": payload.get("ai_engine") or "regex-v1",
        "ai_extracted": json.dumps(extracted),
    }
    present = [(k, v) for k, v in candidates.items() if k in cols and v is not None]
    if not present:
        return None
    insert_cols = [k for k, _ in present]
    insert_vals = [v for _, v in present]
    # Column names come exclusively from the hard-coded keys above, never
    # from user input; values are passed as bound parameters.
    sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format(
        c=", ".join(insert_cols),
        p=", ".join(["%s"] * len(insert_cols)),
    )
    conn = None
    try:
        conn = _db()
        with conn.cursor() as cur:
            cur.execute(sql, insert_vals)
            row = cur.fetchone()
        return int(row[0]) if row else None
    except Exception as e:
        print(f"[ocr_router] insert failed: {e}")
        return None
    finally:
        # psycopg2's ``with connection`` does not close the connection;
        # close it explicitly so each upload does not leak one.
        if conn is not None:
            conn.close()
# ── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/health")
def health():
    """Report whether the OCR toolchain is importable and where uploads go.

    Returns:
        A dict with ``ok`` (always ``True``), ``tesseract_available``
        (pytesseract and PIL both imported), ``pdf2image_available`` and
        ``upload_dir``.
    """
    tesseract_ready = bool(_TESS_OK and _PIL_OK)
    pdf_ready = bool(_PDF2IMG_OK)
    status = {
        "ok": True,
        "tesseract_available": tesseract_ready,
        "pdf2image_available": pdf_ready,
        "upload_dir": str(UPLOAD_DIR),
    }
    return status
@router.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Accept a PDF/JPG/PNG upload, OCR it and return the extracted fields.

    The file is stored under ``UPLOAD_DIR`` using its SHA-256 digest as the
    name (identical content is stored once), run through Tesseract when the
    OCR stack is available, parsed with the regex extractors and, when
    possible, recorded in the database.

    Args:
        file: The multipart upload.

    Returns:
        A JSON response with file metadata, ``ocr_status``
        (``ocr_done`` / ``ocr_failed`` / ``ocr_unavailable``), the OCR
        confidence, the text truncated to ``TEXT_CAP`` characters and the
        ``extracted`` field dict. ``id`` is the database row id or ``null``.

    Raises:
        HTTPException: 400 for a missing/empty file or a disallowed
            extension, 415 for a disallowed content type, 413 when the file
            exceeds ``MAX_BYTES``, 500 when the file cannot be stored.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # hard dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if not file or not file.filename:
        raise HTTPException(400, "no file")

    # Validate extension.
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXT:
        raise HTTPException(400, f"extension not allowed: {ext}")

    # Validate the declared content type. A missing or generic type is
    # tolerated (some clients send application/octet-stream for every
    # upload); anything specific must be on the allow-list.
    declared = (file.content_type or "").split(";", 1)[0].strip().lower()
    if declared and declared != "application/octet-stream" and declared not in ALLOWED_MIME:
        raise HTTPException(415, f"content type not allowed: {declared}")

    # Bounded read: one byte beyond the limit is enough to detect an
    # oversized upload without pulling the whole body into memory.
    data = await file.read(MAX_BYTES + 1)
    if not data:
        raise HTTPException(400, "empty file")
    if len(data) > MAX_BYTES:
        raise HTTPException(413, f"file too large: limit is {MAX_BYTES} bytes")

    sha = hashlib.sha256(data).hexdigest()
    save_name = f"{sha}{ext}"
    abs_path = UPLOAD_DIR / save_name
    if not abs_path.exists():
        try:
            # Disk I/O is blocking; keep it off the event loop.
            await run_in_threadpool(abs_path.write_bytes, data)
        except OSError as e:
            raise HTTPException(500, f"could not persist file: {e}") from e
    rel_path = f"uploads/ocr/{save_name}"

    # Run OCR. Tesseract is CPU-bound and blocking, so it runs in the
    # thread pool to keep other requests responsive.
    ocr_text: Optional[str] = None
    ocr_conf: Optional[float] = None
    ocr_engine = "tesseract"
    if ext == ".pdf":
        engine_ready = _TESS_OK and _PDF2IMG_OK and _PIL_OK
        ocr_fn = _ocr_pdf_bytes
    else:
        engine_ready = _TESS_OK and _PIL_OK
        ocr_fn = _ocr_image_bytes
    if not engine_ready:
        ocr_status = "ocr_unavailable"
    else:
        ocr_text, ocr_conf = await run_in_threadpool(ocr_fn, data)
        ocr_status = "ocr_done" if ocr_text else "ocr_failed"

    extracted = _extract_fields(ocr_text or "")

    # Truncated text for the response; the full text goes to the database.
    text_resp = (ocr_text or "")[:TEXT_CAP]

    payload: Dict[str, Any] = {
        "file_name": file.filename,
        "file_path": rel_path,
        "file_size": len(data),
        "mime": file.content_type or "application/octet-stream",
        "sha256": sha,
        "ocr_status": ocr_status,
        "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None,
        "ocr_text_full": ocr_text,
        "ocr_confidence": ocr_conf,
        "extracted": extracted,
        "ai_engine": "regex-v1",
    }
    # The psycopg2 calls are blocking as well.
    inserted_id = await run_in_threadpool(_maybe_insert_upload, payload)

    return JSONResponse(
        {
            "ok": True,
            "id": inserted_id,
            "file_path": rel_path,
            "file_name": file.filename,
            "file_size": len(data),
            "mime": payload["mime"],
            "sha256": sha,
            "ocr_status": ocr_status,
            "ocr_confidence": ocr_conf,
            "ocr_text": text_resp if ocr_text else None,
            "extracted": extracted,
        }
    )