Merge agent1-ocr: OCR u ERP/CRM
This commit is contained in:
@@ -0,0 +1,403 @@
|
||||
#!/usr/bin/env python3
|
||||
# routers/ocr_router.py
|
||||
# Name: PGŽ Sport OCR router (lightweight)
|
||||
# Version: 1.0.0
|
||||
# Authors: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
# Date: 2026-05-05
|
||||
# Description: FastAPI APIRouter exposing POST /api/ocr/upload and
|
||||
# GET /api/ocr/health. Accepts PDF/JPG/PNG, runs Tesseract
|
||||
# (pdf2image for PDF), extracts vendor / OIB / invoice_no /
|
||||
# date / amount via simple regex, persists into
|
||||
# pgz_sport.invoice_uploads when possible. Designed to
|
||||
# degrade gracefully if pytesseract / pdf2image are not
|
||||
# installed (returns ocr_status='ocr_unavailable').
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import io
|
||||
import hashlib
|
||||
import json
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional, Tuple, Dict, Any, List
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
# ── Optional OCR deps ────────────────────────────────────────────────────────
|
||||
# Each optional dependency is probed on its own.  When an import fails for any
# reason the imported name is bound to None and its availability flag is False,
# so the rest of the module can test the flag instead of importing again.
try:
    import pytesseract  # type: ignore
except Exception:
    pytesseract = None  # type: ignore
    _TESS_OK = False
else:
    _TESS_OK = True

try:
    from pdf2image import convert_from_bytes  # type: ignore
except Exception:
    convert_from_bytes = None  # type: ignore
    _PDF2IMG_OK = False
else:
    _PDF2IMG_OK = True

try:
    from PIL import Image  # type: ignore
except Exception:
    Image = None  # type: ignore
    _PIL_OK = False
else:
    _PIL_OK = True
|
||||
|
||||
# ── Config ───────────────────────────────────────────────────────────────────
|
||||
# PostgreSQL connection parameters, expanded into psycopg2.connect(**DB).
# Every value can be overridden through the environment (the same mechanism
# this module uses for OCR_LANG).  The literals are the previously hard-coded
# values, kept as fallbacks so existing deployments behave exactly as before.
# SECURITY NOTE(review): a password committed to source control should be
# treated as exposed -- set OCR_DB_PASSWORD, rotate the credential, and then
# remove the literal fallback below.
DB = dict(
    host=os.getenv("OCR_DB_HOST", "10.10.0.2"),
    port=int(os.getenv("OCR_DB_PORT", "6432")),
    dbname=os.getenv("OCR_DB_NAME", "rinet_v3"),
    user=os.getenv("OCR_DB_USER", "rinet"),
    password=os.getenv("OCR_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
|
||||
|
||||
# Directory that receives uploaded files.  It is created (including parents)
# as a side effect of importing this module.
UPLOAD_DIR = Path("/opt/pgz-sport/uploads/ocr")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Lower-case file extensions (with leading dot) accepted by the upload endpoint.
ALLOWED_EXT = {".pdf", ".jpg", ".jpeg", ".png"}
# MIME types corresponding to ALLOWED_EXT.
# NOTE(review): nothing in the visible code reads ALLOWED_MIME -- confirm
# whether content-type validation was intended.
ALLOWED_MIME = {
    "application/pdf",
    "image/jpeg",
    "image/jpg",
    "image/png",
}
MAX_BYTES = 25 * 1024 * 1024  # 25 MB
# The upload endpoint applies this cap by slicing the OCR text, i.e. it counts
# characters rather than encoded bytes.
TEXT_CAP = 8 * 1024  # 8 KB cap for response text payload

# All routes declared on this router are served under the /api/ocr prefix.
router = APIRouter(prefix="/api/ocr", tags=["ocr"])
|
||||
|
||||
|
||||
# ── DB helpers ───────────────────────────────────────────────────────────────
|
||||
def _db():
    """Open a new psycopg2 connection from the ``DB`` settings, in autocommit mode.

    Returns:
        The freshly opened connection; the caller owns it.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
||||
|
||||
|
||||
def _table_columns(schema: str, table: str) -> List[str]:
    """Return the column names of ``schema.table`` from information_schema.

    This is a best-effort lookup: any failure (connection, query, ...) is
    logged and reported as an empty list, which callers treat as "table not
    available".

    Args:
        schema: Schema name, passed as a bound query parameter.
        table: Table name, passed as a bound query parameter.

    Returns:
        The column names, or ``[]`` when the table has no visible columns or
        the lookup failed.
    """
    # Local import keeps this function self-contained.
    from contextlib import closing

    try:
        # psycopg2's ``with connection`` only ends the transaction; closing()
        # makes sure the connection itself is released on every path.
        with closing(_db()) as conn, conn.cursor() as cur:
            cur.execute(
                """
                SELECT column_name FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                """,
                (schema, table),
            )
            return [row[0] for row in cur.fetchall()]
    except Exception as e:
        # Still best-effort, but no longer silent (same log style as the
        # insert helper in this module).
        print(f"[ocr_router] column lookup failed: {e}")
        return []
|
||||
|
||||
|
||||
# ── Regex extractors ─────────────────────────────────────────────────────────
|
||||
# OIB: exactly 11 digits, either prefixed with "HR" (optional whitespace
# between prefix and digits) or standing alone as a whole word.
RE_OIB_HR = re.compile(r"\bHR\s*(\d{11})\b")
RE_OIB_BARE = re.compile(r"\b(\d{11})\b")

# A whole line containing the word "Račun"/"Racun"/"Invoice" (case-insensitive,
# multi-line mode), with at most 80 further characters after the keyword.
RE_INVOICE = re.compile(r"(?im)^.*\b(?:Ra[čc]un|Invoice)\b[^\n\r]{0,80}$")

# Dates: DD.MM.YYYY or DD/MM/YYYY (two-digit day and month), and YYYY-MM-DD.
RE_DATE_DMY = re.compile(r"\b(\d{2})[./](\d{2})[./](\d{4})\b")
RE_DATE_YMD = re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b")

# Amount candidates, always with exactly two decimals:
#   1.234,56 / 1 234,56   grouped by dot or whitespace, decimal comma
#   1234,56               ungrouped, decimal comma
#   1,234.56              grouped by comma, decimal dot
#   1234.56               ungrouped, decimal dot
# The lookbehind rejects a candidate glued to a preceding word character, dot
# or comma; the lookahead rejects one followed by a word character.
RE_AMOUNT = re.compile(
    r"(?<![\w.,])"
    r"(\d{1,3}(?:[.\s]\d{3})+,\d{2}|\d+,\d{2}|\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})"
    r"(?![\w])"
)
|
||||
|
||||
|
||||
def _norm_amount(raw: str) -> Optional[float]:
    """Convert an amount string in European or English notation to a float.

    Handles ``1.234,56``, ``1 234,56``, ``1234,56``, ``1,234.56`` and
    ``1234.56``.  When both separators are present, the right-most one is
    taken as the decimal separator.

    Args:
        raw: The amount text as matched in the OCR output.

    Returns:
        The parsed value, or ``None`` if the cleaned text is not a number.
    """
    # Remove every whitespace character, not only U+0020: the amount pattern
    # accepts any ``\s`` as a thousands separator, so tabs or no-break spaces
    # can reach this function and would otherwise make float() fail.
    s = "".join(raw.split())
    has_comma = "," in s
    if has_comma and "." in s:
        if s.rfind(",") > s.rfind("."):
            # 1.234,56 -> dots group thousands, comma is the decimal mark
            s = s.replace(".", "").replace(",", ".")
        else:
            # 1,234.56 -> commas group thousands
            s = s.replace(",", "")
    elif has_comma:
        # 1234,56 -> decimal comma (there is no dot in this branch)
        s = s.replace(",", ".")
    try:
        return float(s)
    except ValueError:
        return None
|
||||
|
||||
|
||||
def _first_nonempty_line(text: str) -> Optional[str]:
    """Return the first line of *text* that is not blank, stripped and cut to 200 chars.

    Returns ``None`` when *text* is empty/``None`` or contains only blank lines.
    """
    stripped_lines = (ln.strip() for ln in (text or "").splitlines())
    first = next((ln for ln in stripped_lines if ln), None)
    if first is None:
        return None
    return first[:200]
|
||||
|
||||
|
||||
def _parse_date(text: str) -> Optional[str]:
    """Return the first valid calendar date found in *text* as ``YYYY-MM-DD``.

    ISO dates (``YYYY-MM-DD``) take precedence over ``DD.MM.YYYY`` /
    ``DD/MM/YYYY`` dates, as before.  Unlike the previous version, every match
    of a pattern is tried in order, so a digit group that looks like a date
    but is not one (e.g. ``2026-13-45``) no longer hides a valid date that
    appears later in the text.

    Args:
        text: OCR text; ``None`` or empty text yields ``None``.

    Returns:
        ISO-formatted date string, or ``None`` when no valid date is found.
    """
    haystack = text or ""
    # (pattern, regex group numbers holding year, month, day)
    layouts = (
        (RE_DATE_YMD, (1, 2, 3)),
        (RE_DATE_DMY, (3, 2, 1)),
    )
    for pattern, (g_year, g_month, g_day) in layouts:
        for m in pattern.finditer(haystack):
            try:
                return datetime(
                    int(m.group(g_year)), int(m.group(g_month)), int(m.group(g_day))
                ).date().isoformat()
            except ValueError:
                # Out-of-range month/day/year: not a real date, try the next match.
                continue
    return None
|
||||
|
||||
|
||||
def _parse_oib(text: str) -> Optional[str]:
    """Return the first 11-digit OIB candidate in *text*, or ``None``.

    An ``HR``-prefixed number is preferred; a bare 11-digit number is only
    used when no prefixed one exists.
    """
    haystack = text or ""
    for pattern in (RE_OIB_HR, RE_OIB_BARE):
        hit = pattern.search(haystack)
        if hit:
            return hit.group(1)
    return None
|
||||
|
||||
|
||||
def _parse_invoice_no(text: str) -> Optional[str]:
    """Guess the invoice number from the first line mentioning "Račun"/"Invoice".

    Returns the right-most token on that line that contains at least one digit
    (cut to 64 characters).  If no such token exists, the whole line (cut to
    120 characters) is returned.  ``None`` when no matching line is found.
    """
    hit = RE_INVOICE.search(text or "")
    if hit is None:
        return None

    line = hit.group(0).strip()
    # Tokens made of upper-case letters, digits and - / _ . (2 to 41 characters).
    tokens = re.findall(r"[A-Z0-9][A-Z0-9\-/_.]{1,40}", line)
    # Scanning from the right skips pure words and picks the last id-like token.
    numbered = next(
        (tok for tok in reversed(tokens) if any(ch.isdigit() for ch in tok)),
        None,
    )
    if numbered is not None:
        return numbered[:64]
    return line[:120]
|
||||
|
||||
|
||||
def _parse_amount(text: str) -> Optional[float]:
    """Return the largest amount found in *text*, or ``None`` if there is none.

    Every ``RE_AMOUNT`` match is normalised with ``_norm_amount``; candidates
    that fail to parse are ignored.
    """
    if not text:
        return None
    parsed = (_norm_amount(hit.group(1)) for hit in RE_AMOUNT.finditer(text))
    return max((value for value in parsed if value is not None), default=None)
|
||||
|
||||
|
||||
def _extract_fields(text: str) -> Dict[str, Any]:
    """Run every field extractor over *text* and collect the results.

    Returns:
        Dict with the keys ``vendor``, ``oib``, ``invoice_no``, ``date`` and
        ``amount`` (in that order); a value is ``None`` when nothing was found.
    """
    extractors = (
        ("vendor", _first_nonempty_line),
        ("oib", _parse_oib),
        ("invoice_no", _parse_invoice_no),
        ("date", _parse_date),
        ("amount", _parse_amount),
    )
    return {field: extract(text) for field, extract in extractors}
|
||||
|
||||
|
||||
# ── OCR engine ───────────────────────────────────────────────────────────────
|
||||
def _ocr_image_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """OCR a single raster image with Tesseract.

    Args:
        data: Raw image file content (whatever PIL can open).

    Returns:
        ``(text, confidence)``.  ``text`` is the recognised text and
        ``confidence`` the mean of the non-negative per-row confidences,
        rounded to two decimals (``None`` when it cannot be determined).
        ``(None, None)`` when pytesseract/PIL are unavailable or the image
        cannot be opened or recognised.
    """
    if not (_TESS_OK and _PIL_OK):
        return None, None

    lang = os.getenv("OCR_LANG", "hrv+eng")
    try:
        img = Image.open(io.BytesIO(data))
        img.load()
        text = pytesseract.image_to_string(img, lang=lang)

        # Confidence is best-effort: a failure here must not discard the text.
        conf = None
        try:
            d = pytesseract.image_to_data(
                img, output_type=pytesseract.Output.DICT, lang=lang
            )
            scores: List[float] = []
            for raw in d.get("conf", []):
                # NOTE(review): depending on the pytesseract/Tesseract version
                # the values may be ints, floats or strings such as '96.06';
                # the previous isdigit() filter silently dropped the float
                # forms -- confirm against the installed version.
                try:
                    score = float(raw)
                except (TypeError, ValueError):
                    continue
                # Negative values are excluded from the mean, as before.
                if score >= 0:
                    scores.append(score)
            if scores:
                conf = round(sum(scores) / len(scores), 2)
        except Exception:
            pass
        return text, conf
    except Exception:
        return None, None
|
||||
|
||||
|
||||
def _ocr_pdf_bytes(data: bytes) -> Tuple[Optional[str], Optional[float]]:
    """Rasterise a PDF and OCR its first pages with Tesseract.

    Args:
        data: Raw PDF file content.

    Returns:
        ``(text, confidence)``.  ``text`` joins the per-page texts with a
        form feed on its own line; ``confidence`` is the mean of the per-page
        mean confidences, rounded to two decimals (``None`` when unknown).
        ``(None, None)`` when pytesseract/pdf2image are unavailable, the PDF
        cannot be converted, or it has no pages.
    """
    if not (_TESS_OK and _PDF2IMG_OK):
        return None, None

    max_pages = 8  # cap to 8 pages
    try:
        # Only rasterise the pages that will actually be OCR'd instead of
        # converting the whole document and discarding the rest.
        pages = convert_from_bytes(data, dpi=200, fmt="png", last_page=max_pages)
    except Exception:
        return None, None
    if not pages:
        return None, None

    lang = os.getenv("OCR_LANG", "hrv+eng")
    out: List[str] = []
    confs: List[float] = []
    for page in pages[:max_pages]:
        try:
            out.append(pytesseract.image_to_string(page, lang=lang))
        except Exception:
            # A page that cannot be recognised is skipped; the others still count.
            continue

        # Confidence is best-effort: a failure here keeps the page text.
        try:
            d = pytesseract.image_to_data(
                page, output_type=pytesseract.Output.DICT, lang=lang
            )
            scores: List[float] = []
            for raw in d.get("conf", []):
                # NOTE(review): depending on the pytesseract/Tesseract version
                # the values may be ints, floats or strings such as '96.06';
                # the previous isdigit() filter silently dropped the float
                # forms -- confirm against the installed version.
                try:
                    score = float(raw)
                except (TypeError, ValueError):
                    continue
                # Negative values are excluded from the mean, as before.
                if score >= 0:
                    scores.append(score)
            if scores:
                confs.append(sum(scores) / len(scores))
        except Exception:
            pass

    text = "\n\f\n".join(out) if out else None
    conf = round(sum(confs) / len(confs), 2) if confs else None
    return text, conf
|
||||
|
||||
|
||||
# ── Persistence ──────────────────────────────────────────────────────────────
|
||||
def _maybe_insert_upload(payload: Dict[str, Any]) -> Optional[int]:
    """Insert into pgz_sport.invoice_uploads — only writes columns that exist.

    The table's columns are looked up first; only payload values that are not
    ``None`` and whose target column exists are written.  Persistence is
    best-effort: a missing table or a failed insert is reported as ``None``
    rather than raised.

    Args:
        payload: Upload description as built by the upload endpoint
            (``file_name``, ``sha256``, ``ocr_status``, ``extracted``, ...).

    Returns:
        The ``id`` returned by the insert, or ``None`` when nothing was
        inserted.
    """
    # Local import keeps this function self-contained.
    from contextlib import closing

    cols = set(_table_columns("pgz_sport", "invoice_uploads"))
    if not cols:
        return None

    extracted = payload.get("extracted") or {}

    # Map our payload keys to potential DB columns
    candidates: Dict[str, Any] = {
        "file_name": payload.get("file_name"),
        "file_path": payload.get("file_path"),
        "file_size": payload.get("file_size"),
        "mime": payload.get("mime"),
        "sha256": payload.get("sha256"),
        "ocr_status": payload.get("ocr_status"),
        "ocr_engine": payload.get("ocr_engine"),
        "ocr_text": payload.get("ocr_text_full"),
        "ocr_confidence": payload.get("ocr_confidence"),
        "ai_invoice_no": extracted.get("invoice_no"),
        "ai_invoice_date": extracted.get("date"),
        "ai_vendor_name": extracted.get("vendor"),
        "ai_vendor_oib": extracted.get("oib"),
        "ai_amount_gross": extracted.get("amount"),
        "ai_engine": payload.get("ai_engine") or "regex-v1",
        "ai_extracted": json.dumps(extracted),
    }

    insert_cols = [k for k, v in candidates.items() if k in cols and v is not None]
    if not insert_cols:
        return None
    insert_vals = [candidates[k] for k in insert_cols]

    # Column names come from the fixed mapping above, never from user input;
    # all values are bound as query parameters.
    sql = "INSERT INTO pgz_sport.invoice_uploads ({c}) VALUES ({p}) RETURNING id".format(
        c=", ".join(insert_cols),
        p=", ".join(["%s"] * len(insert_cols)),
    )
    try:
        # psycopg2's ``with connection`` only ends the transaction; closing()
        # makes sure the connection itself is released on every path.
        with closing(_db()) as conn, conn.cursor() as cur:
            cur.execute(sql, insert_vals)
            row = cur.fetchone()
            return int(row[0]) if row else None
    except Exception as e:
        print(f"[ocr_router] insert failed: {e}")
        return None
|
||||
|
||||
|
||||
# ── Endpoints ────────────────────────────────────────────────────────────────
|
||||
@router.get("/health")
def health():
    """Report which optional OCR dependencies were importable and where uploads are stored."""
    status = {"ok": True}
    # Image OCR needs both pytesseract and PIL.
    status["tesseract_available"] = bool(_TESS_OK and _PIL_OK)
    status["pdf2image_available"] = bool(_PDF2IMG_OK)
    status["upload_dir"] = str(UPLOAD_DIR)
    return status
|
||||
|
||||
|
||||
@router.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Store an uploaded invoice file, OCR it and return the extracted fields.

    Steps: validate the file extension, read the body (at most
    ``MAX_BYTES``), save it under ``UPLOAD_DIR`` named by its SHA-256 digest,
    run OCR (PDF or image path), extract fields with the regex helpers and try
    to persist a row via ``_maybe_insert_upload``.

    Blocking work (file write, OCR, database insert) runs in the worker
    thread pool so the event loop stays responsive.

    Args:
        file: The multipart upload.

    Returns:
        JSONResponse with file metadata, ``ocr_status`` (``ocr_done``,
        ``ocr_failed`` or ``ocr_unavailable``), the OCR text cut to
        ``TEXT_CAP`` characters, the extracted fields and the inserted row
        ``id`` (``None`` when nothing was persisted).

    Raises:
        HTTPException: 400 for a missing file, a disallowed extension or an
            empty body; 413 when the body exceeds ``MAX_BYTES``; 500 when the
            file cannot be written.
    """
    # Local import keeps this block self-contained.
    from fastapi.concurrency import run_in_threadpool

    if not file or not file.filename:
        raise HTTPException(400, "no file")

    # Validate extension.
    # NOTE(review): the content type is not checked against ALLOWED_MIME here;
    # enforcing it could reject clients that send a generic type -- confirm
    # before adding.
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXT:
        raise HTTPException(400, f"extension not allowed: {ext}")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading an arbitrarily large body into memory.
    data = await file.read(MAX_BYTES + 1)
    if not data:
        raise HTTPException(400, "empty file")
    if len(data) > MAX_BYTES:
        raise HTTPException(413, f"file too large: limit is {MAX_BYTES} bytes")

    # Content-addressed storage: identical uploads share one file on disk.
    sha = hashlib.sha256(data).hexdigest()
    save_name = f"{sha}{ext}"
    abs_path = UPLOAD_DIR / save_name
    if not abs_path.exists():
        try:
            await run_in_threadpool(abs_path.write_bytes, data)
        except Exception as e:
            raise HTTPException(500, f"could not persist file: {e}") from e

    rel_path = f"uploads/ocr/{save_name}"

    # Run OCR
    ocr_text: Optional[str] = None
    ocr_conf: Optional[float] = None
    ocr_engine = "tesseract"
    if ext == ".pdf":
        ocr_fn = _ocr_pdf_bytes
        deps_ok = bool(_TESS_OK and _PDF2IMG_OK and _PIL_OK)
    else:
        ocr_fn = _ocr_image_bytes
        deps_ok = bool(_TESS_OK and _PIL_OK)

    if not deps_ok:
        ocr_status = "ocr_unavailable"
    else:
        ocr_text, ocr_conf = await run_in_threadpool(ocr_fn, data)
        ocr_status = "ocr_done" if ocr_text else "ocr_failed"

    extracted = _extract_fields(ocr_text or "")

    # Truncated text for response
    text_resp = (ocr_text or "")[:TEXT_CAP]

    payload: Dict[str, Any] = {
        "file_name": file.filename,
        "file_path": rel_path,
        "file_size": len(data),
        "mime": file.content_type or "application/octet-stream",
        "sha256": sha,
        "ocr_status": ocr_status,
        "ocr_engine": ocr_engine if ocr_status == "ocr_done" else None,
        "ocr_text_full": ocr_text,
        "ocr_confidence": ocr_conf,
        "extracted": extracted,
        "ai_engine": "regex-v1",
    }

    inserted_id = await run_in_threadpool(_maybe_insert_upload, payload)

    return JSONResponse(
        {
            "ok": True,
            "id": inserted_id,
            "file_path": rel_path,
            "file_name": file.filename,
            "file_size": len(data),
            "mime": payload["mime"],
            "sha256": sha,
            "ocr_status": ocr_status,
            "ocr_confidence": ocr_conf,
            "ocr_text": text_resp if ocr_text else None,
            "extracted": extracted,
        }
    )
|
||||
Reference in New Issue
Block a user