PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""
+OCR worker daemon — pgz_sport.invoice_uploads
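+Polls pending uploads → text extraction (pdftotext, falling back to pdftoppm + tesseract OCR) → regex field extraction
+                  → (optional LLM step: not implemented in this file)
+                  → updates ai_invoice_no, ai_invoice_date, ai_vendor_name, ai_vendor_oib, ai_amount_gross, ai_extracted, ocr_text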
+                  → flips ocr_status to 'done' or 'failed'
+"""
+import datetime
+import hashlib
+import json
+import os
+import re
+import subprocess
+import tempfile
+import time
+import traceback
+from pathlib import Path
+
+import psycopg2
+import psycopg2.extras
+
+# Connection settings for psycopg2.connect(). Every value can be overridden through the
+# environment so a deployment does not have to edit this file; the literals are kept as
+# defaults so existing installations keep working unchanged.
+# NOTE(review): the password literal is committed in source — rotate it, supply
+# OCR_DB_PASSWORD from the environment / a secret store, then drop the literal default.
+DB = dict(
+    host=os.environ.get('OCR_DB_HOST', 'localhost'),
+    port=int(os.environ.get('OCR_DB_PORT', '5432')),
+    dbname=os.environ.get('OCR_DB_NAME', 'rinet_v3'),
+    user=os.environ.get('OCR_DB_USER', 'rinet'),
+    password=os.environ.get('OCR_DB_PASSWORD', 'R1net2026!SecureDB#v7'),
+)
+# Seconds to sleep between polls when no pending upload was found.
+POLL = int(os.environ.get('OCR_POLL_SECONDS', '8'))
+
+def db():
+    """Open a new PostgreSQL connection from the DB settings and switch it to autocommit."""
+    conn = psycopg2.connect(**DB)
+    conn.autocommit = True
+    return conn
+
+def claim_one():
+    """Claim the oldest pending upload and mark it 'processing'.
+
+    The row is picked by uploaded_at order with FOR UPDATE SKIP LOCKED, so a row
+    that another session currently has locked is skipped instead of waited on.
+
+    Returns the updated row as a dict-like RealDictCursor record, or None when
+    no pending row could be claimed.
+    """
+    conn = db()
+    try:
+        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            cur.execute("""UPDATE pgz_sport.invoice_uploads
+                       SET ocr_status='processing', processed_at=NOW()
+                       WHERE id = (SELECT id FROM pgz_sport.invoice_uploads
+                                   WHERE ocr_status='pending'
+                                   ORDER BY uploaded_at LIMIT 1 FOR UPDATE SKIP LOCKED)
+                       RETURNING *""")
+            return cur.fetchone()
+    finally:
+        # psycopg2's `with connection:` only ends the transaction; the connection
+        # itself has to be closed explicitly or it lingers until garbage collection.
+        conn.close()
+
+def update_done(uid, fields):
+    """Write `fields` (column name → value) onto the invoice_uploads row with id `uid`.
+
+    Column names are interpolated into the SQL text, so the keys of `fields`
+    must be trusted, hard-coded identifiers; the values are passed as bound
+    parameters. An empty `fields` mapping is a no-op (it would otherwise
+    produce an invalid `SET  WHERE` statement).
+
+    Raises whatever psycopg2 raises when the statement fails.
+    """
+    if not fields:
+        return
+    assignments = ','.join(f"{col}=%s" for col in fields)
+    params = [*fields.values(), uid]
+    conn = db()
+    try:
+        with conn.cursor() as cur:
+            cur.execute(f"UPDATE pgz_sport.invoice_uploads SET {assignments} WHERE id=%s", params)
+    finally:
+        # `with connection:` would not close it; release the connection explicitly.
+        conn.close()
+
+def fail(uid, err):
+    """Flag upload `uid` as failed and store the first 500 characters of `err` as JSON in ai_extracted."""
+    error_json = json.dumps({'error': err[:500]})
+    update_done(uid, {'ocr_status': 'failed', 'ai_extracted': error_json})
+
+def _run_tesseract(image_path):
+    """Run tesseract (hrv+eng, --psm 6) on one image and return its stdout decoded as UTF-8."""
+    r = subprocess.run(['tesseract', str(image_path), '-', '-l', 'hrv+eng', '--psm', '6'],
+                       capture_output=True, timeout=90)
+    return r.stdout.decode('utf-8', 'ignore')
+
+
+def extract_text_from_file(path):
+    """Extract the text of an uploaded file, choosing the tool by file suffix.
+
+    Returns a tuple (text, method). `method` names the tool that produced the
+    text ('pdftotext', 'tesseract', 'text') or, when `text` is empty, the
+    reason: 'missing', 'pdf_err:<exc>', 'img_err:<exc>' or 'unsupported:<suffix>'.
+    """
+    p = Path(path)
+    if not p.exists():
+        return ('', 'missing')
+    suf = p.suffix.lower()
+    if suf == '.pdf':
+        # Try pdftotext first (fast, embedded text)
+        try:
+            r = subprocess.run(['pdftotext', '-layout', str(p), '-'], capture_output=True, timeout=60)
+            txt = r.stdout.decode('utf-8', 'ignore')
+            if len(txt.strip()) > 100:
+                return (txt, 'pdftotext')
+        except Exception:
+            # Deliberately best-effort: any failure here just means we fall through to OCR.
+            pass
+        # Fallback: rasterize + tesseract
+        try:
+            # A fresh, uniquely named scratch directory: two uploads with the same file
+            # stem can no longer mix their pages, and it is removed even when a step fails.
+            with tempfile.TemporaryDirectory(prefix='ocr_') as tmp:
+                subprocess.run(['pdftoppm', '-r', '200', str(p), f'{tmp}/page'], timeout=120, check=True)
+                chunks = [_run_tesseract(img) for img in sorted(Path(tmp).glob('page-*.ppm'))]
+            return ('\n'.join(chunks), 'tesseract')
+        except Exception as e:
+            return ('', f'pdf_err:{e}')
+    elif suf in ('.jpg', '.jpeg', '.png', '.tiff', '.tif'):
+        try:
+            return (_run_tesseract(p), 'tesseract')
+        except Exception as e:
+            return ('', f'img_err:{e}')
+    elif suf in ('.txt', '.csv'):
+        # Decode as UTF-8 like every other branch instead of the locale's default encoding.
+        return (p.read_text(encoding='utf-8', errors='ignore'), 'text')
+    return ('', f'unsupported:{suf}')
+
+# Croatian invoice patterns
+# OIB: exactly 11 digits standing on their own.
+RE_OIB = re.compile(r'\b(\d{11})\b')
+# Day / month / year separated by dots, whitespace or dashes (year 20xx).
+RE_DATE_DOT = re.compile(r'\b(\d{1,2})[.\s\-]+(\d{1,2})[.\s\-]+(20\d{2})\b')
+# Year / month / day separated by dashes or slashes (year 20xx).
+RE_DATE_ISO = re.compile(r'\b(20\d{2})[\-/](\d{1,2})[\-/](\d{1,2})\b')
+# Label followed by the invoice number. Because of (?i) the capture class also matches
+# lowercase letters, so the lookahead requires at least one digit in the captured token;
+# without it "Račun broj: 2024-001" captured the word "broj" instead of the number.
+RE_INVOICE_NO = re.compile(r'(?i)(?:broj\s+fakture|ra[čc]un|invoice|fakture|broj|no\.?|br\.?)[\s:]+((?=[A-Z0-9\-/.]*\d)[A-Z0-9\-/.]{4,30})')
+# Total label followed by a 4–15 character run of digits, dots and commas.
+RE_AMOUNT = re.compile(r'(?i)(?:ukupno|to pay|total|za platiti|iznos|sveukupno|za naplatu)[\s:€]*([\d.,]{4,15})')
+# "HR" followed by 19 digits, written without spaces.
+RE_IBAN = re.compile(r'\b(HR\d{19})\b')
+# PDV / VAT label followed by a number.
+RE_VAT = re.compile(r'(?i)(?:pdv|vat)[\s:]*?([\d,.]+)')
+
+def parse_amount(s):
+    """Parse a printed amount such as '1.234,56' or '1,234.56' into a float.
+
+    Whichever of ',' and '.' occurs last is taken as the decimal separator and
+    the other one as a grouping mark. A separator that occurs more than once
+    with no other separator after it (e.g. '1.234.567') is treated as a
+    grouping mark too. Whitespace and leading/trailing separators are dropped.
+
+    Returns None for empty or unparseable input.
+    """
+    if not s:
+        return None
+    # Remove all whitespace, then stray punctuation picked up around the number.
+    s = ''.join(s.split()).strip('.,')
+    if not s:
+        return None
+    if s.rfind(',') > s.rfind('.'):
+        decimal, grouping = ',', '.'
+    else:
+        decimal, grouping = '.', ','
+    s = s.replace(grouping, '')
+    if s.count(decimal) > 1:
+        # A decimal separator cannot repeat, so these are grouping marks.
+        s = s.replace(decimal, '')
+    else:
+        s = s.replace(decimal, '.')
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+# IBAN as printed: "HR" + 19 digits, optionally grouped with spaces or tabs.
+_RE_IBAN_SPACED = re.compile(r'\bHR(?:[ \t]*\d){19}(?!\d)')
+
+
+def _first_valid_date(text):
+    """Return the first real calendar date in `text` as 'YYYY-MM-DD', or None.
+
+    Day-month-year matches are tried before year-month-day matches; matches that
+    are not valid calendar dates (e.g. 99.99.2024) are skipped.
+    """
+    for rx, order in ((RE_DATE_DOT, 'dmy'), (RE_DATE_ISO, 'ymd')):
+        for m in rx.finditer(text):
+            a, b, c = (int(g) for g in m.groups())
+            year, month, day = (c, b, a) if order == 'dmy' else (a, b, c)
+            try:
+                return datetime.date(year, month, day).isoformat()
+            except ValueError:
+                continue
+    return None
+
+
+def extract_fields(text):
+    """Best-effort regex-based field extraction for HR invoices.
+
+    `text` may be None or empty. Returns a dict that always contains
+    'raw_chars' and, when found, 'oibs_found', 'vendor_oib', 'customer_oib',
+    'invoice_no', 'invoice_date', 'amount_gross', 'amounts_found', 'iban'
+    and 'vendor_name'.
+    """
+    text = text or ''
+    out = {'raw_chars': len(text)}
+    # OIBs (vendor first usually appears in header). Pick from the de-duplicated
+    # list so a vendor OIB printed twice is not reported as the customer's as well.
+    oibs = list(dict.fromkeys(RE_OIB.findall(text)))
+    if oibs:
+        out['oibs_found'] = oibs
+        out['vendor_oib'] = oibs[0]
+        if len(oibs) > 1:
+            out['customer_oib'] = oibs[1]
+    # Invoice number
+    m = RE_INVOICE_NO.search(text)
+    if m:
+        out['invoice_no'] = m.group(1).strip()
+    # Date
+    invoice_date = _first_valid_date(text)
+    if invoice_date:
+        out['invoice_date'] = invoice_date
+    # Amount
+    amts = [parse_amount(raw) for raw in RE_AMOUNT.findall(text)]
+    amts = [a for a in amts if a is not None and a > 0.01]
+    if amts:
+        out['amount_gross'] = max(amts)  # usually total is the largest
+        out['amounts_found'] = amts[:5]
+    # IBAN: match on the original text so the word boundary before "HR" survives
+    # (stripping spaces first turned "IBAN HR12 ..." into "IBANHR12..." and missed it).
+    m = _RE_IBAN_SPACED.search(text)
+    if m:
+        out['iban'] = ''.join(m.group(0).split())
+    # First plausible line among the first 8 as vendor name guess
+    for line in text.split('\n')[:8]:
+        ln = line.strip()
+        if 5 < len(ln) < 80 and not RE_OIB.search(ln) and not any(c.isdigit() for c in ln[:3]):
+            out['vendor_name'] = ln
+            break
+    return out
+
+def process(row):
+    """Run text extraction and field extraction for one claimed upload row and store the outcome.
+
+    Reads 'id', 'klub_id', 'file_name' and 'file_path' from `row`. Rows whose
+    extracted text is shorter than 20 characters, or whose processing raises,
+    are marked failed via fail(); otherwise the extracted fields are written
+    with ocr_status 'done'.
+    """
+    uid = row['id']
+    print(f"[OCR] uid={uid} klub={row['klub_id']} file={row['file_name']}")
+    try:
+        text, method = extract_text_from_file(row['file_path'])
+        usable_chars = len(text.strip())
+        if usable_chars < 20:
+            fail(uid, f"OCR yielded {usable_chars} chars (method={method})")
+            print(f"[OCR] uid={uid} FAIL — empty")
+            return
+
+        fields = extract_fields(text)
+        fields['ocr_method'] = method
+
+        upd = {'ocr_status': 'done'}
+        column_sources = (
+            ('ai_invoice_no', 'invoice_no'),
+            ('ai_invoice_date', 'invoice_date'),
+            ('ai_vendor_name', 'vendor_name'),
+            ('ai_vendor_oib', 'vendor_oib'),
+            ('ai_amount_gross', 'amount_gross'),
+        )
+        for column, key in column_sources:
+            upd[column] = fields.get(key)
+        upd['ai_extracted'] = json.dumps(fields, ensure_ascii=False, default=str)
+        upd['ocr_text'] = text[:50000]
+
+        # Retry without ocr_text when the failure message mentions that column
+        # (e.g. the column does not exist); any other failure is re-raised.
+        try:
+            update_done(uid, upd)
+        except Exception as e:
+            if 'ocr_text' not in str(e):
+                raise
+            upd.pop('ocr_text', None)
+            update_done(uid, upd)
+
+        vendor = fields.get('vendor_name','?')[:30]
+        amount = fields.get('amount_gross','?')
+        oib = fields.get('vendor_oib','?')
+        print(f"[OCR] uid={uid} OK · vendor={vendor} · amt={amount} · oib={oib}")
+    except Exception as e:
+        traceback.print_exc()
+        fail(uid, str(e))
+
+def main():
+    """Poll for pending uploads forever, processing one claimed row at a time.
+
+    Sleeps POLL seconds after an empty poll (logging every 10th consecutive one)
+    and POLL*2 seconds after an unexpected error; Ctrl-C ends the loop.
+    """
+    print(f"[OCR worker] starting, poll every {POLL}s")
+    idle_polls = 0
+    while True:
+        try:
+            claimed = claim_one()
+            if claimed:
+                process(claimed)
+                idle_polls = 0
+                continue
+            idle_polls += 1
+            if idle_polls % 10 == 0:
+                print(f"[OCR] idle x{idle_polls}")
+            time.sleep(POLL)
+        except KeyboardInterrupt:
+            print('\n[OCR] shutdown')
+            break
+        except Exception as e:
+            print('[OCR] loop error:', e)
+            traceback.print_exc()
+            time.sleep(POLL*2)
+
+# Start the polling loop only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()