feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
enrichment_worker.py — 24/7 background enrichment for PGŽ Sport
|
||||
Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
|
||||
@@ -54,7 +57,7 @@ if _pgh in ('localhost', '127.0.0.1'):
|
||||
DB = dict(host=_pgh, port=_pgp,
|
||||
dbname=os.environ.get('PG_DB', 'rinet_v3'),
|
||||
user=os.environ.get('PG_USER', 'rinet'),
|
||||
password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))
|
||||
password=os.environ["DB_PASSWORD"])
|
||||
|
||||
|
||||
def _log(msg: str) -> None:
|
||||
@@ -200,6 +203,8 @@ def _pick_sportas(limit: int = 50) -> list[int]:
|
||||
)
|
||||
AND ((c.metadata->>'enriched_at') IS NULL
|
||||
OR (c.metadata->>'enriched_at')::timestamptz < now() - interval '7 days')
|
||||
AND ((c.metadata->'enrichment_block') IS NULL
|
||||
OR (c.metadata->'enrichment_block'->>'at')::timestamptz < now() - interval '30 days')
|
||||
ORDER BY random()
|
||||
LIMIT %s
|
||||
"""
|
||||
@@ -216,6 +221,8 @@ def _pick_klub(limit: int = 50) -> list[int]:
|
||||
AND {cov} < %s
|
||||
AND ((metadata->>'enriched_at') IS NULL
|
||||
OR (metadata->>'enriched_at')::timestamptz < now() - interval '14 days')
|
||||
AND ((metadata->'enrichment_block') IS NULL
|
||||
OR (metadata->'enrichment_block'->>'at')::timestamptz < now() - interval '30 days')
|
||||
ORDER BY random()
|
||||
LIMIT %s
|
||||
"""
|
||||
@@ -231,6 +238,8 @@ def _pick_savez(limit: int = 50) -> list[int]:
|
||||
WHERE {cov} < %s
|
||||
AND ((metadata->>'enriched_at') IS NULL
|
||||
OR (metadata->>'enriched_at')::timestamptz < now() - interval '14 days')
|
||||
AND ((metadata->'enrichment_block') IS NULL
|
||||
OR (metadata->'enrichment_block'->>'at')::timestamptz < now() - interval '30 days')
|
||||
ORDER BY random()
|
||||
LIMIT %s
|
||||
"""
|
||||
|
||||
@@ -0,0 +1,396 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
enrichment_worker.py — 24/7 background enrichment for PGŽ Sport
|
||||
Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
|
||||
Date: 2026-05-04
|
||||
Version: 1.0.0
|
||||
|
||||
Polls pgz_sport.clanovi / klubovi / savezi for under-enriched rows, then
|
||||
calls the live HTTP endpoints (POST /sport/api/v2/enrich/{kind}/{id} and
|
||||
.../apply) so every row goes through the same pipeline (and audit log)
|
||||
that the UI uses. This avoids forking the enrichment logic.
|
||||
|
||||
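Selection rules (per cycle):
  - sportas: active clanovi rows whose field coverage is below
             ENRICHER_COVERAGE_MAX, that have a sport (own or via the club)
             or a known external linkage, not enriched in the last 7 days and
             not blocked in the last 30 days; ORDER BY random() LIMIT 50
  - klub:    active klubovi rows below the coverage threshold, not enriched in
             the last 14 days and not blocked in the last 30 days;
             ORDER BY random() LIMIT 20
  - savez:   savezi rows below the coverage threshold, not enriched in the
             last 14 days and not blocked in the last 30 days;
             ORDER BY random() LIMIT 5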
|
||||
|
||||
Sleep 300 s between cycles (configurable via ENRICHER_SLEEP env).
|
||||
|
||||
Heartbeat to redis (cc:pgz-enricher:heartbeat) and log every cycle to
|
||||
/opt/pgz-sport/_logs/enrichment_worker.log.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
API_BASE = os.environ.get('PGZ_API_BASE', 'http://localhost:8095')
|
||||
SLEEP_S = int(os.environ.get('ENRICHER_SLEEP', '300'))
|
||||
DRY = os.environ.get('ENRICHER_DRY', '0') == '1'
|
||||
USER_HDR = os.environ.get('ENRICHER_USER', 'enricher@pgz.local')
|
||||
|
||||
LOG_PATHS = [
|
||||
'/var/log/pgz-sport-enricher.log',
|
||||
'/opt/pgz-sport/_logs/enrichment_worker.log',
|
||||
]
|
||||
CONFIDENCE_MIN = float(os.environ.get('ENRICHER_CONFIDENCE', '0.7'))
|
||||
COVERAGE_MAX = int(os.environ.get('ENRICHER_COVERAGE_MAX', '70'))
|
||||
|
||||
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
|
||||
_pgp = int(os.environ.get('PG_PORT', '6432'))
|
||||
if _pgh in ('localhost', '127.0.0.1'):
|
||||
_pgh = os.environ.get('DB_HOST', '10.10.0.2')
|
||||
_pgp = int(os.environ.get('DB_PORT', '6432'))
|
||||
DB = dict(host=_pgh, port=_pgp,
|
||||
dbname=os.environ.get('PG_DB', 'rinet_v3'),
|
||||
user=os.environ.get('PG_USER', 'rinet'),
|
||||
password=os.environ["DB_PASSWORD"])
|
||||
|
||||
|
||||
def _log(msg: str) -> None:
    """Print a UTC-timestamped log line and append it to every LOG_PATHS file.

    The line is always printed to stdout (flushed). File logging is
    best-effort: a path that cannot be created or written is skipped so that
    logging never interrupts the worker.

    Args:
        msg: Message text to log.
    """
    # isoformat() on an aware UTC datetime already ends in "+00:00"; replace
    # it with the "Z" designator instead of appending a second zone marker
    # (which produced the invalid "...+00:00Z").
    stamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    line = f"{stamp} {msg}"
    print(line, flush=True)
    for p in LOG_PATHS:
        try:
            os.makedirs(os.path.dirname(p), exist_ok=True)
            # Messages contain non-ASCII characters; pin the encoding so the
            # write does not depend on the process locale.
            with open(p, 'a', encoding='utf-8') as f:
                f.write(line + "\n")
        except Exception:
            # Deliberately best-effort: an unwritable log path must not stop
            # the worker.
            pass
|
||||
|
||||
|
||||
def _redis():
    """Return a pinged redis client, or None if redis cannot be used.

    Connection settings come from REDIS_HOST / REDIS_PORT / REDIS_PASS.
    The password is stripped of surrounding whitespace and quotes; the
    connection is attempted first with that password and then without one.
    """
    try:
        import redis
    except Exception:
        return None
    redis_host = os.environ.get('REDIS_HOST', 'localhost')
    redis_port = int(os.environ.get('REDIS_PORT', '6379'))
    raw_secret = os.environ.get('REDIS_PASS') or ''
    secret = raw_secret.strip().strip("'").strip('"') or None
    for candidate in (secret, None):
        try:
            client = redis.Redis(
                host=redis_host,
                port=redis_port,
                password=candidate,
                decode_responses=True,
                socket_connect_timeout=2,
            )
            client.ping()
            return client
        except Exception:
            # This credential did not work; try the next one.
            continue
    return None
|
||||
|
||||
|
||||
def _heartbeat(meta: dict | None = None) -> None:
    """Write the current epoch second to the heartbeat key in redis.

    When `meta` is given it is also stored, JSON-encoded, under the
    last_cycle key. Redis being unavailable or failing is silently ignored.
    """
    client = _redis()
    if client:
        try:
            now_s = str(int(time.time()))
            client.set('cc:pgz-enricher:heartbeat', now_s)
            if meta is not None:
                encoded = json.dumps(meta, default=str)
                client.set('cc:pgz-enricher:last_cycle', encoded)
        except Exception:
            pass
|
||||
|
||||
|
||||
def _is_paused() -> bool:
    """Return True when the redis pause key is set to '1'.

    Returns False when redis is unavailable or the read fails.
    """
    client = _redis()
    if not client:
        return False
    try:
        flag = client.get('cc:pgz-enricher:pause')
        return '1' == (flag or '0')
    except Exception:
        return False
|
||||
|
||||
|
||||
def _consume_run_now() -> bool:
    """Check the run_now key in redis and reset it when it is '1'.

    Returns True only when the key was '1' and was reset to '0'. Returns
    False when redis is unavailable, the key is not '1', or any redis call
    fails.
    """
    client = _redis()
    if not client:
        return False
    triggered = False
    try:
        if client.get('cc:pgz-enricher:run_now') == '1':
            client.set('cc:pgz-enricher:run_now', '0')
            triggered = True
    except Exception:
        return False
    return triggered
|
||||
|
||||
|
||||
def _refresh_confidence() -> None:
    """Update the module-level CONFIDENCE_MIN from the redis override key.

    The global is left untouched when redis is unavailable, the key is
    empty/missing, or the stored value cannot be parsed as a float.
    """
    global CONFIDENCE_MIN
    client = _redis()
    if client:
        try:
            override = client.get('cc:pgz-enricher:confidence')
            if override:
                CONFIDENCE_MIN = float(override)
        except Exception:
            pass
|
||||
|
||||
|
||||
def _bump_fields_24h(n: int) -> None:
    """Add `n` to the redis counter of fields written in the current 24h window.

    Does nothing when `n` is not positive or redis is unavailable; redis
    errors are ignored.

    Args:
        n: Number of newly applied fields to add to the counter.
    """
    if n <= 0:
        return
    r = _redis()
    if not r:
        return
    key = 'cc:pgz-enricher:fields_24h'
    try:
        r.incrby(key, n)
        # Arm the TTL only when the key has none. Re-arming it on every bump
        # kept pushing the expiry forward, so with a continuously running
        # worker the "24h" counter never reset.
        if r.ttl(key) < 0:
            r.expire(key, 86400)
    except Exception:
        pass
|
||||
|
||||
|
||||
def _db():
    """Open a new psycopg2 connection from DB with autocommit enabled."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
||||
|
||||
|
||||
# Coverage = (filled key fields) / (total key fields) * 100. Keep these in sync
|
||||
# with enrich_router.enrich_preview() which surfaces the same scores in the UI.
|
||||
_KLUB_KEYS = ('oib','sport','grad','predsjednik','tajnik','web','email','telefon',
|
||||
'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')
|
||||
_SAVEZ_KEYS = ('oib','sport','predsjednik','tajnik','email','telefon','web',
|
||||
'adresa','godina_osnutka')
|
||||
# Coverage for sportas — fields the user actually wants populated.
|
||||
_SPORTAS_KEYS = ('sport','profile_url','slika_url','hns_igrac_id','biografija',
|
||||
'datum_rodenja','mjesto_rodenja','broj_dresa')
|
||||
|
||||
|
||||
def _coverage_expr(table_keys: tuple[str, ...], prefix: str = '') -> str:
|
||||
"""Postgres expression that returns 0..100 coverage % for the row.
|
||||
|
||||
`prefix` is e.g. 'c.' when the SQL uses a table alias.
|
||||
"""
|
||||
parts = []
|
||||
for k in table_keys:
|
||||
col = f"{prefix}{k}"
|
||||
parts.append(f"(CASE WHEN {col} IS NOT NULL AND ({col}::text) <> '' THEN 1 ELSE 0 END)")
|
||||
total = len(table_keys)
|
||||
return f"((({' + '.join(parts)})::numeric * 100) / {total})"
|
||||
|
||||
|
||||
def _pick_sportas(limit: int = 50) -> list[int]:
    """Athletes with coverage<COVERAGE_MAX, randomly ordered.

    Selection is sport-agnostic now: the router decides which federation to
    query based on c.sport (or klubovi.sport via the JOIN). We require either
    sport to be set on the row OR a known external linkage so we don't burn
    cycles on rows the router can't enrich.

    Rows enriched within the last 7 days, or carrying an enrichment_block
    newer than 30 days, are excluded.

    Args:
        limit: Maximum number of ids to return.

    Returns:
        Ids of the selected pgz_sport.clanovi rows.
    """
    cov = _coverage_expr(_SPORTAS_KEYS, prefix='c.')
    sql = f"""
        SELECT c.id
        FROM pgz_sport.clanovi c
        LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
        WHERE c.aktivan = TRUE
          AND {cov} < %s
          AND (
                c.sport IS NOT NULL
             OR k.sport IS NOT NULL
             OR c.source IN ('hns_semafor','hns_family','manual','godisnjak','hbs_savez','hks_savez')
             OR jsonb_exists(c.vanjski_id, 'hns_comet')
             OR (c.source_url ILIKE '%%semafor.hns.family%%')
             OR (c.profile_url ILIKE '%%semafor.hns.family%%')
             OR (c.source_url ILIKE '%%hrvatski-bocarski-savez.hr%%')
             OR (c.profile_url ILIKE '%%hrvatski-bocarski-savez.hr%%')
          )
          AND ((c.metadata->>'enriched_at') IS NULL
               OR (c.metadata->>'enriched_at')::timestamptz < now() - interval '7 days')
          AND ((c.metadata->'enrichment_block') IS NULL
               OR (c.metadata->'enrichment_block'->>'at')::timestamptz < now() - interval '30 days')
        ORDER BY random()
        LIMIT %s
    """
    # psycopg2's `with connection:` only ends the transaction; it does not
    # close the connection. Close it explicitly so each call does not leak
    # one connection.
    conn = _db()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (COVERAGE_MAX, limit))
            return [r[0] for r in cur.fetchall()]
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _pick_klub(limit: int = 50) -> list[int]:
    """Active clubs with coverage<COVERAGE_MAX, randomly ordered.

    Rows enriched within the last 14 days, or carrying an enrichment_block
    newer than 30 days, are excluded.

    Args:
        limit: Maximum number of ids to return.

    Returns:
        Ids of the selected pgz_sport.klubovi rows.
    """
    cov = _coverage_expr(_KLUB_KEYS)
    sql = f"""
        SELECT id FROM pgz_sport.klubovi
        WHERE aktivan = TRUE
          AND {cov} < %s
          AND ((metadata->>'enriched_at') IS NULL
               OR (metadata->>'enriched_at')::timestamptz < now() - interval '14 days')
          AND ((metadata->'enrichment_block') IS NULL
               OR (metadata->'enrichment_block'->>'at')::timestamptz < now() - interval '30 days')
        ORDER BY random()
        LIMIT %s
    """
    # psycopg2's `with connection:` only ends the transaction; it does not
    # close the connection. Close it explicitly so each call does not leak
    # one connection.
    conn = _db()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (COVERAGE_MAX, limit))
            return [r[0] for r in cur.fetchall()]
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _pick_savez(limit: int = 50) -> list[int]:
    """Federations with coverage<COVERAGE_MAX, randomly ordered.

    Rows enriched within the last 14 days, or carrying an enrichment_block
    newer than 30 days, are excluded.

    Args:
        limit: Maximum number of ids to return.

    Returns:
        Ids of the selected pgz_sport.savezi rows.
    """
    cov = _coverage_expr(_SAVEZ_KEYS)
    sql = f"""
        SELECT id FROM pgz_sport.savezi
        WHERE {cov} < %s
          AND ((metadata->>'enriched_at') IS NULL
               OR (metadata->>'enriched_at')::timestamptz < now() - interval '14 days')
          AND ((metadata->'enrichment_block') IS NULL
               OR (metadata->'enrichment_block'->>'at')::timestamptz < now() - interval '30 days')
        ORDER BY random()
        LIMIT %s
    """
    # psycopg2's `with connection:` only ends the transaction; it does not
    # close the connection. Close it explicitly so each call does not leak
    # one connection.
    conn = _db()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (COVERAGE_MAX, limit))
            return [r[0] for r in cur.fetchall()]
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _http_post(path: str, body: dict | None = None) -> dict | None:
    """POST a JSON body to API_BASE + path and return the decoded JSON reply.

    The request carries the X-User-Email header set to USER_HDR and uses a
    60 second timeout. Any failure while sending or decoding is logged and
    reported as None.
    """
    target = API_BASE.rstrip('/') + path
    payload = json.dumps(body or {}).encode('utf-8')
    request = urllib.request.Request(
        target,
        data=payload,
        method='POST',
        headers={
            'Content-Type': 'application/json',
            'X-User-Email': USER_HDR,
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as resp:
            raw = resp.read()
            return json.loads(raw.decode('utf-8'))
    except Exception as e:
        _log(f"POST {path} failed: {type(e).__name__}: {e}")
        return None
|
||||
|
||||
|
||||
# Per-source confidence weights. Anything written by an HNS Semafor /igraci/
|
||||
# page is structured + verified, so we trust it implicitly. Wikipedia summaries
|
||||
# are mostly safe but free-form. sport-pgz.hr "O nama" pages tend to be the
|
||||
# zajednica generic info, so we down-weight them so a plain DeepSeek synthesis
|
||||
# off a single sport-pgz.hr source falls below the gate.
|
||||
_SOURCE_WEIGHTS = {
|
||||
'semafor.hns.family': 0.95,
|
||||
'hrvatski-bocarski-savez.hr': 0.92,
|
||||
'hns-cff.hr': 0.90,
|
||||
'hks-cbf.hr': 0.90,
|
||||
'hrs.hr': 0.90,
|
||||
'hos-cvf.hr': 0.90,
|
||||
'hvs.hr': 0.90,
|
||||
'hps.hr': 0.90,
|
||||
'atletika.hr': 0.90,
|
||||
'htsavez.hr': 0.90,
|
||||
'judo-savez.hr': 0.88,
|
||||
'karate.hr': 0.88,
|
||||
'veslacki-savez.hr': 0.88,
|
||||
'gimnastika.hr': 0.88,
|
||||
'stolni-tenis.hr': 0.88,
|
||||
'kuglanje.hr': 0.88,
|
||||
'wikipedia.hr': 0.80,
|
||||
'sport-pgz.hr': 0.55,
|
||||
}
|
||||
# Fields that are safe to auto-write even from low-confidence sources because
|
||||
# they come from the entity's own structured page (URLs, IDs).
|
||||
_HARD_FIELDS = {'profile_url','source_url','slika_url','hns_igrac_id'}
|
||||
|
||||
|
||||
def _confidence(proposed: dict, sources: list[dict]) -> float:
    """Crude 0..1 score: best source weight plus a small evidence bonus.

    Each source is weighted by looking up its lower-cased 'source' value in
    _SOURCE_WEIGHTS (0.50 when unknown). The bonus is 0.03 per additional
    source, capped at 0.10, and the total is capped at 1.0. Returns 0.0 when
    nothing is proposed or there are no sources.
    """
    if not proposed:
        return 0.0
    weights = [
        _SOURCE_WEIGHTS.get((entry.get('source') or '').lower(), 0.50)
        for entry in (sources or [])
    ]
    if not weights:
        return 0.0
    extra_sources = len(sources) - 1
    bonus = min(0.10, 0.03 * extra_sources)
    return min(1.0, max(weights) + bonus)
|
||||
|
||||
|
||||
def _process(kind: str, eid: int) -> tuple[int, list[str]]:
    """Preview → confidence gate → apply. Returns (#applied, fields).

    Fields listed in _HARD_FIELDS are always sent to the apply endpoint; all
    other proposed fields are sent only when the confidence score reaches
    CONFIDENCE_MIN. Returns (0, []) when the preview fails, proposes nothing,
    nothing passes the gate, or the apply reply has no 'applied' entry.
    """
    preview = _http_post(f'/api/v2/enrich/{kind}/{eid}', {})
    if not preview:
        return (0, [])
    proposed = preview.get('proposed') or {}
    sources = preview.get('sources') or []
    if not proposed:
        return (0, [])

    conf = _confidence(proposed, sources)

    # Hard structured fields (URLs / IDs) go first and bypass the gate; the
    # remaining ones are held back until the gate is checked.
    fields = {}
    gated = {}
    for name, value in proposed.items():
        bucket = fields if name in _HARD_FIELDS else gated
        bucket[name] = value
    if conf >= CONFIDENCE_MIN:
        fields.update(gated)

    if not fields:
        _log(f" {kind}#{eid} skipped — confidence {conf:.2f} < {CONFIDENCE_MIN:.2f}")
        return (0, [])

    res = _http_post(
        f'/api/v2/enrich/{kind}/{eid}/apply',
        {'fields': fields, 'sources': sources},
    )
    if not res or 'applied' not in res:
        return (0, [])

    applied = res['applied']
    if applied:
        _log(f" {kind}#{eid} conf={conf:.2f} → +{len(applied)} {','.join(applied.keys())}")
    return (len(applied), list(applied.keys()))
|
||||
|
||||
|
||||
def _cycle() -> dict:
    """Run one enrichment pass over athletes, clubs and federations.

    For each kind the picker selects candidate ids, which are then processed
    one at a time with a 1.5 s pause and a heartbeat after each. In dry-run
    mode (DRY) candidates are only counted in the log, never processed.

    Returns:
        Stats dict with the number of processed rows per kind ('sportas',
        'klub', 'savez'), 'fields_total', 'started_at', 'ended_at' and
        'elapsed_s'.
    """
    _refresh_confidence()
    started = time.time()
    out = {'sportas': 0, 'klub': 0, 'savez': 0, 'fields_total': 0,
           'started_at': datetime.now(timezone.utc).isoformat()}
    fields_total = 0
    paused = False
    for kind, picker, limit in (
        ('sportas', _pick_sportas, 50),
        ('klub', _pick_klub, 20),
        ('savez', _pick_savez, 5),
    ):
        ids = picker(limit)
        random.shuffle(ids)
        _log(f"cycle: {kind} candidates={len(ids)} coverage<{COVERAGE_MAX} conf>={CONFIDENCE_MIN}")
        if DRY:
            # Dry run: candidates are reported above but never processed.
            continue
        for eid in ids:
            if _is_paused():
                _log("paused → break out of cycle")
                paused = True
                break
            n, _ = _process(kind, eid)
            out[kind] += 1
            fields_total += n
            if n:
                _bump_fields_24h(n)
            time.sleep(1.5)  # gentle pacing
            _heartbeat()
        if paused:
            # Leave the whole cycle. The inner `break` alone let the
            # remaining kinds still run their picker query and log a second
            # "paused" line.
            break
    out['fields_total'] = fields_total
    out['elapsed_s'] = round(time.time() - started, 1)
    out['ended_at'] = datetime.now(timezone.utc).isoformat()
    return out
|
||||
|
||||
|
||||
def main() -> int:
    """Run enrichment cycles forever, honouring the pause and run-now signals.

    While paused the loop only emits a heartbeat every 30 s. After each cycle
    (successful or failed) it waits up to SLEEP_S seconds, checking the
    signals every 5 s.
    """
    _log(f"enrichment_worker starting | API_BASE={API_BASE} | sleep={SLEEP_S}s | dry={DRY}")
    while True:
        if _is_paused():
            _log("paused (cc:pgz-enricher:pause=1) — sleeping 30s")
            _heartbeat({'paused': True})
            time.sleep(30)
            continue

        try:
            stats = _cycle()
            _log(f"cycle done: {json.dumps(stats)}")
            _heartbeat(stats)
        except Exception as e:
            _log(f"cycle FAILED: {type(e).__name__}: {e}")
            _heartbeat({'error': str(e)[:200]})

        # Wait in 5-second slices so /worker/run-now and /pause respond fast.
        for _ in range(0, SLEEP_S, 5):
            if _consume_run_now():
                _log("run-now signal received → starting next cycle early")
                break
            if _is_paused():
                break
            time.sleep(5)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main() or 0)
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
OCR worker daemon — pgz_sport.invoice_uploads
|
||||
Polls pending uploads → OCR (tesseract / pdfplumber) → regex extraction → (optional LLM)
|
||||
@@ -9,7 +12,7 @@ import os, re, time, json, subprocess, traceback, hashlib
|
||||
import psycopg2, psycopg2.extras
|
||||
from pathlib import Path
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
DB = dict(host='10.10.0.2', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
|
||||
POLL = 8 # seconds
|
||||
|
||||
def db():
|
||||
|
||||
Executable
+183
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
OCR worker daemon — pgz_sport.invoice_uploads
|
||||
Polls pending uploads → OCR (tesseract / pdfplumber) → regex extraction → (optional LLM)
|
||||
→ updates ai_invoice_no, ai_vendor_name, ai_vendor_oib, ai_amount_gross, ai_extracted
|
||||
→ flips ocr_status to 'done' or 'failed'
|
||||
"""
|
||||
import os, re, time, json, subprocess, traceback, hashlib
|
||||
import psycopg2, psycopg2.extras
|
||||
from pathlib import Path
|
||||
|
||||
DB = dict(host='10.10.0.2', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
|
||||
POLL = 8 # seconds
|
||||
|
||||
def db():
    """Open a new psycopg2 connection from DB with autocommit enabled."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
||||
|
||||
def claim_one():
    """Claim 1 pending row → 'processing'.

    The oldest pending upload (by uploaded_at) is selected with
    FOR UPDATE SKIP LOCKED and flipped to 'processing' in the same statement.

    Returns:
        The claimed row as a dict-like RealDictCursor row, or None when
        nothing is pending.
    """
    # psycopg2's `with connection:` does not close the connection, so the
    # connection and cursor are closed explicitly here to avoid leaking one
    # connection per poll.
    conn = db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""UPDATE pgz_sport.invoice_uploads
                           SET ocr_status='processing', processed_at=NOW()
                           WHERE id = (SELECT id FROM pgz_sport.invoice_uploads
                                       WHERE ocr_status='pending'
                                       ORDER BY uploaded_at LIMIT 1 FOR UPDATE SKIP LOCKED)
                           RETURNING *""")
            return cur.fetchone()
    finally:
        conn.close()
|
||||
|
||||
def update_done(uid, fields):
    """Write the given column values to one pgz_sport.invoice_uploads row.

    Args:
        uid: Id of the row to update.
        fields: Mapping of column name → new value. Column names are
            interpolated into the SQL text, so they must come from trusted
            code, never from user input; values are passed as parameters.
    """
    if not fields:
        # Nothing to set; an empty SET clause would be invalid SQL.
        return
    sets, args = [], []
    for k, v in fields.items():
        sets.append(f"{k}=%s")
        args.append(v)
    args.append(uid)
    # psycopg2's `with connection:` does not close the connection, so close
    # it explicitly to avoid leaking one connection per update.
    conn = db()
    try:
        with conn.cursor() as cur:
            cur.execute(f"UPDATE pgz_sport.invoice_uploads SET {','.join(sets)} WHERE id=%s", args)
    finally:
        conn.close()
|
||||
|
||||
def fail(uid, err):
    """Mark an upload as failed and store the first 500 chars of the error."""
    error_json = json.dumps({'error': err[:500]})
    update_done(uid, {'ocr_status': 'failed', 'ai_extracted': error_json})
|
||||
|
||||
def _tesseract_text(image_path):
    """Run tesseract (hrv+eng, psm 6) on one image and return its stdout text."""
    r = subprocess.run(['tesseract', str(image_path), '-', '-l', 'hrv+eng', '--psm', '6'],
                       capture_output=True, timeout=90)
    return r.stdout.decode('utf-8', 'ignore')


def extract_text_from_file(path):
    """Returns (text, method).

    `method` names how the text was obtained: 'pdftotext', 'tesseract' or
    'text'. On failure `text` is '' and `method` explains why: 'missing',
    'pdf_err:<error>', 'img_err:<error>' or 'unsupported:<suffix>'.

    Args:
        path: Path of the uploaded file; the handler is chosen by suffix.
    """
    import tempfile  # local import: only needed for the PDF raster fallback

    p = Path(path)
    if not p.exists():
        return ('', 'missing')
    suf = p.suffix.lower()
    if suf == '.pdf':
        # Try pdftotext first (fast, embedded text)
        try:
            r = subprocess.run(['pdftotext', '-layout', str(p), '-'], capture_output=True, timeout=60)
            txt = r.stdout.decode('utf-8', 'ignore')
            if len(txt.strip()) > 100:
                return (txt, 'pdftotext')
        except Exception:
            # Deliberate: any pdftotext problem falls through to OCR below.
            pass
        # Fallback: rasterize + tesseract
        try:
            # A unique scratch directory avoids collisions between uploads
            # that share a file stem, and is removed even when rasterizing
            # or OCR fails part-way.
            with tempfile.TemporaryDirectory(prefix='ocr_') as tmp:
                subprocess.run(['pdftoppm', '-r', '200', str(p), f'{tmp}/page'], timeout=120, check=True)
                chunks = [_tesseract_text(img) for img in sorted(Path(tmp).glob('page-*.ppm'))]
            return ('\n'.join(chunks), 'tesseract')
        except Exception as e:
            return ('', f'pdf_err:{e}')
    elif suf in ('.jpg', '.jpeg', '.png', '.tiff', '.tif'):
        try:
            return (_tesseract_text(p), 'tesseract')
        except Exception as e:
            return ('', f'img_err:{e}')
    elif suf in ('.txt', '.csv'):
        return (p.read_text(errors='ignore'), 'text')
    return ('', f'unsupported:{suf}')
|
||||
|
||||
# Croatian invoice patterns
|
||||
RE_OIB = re.compile(r'\b(\d{11})\b')
|
||||
RE_DATE_DOT = re.compile(r'\b(\d{1,2})[.\s\-]+(\d{1,2})[.\s\-]+(20\d{2})\b')
|
||||
RE_DATE_ISO = re.compile(r'\b(20\d{2})[\-/](\d{1,2})[\-/](\d{1,2})\b')
|
||||
RE_INVOICE_NO = re.compile(r'(?i)(?:ra[čc]un|invoice|broj|fakture|broj fakture|no\.?|br\.?)[\s:]+([A-Z0-9\-/.]{4,30})')
|
||||
RE_AMOUNT = re.compile(r'(?i)(?:ukupno|to pay|total|za platiti|iznos|sveukupno|za naplatu)[\s:€]*([\d.,]{4,15})')
|
||||
RE_IBAN = re.compile(r'\b(HR\d{19})\b')
|
||||
RE_VAT = re.compile(r'(?i)(?:pdv|vat)[\s:]*?([\d,.]+)')
|
||||
|
||||
def parse_amount(s):
    """Parse a printed money amount into a float, or return None.

    Handles both Croatian style ("1.234,56") and English style ("1,234.56").
    When both separators occur, the right-most one is the decimal mark. A
    single comma on its own is a decimal mark; several commas or several dots
    on their own are thousands separators. Spaces and trailing separators
    (e.g. a sentence-ending ".") are ignored.

    Args:
        s: Amount text as captured from the document; may be empty or None.

    Returns:
        The amount as a float, or None when `s` is empty or not a number.
    """
    if not s:
        return None
    cleaned = s.replace(' ', '').replace('\u00a0', '').rstrip('.,')
    last_comma = cleaned.rfind(',')
    last_dot = cleaned.rfind('.')
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            # 1.234,56 → dots group thousands, comma is the decimal mark
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            # 1,234.56 → commas group thousands
            cleaned = cleaned.replace(',', '')
    elif last_comma != -1:
        if cleaned.count(',') == 1:
            cleaned = cleaned.replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif cleaned.count('.') > 1:
        # 1.234.567 → dots can only be thousands separators
        cleaned = cleaned.replace('.', '')
    try:
        return float(cleaned)
    except ValueError:
        return None
|
||||
|
||||
def extract_fields(text):
    """Best-effort regex-based field extraction for HR invoices.

    Args:
        text: Text of the invoice; None is treated as an empty string.

    Returns:
        Dict that always has 'raw_chars' and, when found, 'oibs_found',
        'vendor_oib', 'customer_oib', 'invoice_no', 'invoice_date'
        (YYYY-MM-DD), 'amount_gross', 'amounts_found', 'iban' and
        'vendor_name'.
    """
    from datetime import date  # local import: used only to validate dates

    # Normalise once so a None input cannot crash len() below.
    text = text or ''
    out = {'raw_chars': len(text)}
    # OIBs (vendor first usually appears in header). De-duplicate before
    # indexing so a repeated vendor OIB is not reported as the customer.
    oibs = list(dict.fromkeys(RE_OIB.findall(text)))
    if oibs:
        out['oibs_found'] = oibs
        out['vendor_oib'] = oibs[0]
        if len(oibs) > 1:
            out['customer_oib'] = oibs[1]
    # Invoice number
    m = RE_INVOICE_NO.search(text)
    if m:
        out['invoice_no'] = m.group(1).strip()
    # Date: first match that is a real calendar date; dotted d.m.y form is
    # tried before the ISO form.
    for rx, order in [(RE_DATE_DOT, 'dmy'), (RE_DATE_ISO, 'ymd')]:
        found = None
        for m in rx.finditer(text):
            g = m.groups()
            year, month, day = (g[2], g[1], g[0]) if order == 'dmy' else g
            try:
                found = date(int(year), int(month), int(day)).isoformat()
            except ValueError:
                # e.g. "99.99.2024" — digits that match but are not a date
                continue
            break
        if found:
            out['invoice_date'] = found
            break
    # Amount
    amts = [parse_amount(raw) for raw in RE_AMOUNT.findall(text)]
    amts = [a for a in amts if a and a > 0.01]
    if amts:
        out['amount_gross'] = max(amts)  # usually total is the largest
        out['amounts_found'] = amts[:5]
    # IBAN (spaces removed first so grouped IBANs still match)
    m = RE_IBAN.search(text.replace(' ', ''))
    if m:
        out['iban'] = m.group(1)
    # First plausible line among the first 8 as vendor name guess
    for line in text.split('\n')[:8]:
        ln = line.strip()
        if 5 < len(ln) < 80 and not RE_OIB.search(ln) and not any(c.isdigit() for c in ln[:3]):
            out['vendor_name'] = ln
            break
    return out
|
||||
|
||||
def process(row):
    """OCR one claimed upload row and store the extracted fields.

    Rows whose extracted text is shorter than 20 characters are marked
    failed. Otherwise the parsed fields are written with ocr_status 'done';
    if that update fails with an error mentioning 'ocr_text', it is retried
    without that column. Any other exception marks the row failed.
    """
    uid = row['id']
    print(f"[OCR] uid={uid} klub={row['klub_id']} file={row['file_name']}")
    try:
        text, method = extract_text_from_file(row['file_path'])
        usable_chars = len(text.strip())
        if usable_chars < 20:
            fail(uid, f"OCR yielded {usable_chars} chars (method={method})")
            print(f"[OCR] uid={uid} FAIL — empty")
            return

        fields = extract_fields(text)
        fields['ocr_method'] = method
        upd = {'ocr_status': 'done'}
        for column, key in (
            ('ai_invoice_no', 'invoice_no'),
            ('ai_invoice_date', 'invoice_date'),
            ('ai_vendor_name', 'vendor_name'),
            ('ai_vendor_oib', 'vendor_oib'),
            ('ai_amount_gross', 'amount_gross'),
        ):
            upd[column] = fields.get(key)
        upd['ai_extracted'] = json.dumps(fields, ensure_ascii=False, default=str)
        upd['ocr_text'] = text[:50000]

        # If ocr_text column doesn't exist, drop it and retry
        try:
            update_done(uid, upd)
        except Exception as e:
            if 'ocr_text' not in str(e):
                raise
            upd.pop('ocr_text', None)
            update_done(uid, upd)
        print(f"[OCR] uid={uid} OK · vendor={fields.get('vendor_name','?')[:30]} · amt={fields.get('amount_gross','?')} · oib={fields.get('vendor_oib','?')}")
    except Exception as e:
        traceback.print_exc()
        fail(uid, str(e))
|
||||
|
||||
def main():
    """Poll for pending uploads forever and process them one at a time.

    Sleeps POLL seconds when nothing is pending (logging every 10th idle
    poll) and POLL*2 seconds after an unexpected error. Ctrl-C stops the
    loop.
    """
    print(f"[OCR worker] starting, poll every {POLL}s")
    idle = 0
    while True:
        try:
            row = claim_one()
            if not row:
                idle += 1
                if not idle % 10:
                    print(f"[OCR] idle x{idle}")
                time.sleep(POLL)
                continue
            process(row)
            idle = 0
        except KeyboardInterrupt:
            print('\n[OCR] shutdown')
            break
        except Exception as e:
            print('[OCR] loop error:', e)
            traceback.print_exc()
            time.sleep(POLL*2)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user