diff --git a/routers/enrich_router.py b/routers/enrich_router.py index 3cefb82..6e69f7b 100644 --- a/routers/enrich_router.py +++ b/routers/enrich_router.py @@ -1,310 +1,763 @@ """ -enrich_router.py — Round-2/3B enrichment + forensic-scan endpoints -Author: dradulic@outlook.com Date: 2026-05-04 (R2), 2026-05-05 (R3B) +enrich_router.py — v3 enrichment + forensic scan +Author: dradulic@outlook.com / damir@rinet.one +Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3) -Surfaces "Obogati podatke" buttons for klubovi, savezi, sportasi, plus -the Forenzika "Pokreni novu analizu" scan endpoint that searches civic.*. +POST /v2/enrich/{kind}/{eid} + Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search, + primary club URL if any), regex-extract candidate fields (web/email/ + telefon), optionally synthesise descriptions via DeepSeek, and return + a *preview* shape with `proposed` updates the operator can apply. -Strategy: - 1) Read what's already in DB and surface fields the frontend may not have shown. - 2) Build curated research URLs (Google, Wikipedia HR, Sportilus, sport-pgz.hr, - HNS Semafor) so the operator can verify or expand by hand. - 3) If the entity has a `web` URL set, quickly fetch the page and extract - + <meta description> to return as a "live snippet". 5s timeout, fail-soft. - 4) /forensic/scan — match name across civic.persons, return entity links, - forensic_findings hits, and a synthesised risk score. - 5) /enrich/{kind}/{id}/apply — fetch best web source for entity and UPDATE the - row's web/email/telefon fields when missing. +POST /v2/enrich/{kind}/{eid}/apply + Body shapes: + None / {} → re-run preview, apply every proposed field + {"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced) + Performs UPDATE on the matching table, sets metadata.enriched_at and + metadata.enrichment_source, writes a row to pgz_sport.enrichment_log, + returns the after snapshot. + +GET /v2/enrich/log?kind=&target_id=&limit= + Read recent enrichment-log entries. + +POST /v2/forensic/scan + Search civic.persons by name, return entity links + findings + risk score. + +Kinds: klub | savez | sportas """ -import os, re, json, time, urllib.parse, urllib.request, html +from __future__ import annotations +import os, re, json, time, html, urllib.parse, urllib.request +from datetime import datetime, timezone +from typing import Any, Optional + import psycopg2, psycopg2.extras -from fastapi import APIRouter, HTTPException, Body +from fastapi import APIRouter, HTTPException, Header, Body router = APIRouter() -_pgh = os.environ.get('PG_HOST','10.10.0.2') -_pgp = int(os.environ.get('PG_PORT','6432')) -# pgz-sport.service inherits PG_HOST=localhost:5432 from /opt/.env.rinet which is wrong -# (local PG is disabled). Force the Server B DSN if env says localhost. +_pgh = os.environ.get('PG_HOST', '10.10.0.2') +_pgp = int(os.environ.get('PG_PORT', '6432')) if _pgh in ('localhost', '127.0.0.1'): - _pgh = os.environ.get('DB_HOST','10.10.0.2') - _pgp = int(os.environ.get('DB_PORT','6432')) + _pgh = os.environ.get('DB_HOST', '10.10.0.2') + _pgp = int(os.environ.get('DB_PORT', '6432')) DB = dict(host=_pgh, port=_pgp, - dbname=os.environ.get('PG_DB','rinet_v3'), - user=os.environ.get('PG_USER','rinet'), - password=os.environ.get('PG_PASS','R1net2026!SecureDB#v7')) + dbname=os.environ.get('PG_DB', 'rinet_v3'), + user=os.environ.get('PG_USER', 'rinet'), + password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7')) -UA = 'pgz-sport-enrich/2.0' +UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)' +TIMEOUT = 6 # seconds — fail-soft +DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip() +DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL', + 'https://api.deepseek.com/v1/chat/completions') + + +# ─── DB helpers ────────────────────────────────────────────────────────── def _db(): c = psycopg2.connect(**DB); c.autocommit = True; return c def _fetch_one(sql, p): with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: - cur.execute(sql, p) - r = cur.fetchone() + cur.execute(sql, p); r = cur.fetchone() return dict(r) if r else None -def _fetch_title(url, timeout=5): - if not url: return None - try: - if not url.startswith('http'): - return None - req = urllib.request.Request(url, headers={'User-Agent': UA}) - with urllib.request.urlopen(req, timeout=timeout) as r: - data = r.read(40000).decode('utf-8','ignore') - title_m = re.search(r'<title[^>]*>([^<]+)', data, re.I) - desc_m = re.search(r' Optional[str]: + if not url: return None + if not url.startswith('http'): return None + try: + req = urllib.request.Request(url, headers={ + 'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'}) + with urllib.request.urlopen(req, timeout=timeout) as r: + data = r.read(150000) + try: return data.decode('utf-8') + except: return data.decode('latin-1', 'ignore') + except Exception: + return None + + +def _strip_tags(s: str) -> str: + if not s: return '' + s = re.sub(r']*>.*?', ' ', s, flags=re.S | re.I) + s = re.sub(r']*>.*?', ' ', s, flags=re.S | re.I) + s = re.sub(r'<[^>]+>', ' ', s) + s = html.unescape(s) + s = re.sub(r'\s+', ' ', s).strip() + return s + + +def _extract_meta(html_doc: str, url: str) -> dict: + if not html_doc: return {} + out = {'url': url, 'fetched_at': int(time.time())} + m = re.search(r']*>([^<]+)', html_doc, re.I) + if m: out['title'] = html.unescape(m.group(1).strip())[:300] + m = re.search(r')\]]+', re.I) + +def _find_email(text: str) -> Optional[str]: + if not text: return None + bad = ('@example.', '@test.', '@email.', 'wixpress.com', + 'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@') + seen = set() + for m in RE_EMAIL.finditer(text): + e = m.group(0).lower().rstrip('.,;:)') + if any(b in e for b in bad): continue + if e in seen: continue + seen.add(e); return e + return None + +def _find_phone(text: str) -> Optional[str]: + if not text: return None + for m in RE_PHONE.finditer(text): + raw = m.group(0).strip() + digits = re.sub(r'\D', '', raw) + if not (8 <= len(digits) <= 13): continue + cleaned = re.sub(r'\s+', ' ', raw).strip() + if raw.startswith('+385'): return '+385 ' + raw[4:].lstrip().lstrip('-/') + if raw.startswith('00385'): return '+385 ' + raw[5:].lstrip().lstrip('-/') + return cleaned + return None + +def _find_official_web(text: str, hint: str = '') -> Optional[str]: + if not text: return None + blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com', + 'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia', + 'sportilus.com', 'transfermarkt.com', 'wikidata.org', + 'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com', + 'rinet.one', 'pgz.hr') + candidates: list[str] = [] + for m in RE_URL.finditer(text): + u = m.group(0).rstrip('.,;:)\'"') + try: + host = urllib.parse.urlparse(u).hostname or '' + except Exception: + continue + if not host or any(b in host for b in blocked): continue + candidates.append(u) + if not candidates: return None + if hint: + slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8] + for u in candidates: + host = urllib.parse.urlparse(u).hostname or '' + if slug and slug in host.replace('-', '').replace('.', ''): + return u + return candidates[0] + + +# ─── External sources ──────────────────────────────────────────────────── +def _wiki_summary(query: str) -> Optional[dict]: + if not query: return None + title = urllib.parse.quote(query.replace(' ', '_'), safe='') + body = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{title}', timeout=5) + if not body: return None + try: + d = json.loads(body) + if d.get('type') == 'disambiguation' or 'extract' not in d: return None + return { + 'source': 'wikipedia.hr', + 'url': d.get('content_urls', {}).get('desktop', {}).get('page'), + 'title': d.get('title'), + 'extract': d.get('extract'), + 'description': d.get('description'), + } + except Exception: + return None + + +def _sport_pgz_search(query: str) -> Optional[dict]: + if not query: return None + page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6) + if not page: return None + m = re.search(r']*>.*?]*rel=["\']bookmark["\'][^>]*>([^<]+)', + page, re.S | re.I) + if not m: + m = re.search(r']*>([^<]{6,180})', page, re.I) + if not m: return None + hit = m.group(1) + body = _http_get(hit, timeout=6) + if not body: + return {'source': 'sport-pgz.hr', 'url': hit, 'title': html.unescape(m.group(2).strip())} + text = _strip_tags(body)[:4000] + meta = _extract_meta(body, hit) + return { + 'source': 'sport-pgz.hr', + 'url': hit, + 'title': meta.get('title') or html.unescape(m.group(2).strip()), + 'extract': meta.get('description') or text[:500], + 'raw_text': text, + } + + +def _fetch_primary_site(url: str) -> Optional[dict]: + body = _http_get(url, timeout=6) + if not body: return None + text = _strip_tags(body) + meta = _extract_meta(body, url) + return { + 'source': urllib.parse.urlparse(url).hostname or url, + 'url': url, + 'title': meta.get('title'), + 'extract': meta.get('description') or text[:500], + 'raw_text': text[:8000], + } + + +# ─── DeepSeek (optional, fail-soft) ───────────────────────────────────── +def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]: + if not DEEPSEEK_KEY or not evidence: return None + joined = "\n---\n".join(e for e in evidence if e)[:6000] + if not joined.strip(): return None + prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za " + f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. " + f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}") + payload = { + "model": "deepseek-chat", + "messages": [ + {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."}, + {"role": "user", "content": prompt}, + ], + "max_tokens": 280, "temperature": 0.3, + } + req = urllib.request.Request( + DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'), + headers={'Authorization': 'Bearer ' + DEEPSEEK_KEY, + 'Content-Type': 'application/json', + 'User-Agent': UA}, method='POST') + try: + with urllib.request.urlopen(req, timeout=20) as r: + d = json.loads(r.read().decode('utf-8')) + text = d.get('choices', [{}])[0].get('message', {}).get('content', '').strip() + return text or None + except Exception: + return None + + +# ─── Row loaders & display name ───────────────────────────────────────── +def _load_row(kind: str, eid: int) -> dict: if kind == 'klub': row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik, web, web_stranica, email, telefon, ciljevi, opis_djelatnosti, - sjediste, godina_osnutka, savez_id, scrape_url, source_url + sjediste, godina_osnutka, savez_id, scrape_url, source_url, + metadata FROM pgz_sport.klubovi WHERE id=%s""", (eid,)) elif kind == 'savez': row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web, - adresa, godina_osnutka, source_url + adresa, godina_osnutka, source_url, metadata FROM pgz_sport.savezi WHERE id=%s""", (eid,)) - else: # sportas + elif kind == 'sportas': row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url, scrape_url, - slika_url, source_url, hns_igrac_id, biografija + slika_url, source_url, hns_igrac_id, biografija, metadata FROM pgz_sport.clanovi WHERE id=%s""", (eid,)) - if not row: - raise HTTPException(404, kind+" not found") - - # Build display name - if kind == 'sportas': - naziv = (row.get('ime','') + ' ' + row.get('prezime','')).strip() - grad = None else: - naziv = row.get('naziv','') - grad = row.get('grad') if kind=='klub' else None + raise HTTPException(400, "kind must be klub|savez|sportas") + if not row: + raise HTTPException(404, kind + " not found") + return row - # Live web snippet from primary URL - primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url') - snippet = _fetch_title(primary) if primary else None - # Coverage score: how many key fields are filled? +def _display_name(kind: str, row: dict) -> str: + if kind == 'sportas': + return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip() + return row.get('naziv', '') or '' + + +def _research_links(naziv, kind, grad=None): + base_q = (naziv or '').strip() + q = (base_q + ' ' + grad) if grad else base_q + qenc = urllib.parse.quote(q) + out = [ + {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc}, + {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc}, + {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc}, + ] if kind == 'klub': - keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon','sjediste','godina_osnutka','ciljevi'] + out.append({'label': 'Sportilus', 'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc}) + out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'}) + if kind == 'sportas': + out.append({'label': 'HNS Semafor', 'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc}) + out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc}) + if kind == 'savez': + out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'}) + return out + + +# ─── Proposal pipelines ───────────────────────────────────────────────── +def _name_tokens(naziv: str) -> list[str]: + """Significant tokens from entity name (≥4 chars, deaccented).""" + import unicodedata + s = unicodedata.normalize('NFKD', naziv or '').encode('ascii', 'ignore').decode('ascii').lower() + toks = [t for t in re.split(r'[^a-z0-9]+', s) if len(t) >= 4] + stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni', + 'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'} + return [t for t in toks if t not in stop] or toks + + +def _is_relevant(source: dict, tokens: list[str]) -> bool: + """A source is 'relevant' only if the page actually mentions the entity name.""" + if not tokens: return True + import unicodedata + blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '') + blob = unicodedata.normalize('NFKD', blob.lower()).encode('ascii', 'ignore').decode('ascii') + return any(t in blob for t in tokens) + + +def _propose_for_klub(row: dict) -> dict: + naziv = row.get('naziv') or '' + primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') + sources, evidence = [], [] + pdoc = _fetch_primary_site(primary) if primary else None + if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '') + wiki = _wiki_summary(naziv) + if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '') + spz = _sport_pgz_search(naziv) + if spz: sources.append(spz); evidence.append(spz.get('raw_text') or spz.get('extract') or '') + + tokens = _name_tokens(naziv) + relevant = [s for s in sources if _is_relevant(s, tokens)] + relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant) + + proposed: dict[str, Any] = {} + # web/email/telefon: ONLY from sources actually mentioning the entity + if not row.get('web'): + u = _find_official_web(relevant_blob, naziv) + if u: proposed['web'] = u + if not row.get('email'): + e = _find_email(relevant_blob) + if e: proposed['email'] = e + if not row.get('telefon'): + t = _find_phone(relevant_blob) + if t: proposed['telefon'] = t + if not row.get('opis_djelatnosti'): + descr_evidence = [(s.get('raw_text') or s.get('extract') or '') for s in relevant] or evidence + descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence) + if not descr: + for s in (relevant or sources): + if s.get('extract') and len(s['extract']) >= 80: + descr = s['extract']; break + if descr: proposed['opis_djelatnosti'] = descr.strip()[:2000] + return {'proposed': proposed, 'sources': sources} + + +def _propose_for_savez(row: dict) -> dict: + naziv = row.get('naziv') or '' + primary = row.get('web') or row.get('source_url') + sources, evidence = [], [] + pdoc = _fetch_primary_site(primary) if primary else None + if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or '') + wiki = _wiki_summary(naziv) + if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '') + spz = _sport_pgz_search(naziv) + if spz: sources.append(spz); evidence.append(spz.get('raw_text') or '') + + tokens = _name_tokens(naziv) + relevant = [s for s in sources if _is_relevant(s, tokens)] + relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant) + + proposed: dict[str, Any] = {} + if not row.get('web'): + u = _find_official_web(relevant_blob, naziv) + if u: proposed['web'] = u + if not row.get('email'): + e = _find_email(relevant_blob) + if e: proposed['email'] = e + if not row.get('telefon'): + t = _find_phone(relevant_blob) + if t: proposed['telefon'] = t + return {'proposed': proposed, 'sources': sources} + + +def _propose_for_sportas(row: dict) -> dict: + naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip() + sources, evidence = [], [] + wiki = _wiki_summary(naziv) + if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '') + proposed: dict[str, Any] = {} + if not row.get('biografija') and evidence: + descr = _deepseek_describe(naziv, 'sportaš', evidence) + if not descr and wiki: descr = wiki.get('extract') + if descr: proposed['biografija'] = descr.strip()[:2000] + return {'proposed': proposed, 'sources': sources} + + +# ─── Endpoints ────────────────────────────────────────────────────────── +@router.post("/enrich/{kind}/{eid}") +def enrich_preview(kind: str, eid: int): + row = _load_row(kind, eid) + if kind == 'klub': res = _propose_for_klub(row) + elif kind == 'savez': res = _propose_for_savez(row) + else: res = _propose_for_sportas(row) + + if kind == 'klub': + keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon', + 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti'] elif kind == 'savez': keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka'] else: keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija'] - filled = sum(1 for k in keys if row.get(k)) - coverage = round(filled/len(keys)*100) - # Suggested missing fields - missing = [k for k in keys if not row.get(k)] + naziv = _display_name(kind, row) + grad = row.get('grad') if kind == 'klub' else None + primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url') + + filled = sum(1 for k in keys if row.get(k)) + coverage = round(filled / len(keys) * 100) + missing = [k for k in keys if not row.get(k)] + + proposed = res['proposed'] + current = {k: row.get(k) for k in proposed.keys()} + meta = row.get('metadata') or {} + if not isinstance(meta, dict): meta = {} return { - 'kind': kind, - 'id': eid, - 'naziv': naziv, - 'coverage': coverage, - 'filled_fields': filled, - 'total_fields': len(keys), + 'kind': kind, 'id': eid, 'naziv': naziv, + 'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys), 'missing_fields': missing, - 'live_snippet': snippet, + 'live_snippet': _fetch_title(primary) if primary else None, 'research_links': _research_links(naziv, kind, grad), + 'sources': res['sources'], + 'current': current, + 'proposed': proposed, + 'last_enriched_at': meta.get('enriched_at'), + 'last_enrichment_source': meta.get('enrichment_source'), 'enriched_at': int(time.time()), + 'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply', } -# ── R3B P4 — FORENSIC SCAN ────────────────────────────────────────── +_TABLE_MAP = { + 'klub': ('pgz_sport.klubovi', + {'web','email','telefon','predsjednik','tajnik', + 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste'}), + 'savez': ('pgz_sport.savezi', + {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}), + 'sportas': ('pgz_sport.clanovi', + {'biografija','profile_url','slika_url'}), +} + + +def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]): + if kind not in _TABLE_MAP: + raise HTTPException(400, "kind must be klub|savez|sportas") + table, allowed = _TABLE_MAP[kind] + + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,)) + before = cur.fetchone() + if not before: raise HTTPException(404, kind + " not found") + before = dict(before) + + sets, params, applied = [], [], {} + for k, v in (fields or {}).items(): + if k not in allowed: continue + if v is None or str(v).strip() == '': continue + if before.get(k): + continue # never overwrite existing + sets.append(f"{k} = %s") + params.append(v); applied[k] = v + + meta_in = before.get('metadata') or {} + if not isinstance(meta_in, dict): meta_in = {} + now_iso = datetime.now(timezone.utc).isoformat() + meta_in['enriched_at'] = now_iso + meta_in['enrichment_source'] = [s.get('source') for s in (sources or []) if s.get('source')] + history = meta_in.get('enrichment_history') or [] + history.append({ + 'at': now_iso, + 'fields': list(applied.keys()), + 'sources': meta_in['enrichment_source'], + 'urls': [s.get('url') for s in (sources or []) if s.get('url')], + 'user': user_email, + }) + meta_in['enrichment_history'] = history[-10:] + sets.append("metadata = %s::jsonb") + params.append(json.dumps(meta_in, ensure_ascii=False, default=str)) + + params.append(eid) + cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params) + after = dict(cur.fetchone()) + + cur.execute( + """INSERT INTO pgz_sport.enrichment_log + (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email) + VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""", + (kind, eid, + ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None, + (sources[0].get('url') if sources else None), + list(applied.keys()) or None, + json.dumps({k: before.get(k) for k in (list(applied.keys()) + ['metadata'])}, + ensure_ascii=False, default=str), + json.dumps({k: after.get(k) for k in (list(applied.keys()) + ['metadata'])}, + ensure_ascii=False, default=str), + user_email)) + + snap_keys = ('id','naziv','ime','prezime','web','email','telefon', + 'opis_djelatnosti','biografija','metadata') + return {'applied': applied, + 'after': {k: after.get(k) for k in snap_keys if k in after}} + + +@router.post("/enrich/{kind}/{eid}/apply") +def enrich_apply(kind: str, eid: int, + body: dict = Body(default=None), + x_user_email: Optional[str] = Header(default=None)): + body = body or {} + fields = body.get('fields') + sources = body.get('sources') + if not fields: + row = _load_row(kind, eid) + if kind == 'klub': res = _propose_for_klub(row) + elif kind == 'savez': res = _propose_for_savez(row) + else: res = _propose_for_sportas(row) + fields = res['proposed'] + sources = res['sources'] + out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email) + return {'kind': kind, 'id': eid, **out} + + +@router.get("/enrich/log") +def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50): + where, params = [], [] + if kind: where.append("kind=%s"); params.append(kind) + if target_id: where.append("target_id=%s"); params.append(target_id) + sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at " + "FROM pgz_sport.enrichment_log " + + ("WHERE " + " AND ".join(where) + " " if where else "") + + "ORDER BY id DESC LIMIT %s") + params.append(min(int(limit or 50), 200)) + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(sql, params) + rows = [dict(r) for r in cur.fetchall()] + for r in rows: + if r.get('created_at'): r['created_at'] = r['created_at'].isoformat() + return {'count': len(rows), 'rows': rows} + + +# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ─────────────────── +@router.get("/search/suggest") +def search_suggest(q: str = '', type: str = '', limit: int = 10): + """ + Autocomplete suggestions for the Mreža search inputs. + type ∈ {person, club, company, ''} — empty means all. + Returns: {query, results: [{id, label, type, sub}]} + """ + q = (q or '').strip() + if len(q) < 2: + return {'query': q, 'results': []} + limit = max(1, min(50, int(limit))) + out = [] + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + if type in ('', 'club'): + cur.execute(""" + SELECT id, naziv AS label, sport, grad + FROM pgz_sport.klubovi + WHERE naziv ILIKE %s AND aktivan=TRUE + ORDER BY length(naziv), naziv LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'klub:'+str(r['id']), 'label': r['label'], 'type':'club', + 'sub': (r.get('sport') or '')+' · '+(r.get('grad') or '')}) + cur.execute(""" + SELECT id, naziv AS label, sport + FROM pgz_sport.savezi + WHERE naziv ILIKE %s AND aktivan=TRUE + ORDER BY length(naziv), naziv LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'savez:'+str(r['id']), 'label': r['label'], 'type':'savez', + 'sub': r.get('sport') or 'savez'}) + if type in ('', 'person'): + cur.execute(""" + SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv + FROM pgz_sport.clanovi c + LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id + WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s + ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime + LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'sportas:'+str(r['id']), + 'label': (r.get('ime') or '')+' '+(r.get('prezime') or ''), + 'type':'person', + 'sub': (r.get('sport') or 'sportaš')+(r.get('klub_naziv') and ' · '+r['klub_naziv'] or '')}) + cur.execute(""" + SELECT id, name AS label, function, oib, county + FROM civic.persons + WHERE name ILIKE %s + ORDER BY oib NULLS LAST, length(name) LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'civic_person:'+str(r['id']), + 'label': r['label'], 'type':'person', + 'sub': (r.get('function') or 'civic')+' · '+(r.get('county') or '')}) + if type in ('', 'company'): + cur.execute(""" + SELECT id, name AS label, oib, city, entity_type + FROM civic.entities + WHERE name ILIKE %s + ORDER BY length(name) LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'civic_entity:'+str(r['id']), + 'label': r['label'], 'type':'company', + 'sub': (r.get('entity_type') or 'tvrtka')+' · '+(r.get('city') or '')}) + return {'query': q, 'results': out[:limit*2]} + + +# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ────────────── +@router.post("/forensic/findings/{finding_id}/enrich") +def enrich_forensic(finding_id: int): + """ + Look up the forensic finding, derive the PEP person name from + entities_involved or title, hit Wikipedia HR for a summary, and persist + the enriched payload into civic.forensic_findings.ai_analysis (or back into + raw_data.enrichment). + """ + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(""" + SELECT id, finding_type, severity, title, description, entities_involved, + raw_data, ai_analysis + FROM civic.forensic_findings WHERE id=%s + """, (finding_id,)) + f = cur.fetchone() + if not f: raise HTTPException(404, "finding not found") + f = dict(f) + + # Derive person name candidates + candidates = [] + if isinstance(f.get('entities_involved'), (list, dict)): + ei = f['entities_involved'] + if isinstance(ei, dict): + for k in ('person','name','osoba','PEP','pep'): + if ei.get(k): candidates.append(str(ei[k])) + # Also try persons: [...] list + for p in (ei.get('persons') or ei.get('osobe') or []): + if isinstance(p, dict) and p.get('name'): candidates.append(p['name']) + elif isinstance(p, str): candidates.append(p) + elif isinstance(ei, list): + for it in ei: + if isinstance(it, dict): + for k in ('name','person','label'): + if it.get(k): candidates.append(str(it[k])); break + elif isinstance(it, str): + candidates.append(it) + if not candidates and f.get('title'): + # Heuristic: extract first capitalised "Ime Prezime" pair + m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', f['title']) + if m: candidates.append(m.group(0)) + + wiki = None + used_query = None + for q in candidates[:3]: + wiki = _wiki_summary(q) + if wiki: + used_query = q + break + + # Build enrichment payload + enrichment = { + 'queried': candidates[:5], + 'used_query': used_query, + 'wiki': wiki, + 'enriched_at': datetime.now(timezone.utc).isoformat(), + } + + # Persist into raw_data.enrichment + raw = f.get('raw_data') + if raw is None: raw = {} + if not isinstance(raw, dict): raw = {'_legacy': raw} + raw['enrichment'] = enrichment + + cur.execute(""" + UPDATE civic.forensic_findings + SET raw_data = %s::jsonb, + ai_analysis = COALESCE(ai_analysis, %s) + WHERE id = %s + """, (json.dumps(raw, default=str, ensure_ascii=False), + (wiki or {}).get('extract'), + finding_id)) + c.commit() + + return { + 'finding_id': finding_id, + 'queried': candidates[:5], + 'used_query': used_query, + 'wiki': wiki, + 'persisted': True, + } + + +# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ─────────────────── @router.post("/forensic/scan") def forensic_scan(req: dict = Body(...)): - """ - Search civic.persons by name. For each match, gather entities, person - role, forensic_findings count, and synthesise a risk score. - Body: {"name": "Velimir Liverić"} - """ name = (req.get('name') or '').strip() if len(name) < 3: raise HTTPException(400, "name must be at least 3 chars") - with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute(""" SELECT id, name, function, party, county, city, oib, trust_tier FROM civic.persons WHERE upper(name) ILIKE upper(%s) - ORDER BY oib NULLS LAST, id - LIMIT 25 - """, ('%'+name+'%',)) + ORDER BY oib NULLS LAST, id LIMIT 25 + """, ('%' + name + '%',)) persons = [dict(r) for r in cur.fetchall()] - - # For each person collect entity links via OIB for p in persons: - p['links'] = [] - p['findings'] = [] + p['links'] = []; p['findings'] = [] if p.get('oib'): cur.execute(""" SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib, e.entity_type, e.city, e.risk_score FROM civic.person_entity_links pel LEFT JOIN civic.entities e ON e.id = pel.entity_id - WHERE pel.person_oib = %s - LIMIT 50 + WHERE pel.person_oib = %s LIMIT 50 """, (p['oib'],)) p['links'] = [dict(r) for r in cur.fetchall()] - # Forensic findings JSONB containing this OIB cur.execute(""" SELECT id, finding_type, severity, title, severity_score, created_at FROM civic.forensic_findings WHERE entities_involved::text ILIKE %s - ORDER BY severity_score DESC, created_at DESC - LIMIT 30 - """, ('%'+p['oib']+'%',)) + ORDER BY severity_score DESC, created_at DESC LIMIT 30 + """, ('%' + p['oib'] + '%',)) p['findings'] = [dict(r) for r in cur.fetchall()] - # Also search forensic_findings by name if not p['findings']: cur.execute(""" SELECT id, finding_type, severity, title, severity_score, created_at FROM civic.forensic_findings WHERE title ILIKE %s OR description ILIKE %s - ORDER BY severity_score DESC, created_at DESC - LIMIT 30 - """, ('%'+p['name']+'%', '%'+p['name']+'%')) + ORDER BY severity_score DESC, created_at DESC LIMIT 30 + """, ('%' + p['name'] + '%', '%' + p['name'] + '%')) p['findings'] = [dict(r) for r in cur.fetchall()] - - # Synthesise risk score per person and overall - total_links = 0 - total_findings = 0 - crit_findings = 0 + total_links = total_findings = crit_findings = 0 for p in persons: total_links += len(p.get('links') or []) for f in p.get('findings') or []: total_findings += 1 - if f.get('severity') in ('CRITICAL','HIGH'): - crit_findings += 1 - # per-person risk: 30 base if PEP-like (function set), +5 per link, +10 per finding, +20 per crit + if f.get('severity') in ('CRITICAL', 'HIGH'): crit_findings += 1 score = 0 - if (p.get('function') or '').strip(): - score += 30 - if (p.get('party') or '').strip(): - score += 15 - score += min(40, len(p.get('links') or [])*5) - score += min(40, len(p.get('findings') or [])*10) - score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL','HIGH')) + if (p.get('function') or '').strip(): score += 30 + if (p.get('party') or '').strip(): score += 15 + score += min(40, len(p.get('links') or []) * 5) + score += min(40, len(p.get('findings') or []) * 10) + score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL', 'HIGH')) p['risk_score'] = min(100, score) - - overall = 0 - if persons: - overall = max(p.get('risk_score',0) for p in persons) - - return { - 'query': name, - 'matched_persons': len(persons), - 'overall_risk_score': overall, - 'total_links': total_links, - 'total_findings': total_findings, - 'critical_findings': crit_findings, - 'persons': persons, - 'scanned_at': int(time.time()), - } - - -# ── R3B P6 — ENRICH /apply (write enriched fields back to DB) ─────── -@router.post("/enrich/{kind}/{eid}/apply") -def enrich_apply(kind: str, eid: int, req: dict = Body(default={})): - """ - Apply enrichment to DB. Body may contain {fields: {web, email, telefon}} - to override the auto-derived suggestions; otherwise we apply derived ones. - Only updates fields that are currently NULL or empty in DB (additive only). - """ - if kind not in ('klub','savez','sportas'): - raise HTTPException(400, "kind must be klub|savez|sportas") - body_fields = (req.get('fields') if isinstance(req, dict) else {}) or {} - - if kind == 'klub': - table = 'pgz_sport.klubovi' - cols = ['web','email','telefon'] - elif kind == 'savez': - table = 'pgz_sport.savezi' - cols = ['web','email','telefon'] - else: - table = 'pgz_sport.clanovi' - cols = ['biografija','profile_url'] - - with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: - cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,)) - row = cur.fetchone() - if not row: raise HTTPException(404, kind+" not found") - row = dict(row) - - # Try a live fetch from primary URL to glean email/phone - primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url') - derived = {} - if primary: - snippet = _fetch_title(primary, timeout=6) - try: - if snippet and snippet.get('url'): - req2 = urllib.request.Request(primary, headers={'User-Agent': UA}) - with urllib.request.urlopen(req2, timeout=6) as r: - page = r.read(80000).decode('utf-8','ignore') - em = re.search(r'[\w\.-]+@[\w\.-]+\.[a-z]{2,8}', page, re.I) - if em: derived['email'] = em.group(0) - tel = re.search(r'\+?385[\s\-]?\d[\d\s\-/]{6,}', page) - if tel: derived['telefon'] = re.sub(r'\s+', ' ', tel.group(0).strip()) - except Exception: - pass - - # Merge: body fields override derived - proposed = dict(derived) - for k, v in (body_fields or {}).items(): - if k in cols and v: - proposed[k] = v - - # Only apply where DB currently empty - applied = {} - for k, v in proposed.items(): - if k in cols and (row.get(k) is None or row.get(k)==''): - applied[k] = v - - if applied: - sets = ', '.join([f"{k}=%s" for k in applied]) - params = list(applied.values()) + [eid] - cur.execute(f"UPDATE {table} SET {sets} WHERE id=%s", params) - c.commit() - - return { - 'kind': kind, 'id': eid, - 'proposed': proposed, - 'applied': applied, - 'skipped_existing': [k for k in proposed if k not in applied], - 'applied_at': int(time.time()), - } + overall = max((p.get('risk_score', 0) for p in persons), default=0) + return {'query': name, 'matched_persons': len(persons), + 'overall_risk_score': overall, 'total_links': total_links, + 'total_findings': total_findings, 'critical_findings': crit_findings, + 'persons': persons, 'scanned_at': int(time.time())}