""" enrich_router.py — v3 enrichment + forensic scan Author: dradulic@outlook.com / damir@rinet.one Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3) POST /v2/enrich/{kind}/{eid} Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search, primary club URL if any), regex-extract candidate fields (web/email/ telefon), optionally synthesise descriptions via DeepSeek, and return a *preview* shape with `proposed` updates the operator can apply. POST /v2/enrich/{kind}/{eid}/apply Body shapes: None / {} → re-run preview, apply every proposed field {"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced) Performs UPDATE on the matching table, sets metadata.enriched_at and metadata.enrichment_source, writes a row to pgz_sport.enrichment_log, returns the after snapshot. GET /v2/enrich/log?kind=&target_id=&limit= Read recent enrichment-log entries. POST /v2/forensic/scan Search civic.persons by name, return entity links + findings + risk score. Kinds: klub | savez | sportas """ from __future__ import annotations import os, re, json, time, html, urllib.parse, urllib.request from datetime import datetime, timezone from typing import Any, Optional import psycopg2, psycopg2.extras from fastapi import APIRouter, HTTPException, Header, Body router = APIRouter() _pgh = os.environ.get('PG_HOST', '10.10.0.2') _pgp = int(os.environ.get('PG_PORT', '6432')) if _pgh in ('localhost', '127.0.0.1'): _pgh = os.environ.get('DB_HOST', '10.10.0.2') _pgp = int(os.environ.get('DB_PORT', '6432')) DB = dict(host=_pgh, port=_pgp, dbname=os.environ.get('PG_DB', 'rinet_v3'), user=os.environ.get('PG_USER', 'rinet'), password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7')) UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)' TIMEOUT = 6 # seconds — fail-soft # Optional JS-aware fallback (Playwright). Lazy-loaded, never required. 
import sys as _sys

_sys.path.insert(0, '/opt/pgz-sport')
try:
    from enrichment import playwright_scraper as _pw_scraper
    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
    # Any import problem (package missing, Playwright not installed, ...)
    # simply disables the JS-rendered fallback.
    _pw_scraper = None
    _HAS_PW = False

# Stdlib helpers needed only by the code below.
import contextlib  # noqa: E402
import unicodedata  # noqa: E402

DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL', 'https://api.deepseek.com/v1/chat/completions')


# ─── DB helpers ──────────────────────────────────────────────────────────

def _db():
    """Open a new psycopg2 connection (autocommit on) from the ``DB`` settings."""
    c = psycopg2.connect(**DB)
    c.autocommit = True
    return c


@contextlib.contextmanager
def _cursor(transactional: bool = False):
    """Yield a RealDictCursor on a fresh connection and always close it.

    psycopg2's ``with connection:`` only ends the transaction; it does not
    close the connection, so the previous ``with _db() as c`` pattern leaked
    one connection per request.

    Args:
        transactional: when true, autocommit is switched off so that all
            statements run in one transaction which is committed on normal
            exit. On an exception the connection is closed without a commit,
            which discards the pending work.
    """
    conn = _db()
    try:
        if transactional:
            conn.autocommit = False
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            yield cur
        if transactional:
            conn.commit()
    finally:
        conn.close()


def _fetch_one(sql, p):
    """Run ``sql`` with params ``p`` and return the first row as a dict, or None."""
    with _cursor() as cur:
        cur.execute(sql, p)
        r = cur.fetchone()
    return dict(r) if r else None


def _like_pattern(term: str) -> str:
    """Build a ``%term%`` ILIKE pattern with the user's own wildcards escaped.

    Backslash is PostgreSQL's default LIKE escape character, so ``%``/``_``
    typed by a user are matched literally instead of acting as wildcards.
    """
    escaped = term.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    return '%' + escaped + '%'


# ─── HTTP helpers ────────────────────────────────────────────────────────

def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch ``url`` and return at most 150 000 bytes decoded as text.

    Fail-soft: returns None for empty/non-HTTP URLs and on any fetch error.
    Bytes that are not valid UTF-8 are decoded as latin-1.
    """
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(150000)
    except Exception:
        # Deliberate best-effort: scraping must never break the endpoint.
        return None
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return data.decode('latin-1', 'ignore')


def _strip_tags(s: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text."""
    if not s:
        return ''
    # NOTE(review): the two patterns below were reconstructed — the tag names
    # had been stripped from the source; script/style removal is assumed.
    s = re.sub(r'<script[^>]*>.*?</script>', ' ', s, flags=re.S | re.I)
    s = re.sub(r'<style[^>]*>.*?</style>', ' ', s, flags=re.S | re.I)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = html.unescape(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def _extract_meta(html_doc: str, url: str) -> dict:
    """Pull ``title`` and ``description`` out of an HTML document.

    Returns ``{}`` for an empty document, otherwise a dict that always has
    ``url`` and ``fetched_at`` and, when found, ``title`` / ``description``.

    NOTE(review): everything after the title lookup was missing from the
    source; the description lookup is reconstructed from how callers read
    ``meta.get('description')``.
    """
    if not html_doc:
        return {}
    out = {'url': url, 'fetched_at': int(time.time())}
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m:
        out['title'] = html.unescape(m.group(1).strip())[:300]
    # Accept both attribute orders: name=… content=… and content=… name=…
    m = (re.search(r'<meta[^>]+(?:name|property)=["\'](?:og:)?description["\'][^>]*'
                   r'content=["\']([^"\']+)["\']', html_doc, re.I)
         or re.search(r'<meta[^>]+content=["\']([^"\']+)["\'][^>]*'
                      r'(?:name|property)=["\'](?:og:)?description["\']', html_doc, re.I))
    if m:
        out['description'] = html.unescape(m.group(1).strip())[:500]
    return out


def _fetch_title(url: str) -> Optional[dict]:
    """Fetch ``url`` and return its ``_extract_meta`` dict, or None on failure.

    NOTE(review): this helper is called by ``enrich_preview`` but its
    definition was missing from the source; the return shape is an
    assumption — confirm against the frontend's ``live_snippet`` usage.
    """
    body = _http_get(url)
    if not body:
        return None
    return _extract_meta(body, url) or None


# NOTE(review): the three patterns below were lost from the source (only the
# tail of RE_URL survived) and are reconstructed from their call sites.
RE_EMAIL = re.compile(r'[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}')
RE_PHONE = re.compile(r'(?<!\d)(?:\+385|00385|0)[\s\-/]?\d{1,2}(?:[\s\-/]?\d{2,4}){2,3}(?!\d)')
RE_URL = re.compile(r'https?://[^\s<>)\]]+', re.I)


def _find_email(text: str) -> Optional[str]:
    """Return the first plausible e-mail address in ``text`` (lower-cased)."""
    if not text:
        return None
    # Placeholder addresses, tracker/CDN hosts and "logo@2x.png"-style assets.
    bad = ('@example.', '@test.', '@email.', 'wixpress.com', 'sentry.io',
           'jquery.com', 'googleapis', '@2x.', 'noreply@')
    for m in RE_EMAIL.finditer(text):
        e = m.group(0).lower().rstrip('.,;:)')
        if not any(b in e for b in bad):
            return e
    return None


def _find_phone(text: str) -> Optional[str]:
    """Return the first phone-like match with 8–13 digits.

    Numbers written with the ``+385`` / ``00385`` prefix are normalised to
    start with ``+385 ``; anything else is returned with collapsed whitespace.
    """
    if not text:
        return None
    for m in RE_PHONE.finditer(text):
        raw = m.group(0).strip()
        digits = re.sub(r'\D', '', raw)
        if not (8 <= len(digits) <= 13):
            continue
        if raw.startswith('+385'):
            return '+385 ' + raw[4:].lstrip().lstrip('-/')
        if raw.startswith('00385'):
            return '+385 ' + raw[5:].lstrip().lstrip('-/')
        return re.sub(r'\s+', ' ', raw).strip()
    return None


def _find_official_web(text: str, hint: str = '') -> Optional[str]:
    """Pick the most likely official website URL mentioned in ``text``.

    URLs on aggregator/social/registry hosts are ignored. If ``hint`` (the
    entity name) is given, a URL whose host contains the first 8 alphanumeric
    characters of the hint is preferred; otherwise the first candidate wins.
    """
    if not text:
        return None
    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
               'rinet.one', 'pgz.hr')
    candidates: list[str] = []
    for m in RE_URL.finditer(text):
        u = m.group(0).rstrip('.,;:)\'"')
        try:
            host = urllib.parse.urlparse(u).hostname or ''
        except Exception:
            continue
        if not host or any(b in host for b in blocked):
            continue
        candidates.append(u)
    if not candidates:
        return None
    if hint:
        slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8]
        for u in candidates:
            host = urllib.parse.urlparse(u).hostname or ''
            if slug and slug in host.replace('-', '').replace('.', ''):
                return u
    return candidates[0]


# ─── External sources ────────────────────────────────────────────────────

def _wiki_summary(query: str) -> Optional[dict]:
    """Look up ``query`` via the Croatian Wikipedia REST summary endpoint.

    Returns a source dict (source/url/title/extract/description) or None when
    the page is missing, is a disambiguation page, or the request fails.
    """
    if not query:
        return None
    title = urllib.parse.quote(query.replace(' ', '_'), safe='')
    body = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{title}', timeout=5)
    if not body:
        return None
    try:
        d = json.loads(body)
        if d.get('type') == 'disambiguation' or 'extract' not in d:
            return None
        return {
            'source': 'wikipedia.hr',
            'url': d.get('content_urls', {}).get('desktop', {}).get('page'),
            'title': d.get('title'),
            'extract': d.get('extract'),
            'description': d.get('description'),
        }
    except Exception:
        return None


def _sport_pgz_js_fallback(query: str) -> Optional[dict]:
    """Try the optional Playwright scraper; None if unavailable or failing."""
    if not (_HAS_PW and _pw_scraper is not None):
        return None
    try:
        return _pw_scraper.scrape_sport_pgz_klub(query)
    except Exception:
        # Keep the whole enrichment pipeline fail-soft.
        return None


def _sport_pgz_search(query: str) -> Optional[dict]:
    """Search sport-pgz.hr for ``query`` and describe the first hit.

    Falls back to the JS-rendered scraper when the plain request fails or
    the result page has nothing parseable.
    """
    if not query:
        return None
    page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
    if not page:
        # Plain HTTP failed → try JS-rendered fallback if available.
        return _sport_pgz_js_fallback(query)
    # NOTE(review): both patterns were reconstructed (tag parts had been
    # stripped from the source). Group 1 = result URL, group 2 = link text,
    # which is what the code below reads.
    m = re.search(r'<h2[^>]*>.*?<a href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
                  page, re.S | re.I)
    if not m:
        m = re.search(r'<a href=["\']([^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
    if not m:
        # Search page rendered but yielded nothing parseable — try JS fallback.
        return _sport_pgz_js_fallback(query)
    # Resolve relative links against the site root so _http_get accepts them.
    hit = urllib.parse.urljoin('https://sport-pgz.hr/', html.unescape(m.group(1)))
    link_text = html.unescape(m.group(2).strip())
    body = _http_get(hit, timeout=6)
    if not body:
        return {'source': 'sport-pgz.hr', 'url': hit, 'title': link_text}
    text = _strip_tags(body)[:4000]
    meta = _extract_meta(body, hit)
    return {
        'source': 'sport-pgz.hr',
        'url': hit,
        'title': meta.get('title') or link_text,
        'extract': meta.get('description') or text[:500],
        'raw_text': text,
    }


def _fetch_primary_site(url: str) -> Optional[dict]:
    """Fetch the entity's own site and return it as a source dict, or None."""
    body = _http_get(url, timeout=6)
    if not body:
        return None
    text = _strip_tags(body)
    meta = _extract_meta(body, url)
    return {
        'source': urllib.parse.urlparse(url).hostname or url,
        'url': url,
        'title': meta.get('title'),
        'extract': meta.get('description') or text[:500],
        'raw_text': text[:8000],
    }


# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────

def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a short Croatian description built from ``evidence``.

    Returns None when no API key is configured, there is no evidence, or the
    request/response handling fails for any reason.
    """
    if not DEEPSEEK_KEY or not evidence:
        return None
    joined = "\n---\n".join(e for e in evidence if e)[:6000]
    if not joined.strip():
        return None
    prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
              f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
              f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")
    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 280,
        "temperature": 0.3,
    }
    req = urllib.request.Request(
        DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'),
        headers={'Authorization': 'Bearer ' + DEEPSEEK_KEY,
                 'Content-Type': 'application/json',
                 'User-Agent': UA},
        method='POST')
    try:
        with urllib.request.urlopen(req, timeout=20) as r:
            d = json.loads(r.read().decode('utf-8'))
        text = d.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
        return text or None
    except Exception:
        # Covers network errors and unexpected response shapes alike.
        return None


# ─── Row loaders & display name ─────────────────────────────────────────

def _load_row(kind: str, eid: int) -> dict:
    """Load the row for ``kind``/``eid``.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when the row is missing.
    """
    if kind == 'klub':
        row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                                   web, web_stranica, email, telefon, ciljevi,
                                   opis_djelatnosti, sjediste, godina_osnutka, savez_id,
                                   scrape_url, source_url, metadata
                            FROM pgz_sport.klubovi WHERE id=%s""", (eid,))
    elif kind == 'savez':
        row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email,
                                   telefon, web, adresa, godina_osnutka, source_url, metadata
                            FROM pgz_sport.savezi WHERE id=%s""", (eid,))
    elif kind == 'sportas':
        row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url, scrape_url,
                                   slika_url, source_url, hns_igrac_id, biografija, metadata
                            FROM pgz_sport.clanovi WHERE id=%s""", (eid,))
    else:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    if not row:
        raise HTTPException(404, kind + " not found")
    return row


def _display_name(kind: str, row: dict) -> str:
    """Return "ime prezime" for athletes, ``naziv`` for clubs and federations."""
    if kind == 'sportas':
        return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    return row.get('naziv', '') or ''


def _research_links(naziv, kind, grad=None):
    """Build the list of manual-research links shown next to a preview."""
    base_q = (naziv or '').strip()
    q = (base_q + ' ' + grad) if grad else base_q
    qenc = urllib.parse.quote(q)
    out = [
        {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
    ]
    if kind == 'klub':
        out.append({'label': 'Sportilus', 'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
        out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
    if kind == 'sportas':
        out.append({'label': 'HNS Semafor', 'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc})
        out.append({'label': 'transfermarkt', 'icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
    if kind == 'savez':
        out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
    return out


# ─── Proposal pipelines ─────────────────────────────────────────────────

def _deaccent(s: str) -> str:
    """Lower-case ``s`` and drop diacritics / non-ASCII characters."""
    return unicodedata.normalize('NFKD', s.lower()).encode('ascii', 'ignore').decode('ascii')


def _name_tokens(naziv: str) -> list[str]:
    """Significant tokens from entity name (≥4 chars, deaccented).

    Generic words (klub, savez, rijeka, ...) are dropped unless that would
    leave nothing, in which case all ≥4-char tokens are returned.
    """
    toks = [t for t in re.split(r'[^a-z0-9]+', _deaccent(naziv or '')) if len(t) >= 4]
    stop = {'klub', 'udruga', 'sportski', 'sport', 'kosarkaski', 'kosarka', 'nogometni',
            'rukometni', 'savez', 'rijeka', 'primorsko', 'goranski', 'grad', 'grada', 'centar'}
    return [t for t in toks if t not in stop] or toks


def _is_relevant(source: dict, tokens: list[str]) -> bool:
    """A source is 'relevant' only if the page actually mentions the entity name."""
    if not tokens:
        return True
    blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '')
    blob = _deaccent(blob)
    return any(t in blob for t in tokens)


def _source_text(source: dict) -> str:
    """Best available body text of a source: raw page text, else the extract."""
    return source.get('raw_text') or source.get('extract') or ''


def _gather_sources(naziv: str, primary: Optional[str]) -> list:
    """Collect source dicts in fixed order: primary site, Wikipedia, sport-pgz.hr."""
    found = [
        _fetch_primary_site(primary) if primary else None,
        _wiki_summary(naziv),
        _sport_pgz_search(naziv),
    ]
    return [s for s in found if s]


def _propose_contacts(row: dict, relevant: list, naziv: str) -> dict:
    """Propose web/email/telefon for fields the row does not have yet.

    Values are extracted ONLY from sources that actually mention the entity.
    """
    relevant_blob = '\n\n'.join(_source_text(s) for s in relevant)
    proposed: dict[str, Any] = {}
    if not row.get('web'):
        u = _find_official_web(relevant_blob, naziv)
        if u:
            proposed['web'] = u
    if not row.get('email'):
        e = _find_email(relevant_blob)
        if e:
            proposed['email'] = e
    if not row.get('telefon'):
        t = _find_phone(relevant_blob)
        if t:
            proposed['telefon'] = t
    return proposed


def _propose_for_klub(row: dict) -> dict:
    """Build proposed updates (contacts + description) for a club row."""
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
    sources = _gather_sources(naziv, primary)
    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    proposed = _propose_contacts(row, relevant, naziv)
    if not row.get('opis_djelatnosti'):
        # Prefer relevant sources; fall back to everything that was fetched.
        descr_evidence = [_source_text(s) for s in (relevant or sources)]
        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
        if not descr:
            for s in (relevant or sources):
                if s.get('extract') and len(s['extract']) >= 80:
                    descr = s['extract']
                    break
        if descr:
            proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}


def _propose_for_savez(row: dict) -> dict:
    """Build proposed contact updates for a federation row."""
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('source_url')
    sources = _gather_sources(naziv, primary)
    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    return {'proposed': _propose_contacts(row, relevant, naziv), 'sources': sources}


def _propose_for_sportas(row: dict) -> dict:
    """Build a proposed biography for an athlete row from Wikipedia HR."""
    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    sources, evidence = [], []
    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
        evidence.append(wiki.get('extract') or '')
    proposed: dict[str, Any] = {}
    if not row.get('biografija') and evidence:
        descr = _deepseek_describe(naziv, 'sportaš', evidence)
        if not descr and wiki:
            descr = wiki.get('extract')
        if descr:
            proposed['biografija'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}


def _propose(kind: str, row: dict) -> dict:
    """Dispatch to the proposal pipeline for ``kind`` (already validated)."""
    if kind == 'klub':
        return _propose_for_klub(row)
    if kind == 'savez':
        return _propose_for_savez(row)
    return _propose_for_sportas(row)


# ─── Endpoints ──────────────────────────────────────────────────────────

@router.post("/enrich/{kind}/{eid}")
def enrich_preview(kind: str, eid: int):
    """Preview enrichment: coverage stats, sources and proposed field values.

    Nothing is written to the database. Raises 400/404 via ``_load_row``.
    """
    row = _load_row(kind, eid)
    res = _propose(kind, row)
    if kind == 'klub':
        keys = ['oib', 'sport', 'grad', 'predsjednik', 'tajnik', 'web', 'email', 'telefon',
                'sjediste', 'godina_osnutka', 'ciljevi', 'opis_djelatnosti']
    elif kind == 'savez':
        keys = ['oib', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon', 'web', 'adresa', 'godina_osnutka']
    else:
        keys = ['sport', 'profile_url', 'slika_url', 'hns_igrac_id', 'biografija']
    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
               or row.get('scrape_url') or row.get('profile_url'))
    filled = sum(1 for k in keys if row.get(k))
    coverage = round(filled / len(keys) * 100)
    missing = [k for k in keys if not row.get(k)]
    proposed = res['proposed']
    current = {k: row.get(k) for k in proposed}
    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}
    return {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad),
        'sources': res['sources'],
        'current': current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
    }


# kind → (table, columns the apply endpoint is allowed to fill)
_TABLE_MAP = {
    'klub': ('pgz_sport.klubovi', {'web', 'email', 'telefon', 'predsjednik', 'tajnik',
                                   'opis_djelatnosti', 'ciljevi', 'godina_osnutka', 'sjediste'}),
    'savez': ('pgz_sport.savezi', {'web', 'email', 'telefon', 'predsjednik', 'tajnik', 'adresa', 'godina_osnutka'}),
    'sportas': ('pgz_sport.clanovi', {'biografija', 'profile_url', 'slika_url'}),
}


def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Fill empty whitelisted columns, stamp metadata and write the audit log.

    The row lock, UPDATE and log INSERT run in one transaction. Existing
    (truthy) column values are never overwritten; empty or non-scalar
    proposed values are skipped.

    Returns:
        ``{'applied': {...}, 'after': {...}}`` — applied fields and a
        snapshot of selected columns after the update.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when the row is missing.
    """
    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, allowed = _TABLE_MAP[kind]
    # Both arguments may come straight from a request body: tolerate bad shapes.
    fields = fields if isinstance(fields, dict) else {}
    sources = [s for s in sources if isinstance(s, dict)] if isinstance(sources, list) else []

    with _cursor(transactional=True) as cur:
        # `table` comes from _TABLE_MAP, never from user input.
        cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
        before = cur.fetchone()
        if not before:
            raise HTTPException(404, kind + " not found")
        before = dict(before)

        sets, params, applied = [], [], {}
        for k, v in fields.items():
            if k not in allowed:
                continue
            if v is None or isinstance(v, (dict, list)) or str(v).strip() == '':
                continue
            if before.get(k):
                continue  # never overwrite existing
            sets.append(f"{k} = %s")  # k is whitelisted above
            params.append(v)
            applied[k] = v

        # Work on copies so the `before` snapshot logged below keeps the
        # metadata as it was prior to this update.
        prev_meta = before.get('metadata')
        meta_in = dict(prev_meta) if isinstance(prev_meta, dict) else {}
        now_iso = datetime.now(timezone.utc).isoformat()
        source_names = [str(s['source']) for s in sources if s.get('source')]
        meta_in['enriched_at'] = now_iso
        meta_in['enrichment_source'] = source_names
        prev_history = meta_in.get('enrichment_history')
        history = list(prev_history) if isinstance(prev_history, list) else []
        history.append({
            'at': now_iso,
            'fields': list(applied),
            'sources': source_names,
            'urls': [s.get('url') for s in sources if s.get('url')],
            'user': user_email,
        })
        meta_in['enrichment_history'] = history[-10:]  # keep the last 10 runs
        sets.append("metadata = %s::jsonb")
        params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
        params.append(eid)

        cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
        after = dict(cur.fetchone())

        logged_keys = list(applied) + ['metadata']
        cur.execute(
            """INSERT INTO pgz_sport.enrichment_log
               (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
               VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
            (kind, eid,
             ','.join(source_names)[:120] if source_names else None,
             (sources[0].get('url') if sources else None),
             list(applied) or None,
             json.dumps({k: before.get(k) for k in logged_keys}, ensure_ascii=False, default=str),
             json.dumps({k: after.get(k) for k in logged_keys}, ensure_ascii=False, default=str),
             user_email))

    snap_keys = ('id', 'naziv', 'ime', 'prezime', 'web', 'email', 'telefon',
                 'opis_djelatnosti', 'biografija', 'metadata')
    return {'applied': applied, 'after': {k: after.get(k) for k in snap_keys if k in after}}


@router.post("/enrich/{kind}/{eid}/apply")
def enrich_apply(kind: str, eid: int, body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None)):
    """Apply enrichment to a row.

    With no (or empty) ``fields`` in the body the preview pipeline is re-run
    and every proposed field is applied; otherwise only the given fields are
    considered. NOTE(review): ``{"fields": {}}`` therefore also applies the
    full proposal — confirm this is what the UI expects.

    Raises:
        HTTPException: 400 when ``fields`` is present but not a JSON object,
            plus 400/404 from the row lookup.
    """
    body = body or {}
    fields = body.get('fields')
    sources = body.get('sources')
    if fields is not None and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if not fields:
        row = _load_row(kind, eid)
        res = _propose(kind, row)
        fields = res['proposed']
        sources = res['sources']
    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    return {'kind': kind, 'id': eid, **out}


@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """Return the most recent enrichment-log rows, optionally filtered.

    ``limit`` is clamped to 1..200 (0 means the default of 50).
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id is not None:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    params.append(max(1, min(int(limit or 50), 200)))
    with _cursor() as cur:
        cur.execute(sql, params)
        rows = [dict(r) for r in cur.fetchall()]
    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}


# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────

@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """Autocomplete suggestions for the Mreža search inputs.

    type ∈ {person, club, company, ''} — empty means all.
    Returns: {query, results: [{id, label, type, sub}]}
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))
    pattern = _like_pattern(q)
    out = []
    with _cursor() as cur:
        if type in ('', 'club'):
            cur.execute("""
                SELECT id, naziv AS label, sport, grad
                FROM pgz_sport.klubovi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'klub:' + str(r['id']), 'label': r['label'], 'type': 'club',
                            'sub': (r.get('sport') or '') + ' · ' + (r.get('grad') or '')})
            cur.execute("""
                SELECT id, naziv AS label, sport
                FROM pgz_sport.savezi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'savez:' + str(r['id']), 'label': r['label'], 'type': 'savez',
                            'sub': r.get('sport') or 'savez'})
        if type in ('', 'person'):
            cur.execute("""
                SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
                FROM pgz_sport.clanovi c
                LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
                ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
                LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                klub = r.get('klub_naziv')
                out.append({'id': 'sportas:' + str(r['id']),
                            'label': (r.get('ime') or '') + ' ' + (r.get('prezime') or ''),
                            'type': 'person',
                            'sub': (r.get('sport') or 'sportaš') + (' · ' + klub if klub else '')})
            cur.execute("""
                SELECT id, name AS label, function, oib, county
                FROM civic.persons
                WHERE name ILIKE %s
                ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'civic_person:' + str(r['id']), 'label': r['label'], 'type': 'person',
                            'sub': (r.get('function') or 'civic') + ' · ' + (r.get('county') or '')})
        if type in ('', 'company'):
            cur.execute("""
                SELECT id, name AS label, oib, city, entity_type
                FROM civic.entities
                WHERE name ILIKE %s
                ORDER BY length(name) LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'civic_entity:' + str(r['id']), 'label': r['label'], 'type': 'company',
                            'sub': (r.get('entity_type') or 'tvrtka') + ' · ' + (r.get('city') or '')})
    return {'query': q, 'results': out[:limit * 2]}


# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────

def _finding_person_candidates(finding: dict) -> list:
    """Derive candidate person names from a finding's entities or its title."""
    candidates = []
    ei = finding.get('entities_involved')
    if isinstance(ei, dict):
        for k in ('person', 'name', 'osoba', 'PEP', 'pep'):
            if ei.get(k):
                candidates.append(str(ei[k]))
        # Also try persons: [...] list
        persons = ei.get('persons') or ei.get('osobe') or []
        if isinstance(persons, (list, tuple)):
            for p in persons:
                if isinstance(p, dict) and p.get('name'):
                    candidates.append(str(p['name']))
                elif isinstance(p, str):
                    candidates.append(p)
    elif isinstance(ei, list):
        for it in ei:
            if isinstance(it, dict):
                for k in ('name', 'person', 'label'):
                    if it.get(k):
                        candidates.append(str(it[k]))
                        break
            elif isinstance(it, str):
                candidates.append(it)
    if not candidates and finding.get('title'):
        # Heuristic: extract first capitalised "Ime Prezime" pair
        m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', finding['title'])
        if m:
            candidates.append(m.group(0))
    return candidates


@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """Enrich a forensic finding with a Wikipedia HR summary and persist it.

    Looks up the finding, derives person-name candidates from
    entities_involved or the title, queries Wikipedia HR for the first three,
    stores the payload under raw_data.enrichment and fills ai_analysis with
    the extract only if ai_analysis is still NULL.

    Raises:
        HTTPException: 404 when the finding does not exist.
    """
    f = _fetch_one("""
        SELECT id, finding_type, severity, title, description,
               entities_involved, raw_data, ai_analysis
        FROM civic.forensic_findings WHERE id=%s
    """, (finding_id,))
    if not f:
        raise HTTPException(404, "finding not found")

    candidates = _finding_person_candidates(f)

    # Network lookups happen while no DB connection is held.
    wiki = None
    used_query = None
    for q in candidates[:3]:
        wiki = _wiki_summary(q)
        if wiki:
            used_query = q
            break

    # Build enrichment payload
    enrichment = {
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'enriched_at': datetime.now(timezone.utc).isoformat(),
    }
    # Persist into raw_data.enrichment
    raw = f.get('raw_data')
    if raw is None:
        raw = {}
    if not isinstance(raw, dict):
        raw = {'_legacy': raw}
    raw['enrichment'] = enrichment
    with _cursor() as cur:
        cur.execute("""
            UPDATE civic.forensic_findings
            SET raw_data = %s::jsonb,
                ai_analysis = COALESCE(ai_analysis, %s)
            WHERE id = %s
        """, (json.dumps(raw, default=str, ensure_ascii=False),
              (wiki or {}).get('extract'), finding_id))

    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }


# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────

@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """Search civic.persons by name and score each match.

    For every matched person (max 25) the entity links and forensic findings
    are attached and a 0–100 ``risk_score`` is computed from function, party,
    link count, finding count and CRITICAL/HIGH findings.

    Raises:
        HTTPException: 400 when ``name`` is shorter than 3 characters.
    """
    name = str(req.get('name') or '').strip()
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")
    findings_cols = "id, finding_type, severity, title, severity_score, created_at"
    with _cursor() as cur:
        cur.execute("""
            SELECT id, name, function, party, county, city, oib, trust_tier
            FROM civic.persons
            WHERE upper(name) ILIKE upper(%s)
            ORDER BY oib NULLS LAST, id LIMIT 25
        """, (_like_pattern(name),))
        persons = [dict(r) for r in cur.fetchall()]
        for p in persons:
            p['links'] = []
            p['findings'] = []
            if p.get('oib'):
                cur.execute("""
                    SELECT pel.entity_id, pel.roles,
                           e.name AS entity_name, e.oib AS entity_oib,
                           e.entity_type, e.city, e.risk_score
                    FROM civic.person_entity_links pel
                    LEFT JOIN civic.entities e ON e.id = pel.entity_id
                    WHERE pel.person_oib = %s LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]
                cur.execute(f"""
                    SELECT {findings_cols}
                    FROM civic.forensic_findings
                    WHERE entities_involved::text ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (_like_pattern(str(p['oib'])),))
                p['findings'] = [dict(r) for r in cur.fetchall()]
            if not p['findings']:
                # No OIB hit — fall back to a name match on title/description.
                by_name = _like_pattern(p['name'] or '')
                cur.execute(f"""
                    SELECT {findings_cols}
                    FROM civic.forensic_findings
                    WHERE title ILIKE %s OR description ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (by_name, by_name))
                p['findings'] = [dict(r) for r in cur.fetchall()]

    total_links = total_findings = crit_findings = 0
    for p in persons:
        links = p.get('links') or []
        findings = p.get('findings') or []
        severe = sum(1 for f in findings if f.get('severity') in ('CRITICAL', 'HIGH'))
        total_links += len(links)
        total_findings += len(findings)
        crit_findings += severe
        score = 0
        if (p.get('function') or '').strip():
            score += 30
        if (p.get('party') or '').strip():
            score += 15
        score += min(40, len(links) * 5)
        score += min(40, len(findings) * 10)
        score += 20 * severe
        p['risk_score'] = min(100, score)
    overall = max((p.get('risk_score', 0) for p in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall,
            'total_links': total_links, 'total_findings': total_findings,
            'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}