.*?

(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S) if m: from datetime import date as _date try: out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat() except Exception: pass m = re.search(r'
([^<]+)
', html_doc) if m: out['mjesto_rodenja'] = m.group(1).strip() m = re.search(r'
(\d+)
', html_doc) if m: out['broj_dresa'] = int(m.group(1)) m = re.search(r' Optional[dict]: body = _http_get(url, timeout=8) if not body: # Try Playwright fallback if _HAS_PW and _pw_scraper is not None: r = _pw_scraper.fetch_rendered(url, timeout_ms=15000) if r and r.get('html_len', 0) > 2000: # We didn't store html in fetch_rendered — re-fetch text only is enough # but we need html for parse. Do a simple HTTP retry with longer timeout. body = _http_get(url, timeout=15) return _parse_hns_player(body, url) if body else None # ─── Generic sport-federation scraper ─────────────────────────────────── def _fed_url_from_row(row: dict) -> Optional[str]: """If the row already points to a federation profile (source_url / profile_url on a known fed host), return it.""" feds, _, _ = _load_sport_feds() fed_hosts = set() for entry in feds.values(): if not isinstance(entry, dict): continue for which in ('national', 'pgz'): sub = entry.get(which) or {} for k in ('url', 'search_url', 'profile_url_pattern'): v = sub.get(k) if v: try: h = urllib.parse.urlparse(v.replace('{q}', 'x').replace('{slug}', 'x').replace('{hns_pid}', '1')).hostname if h: fed_hosts.add(h) except Exception: pass for k in ('source_url', 'profile_url'): u = row.get(k) if not u: continue try: h = urllib.parse.urlparse(u).hostname or '' except Exception: continue if h in fed_hosts: return u return None def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]: """Best-effort parser for a generic sport-federation profile page. Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub, extract, raw_text}. Tolerant of varied page structures. """ if not html_doc: return None host = urllib.parse.urlparse(url).hostname or '' out: dict[str, Any] = { 'source': host, 'url': url, } # Title m = re.search(r']>([^<]+)', html_doc, re.I) if m: out['title'] = html.unescape(m.group(1).strip())[:300] # Meta description m = re.search(r'= 3: name_tokens.append(re.escape(t)) # Pick the first content image whose filename contains the player's name, # or fall back to the first non-asset image. img_candidates = re.findall(r']+src=["\']([^"\']+)["\']', html_doc, re.I) chosen_img = None for src in img_candidates: low = src.lower() if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader', 'sprite', '/themes/', '/icons/', 'gdpr', 'banner', 'header', 'footer', 'placeholder', 'avatar-default')): continue if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')): continue # Prefer matches on player name in URL if name_tokens and any(re.search(t, src, re.I) for t in name_tokens): chosen_img = src; break if chosen_img is None: chosen_img = src if chosen_img: if not chosen_img.startswith('http'): chosen_img = urllib.parse.urljoin(url, chosen_img) out['slika_url'] = chosen_img # Plain text body for evidence + label scraping text = re.sub(r']>.?', ' ', html_doc, flags=re.S | re.I) text = re.sub(r']>.?', ' ', text, flags=re.S | re.I) text = re.sub(r'<[^>]+>', ' ', text) text = html.unescape(re.sub(r'\s+', ' ', text)).strip() out['raw_text'] = text[:4000] out['extract'] = (out.get('description') or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500] or text[:500]) # Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …") m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I) if m: try: from datetime import date as _date d = re.split(r'[.\-/]', m.group(1)) out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat() except Exception: pass if 'datum_rodenja' not in out: m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I) if m: try: from datetime import date as _date out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat() except Exception: pass m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text) if m: out['mjesto_rodenja'] = m.group(1).strip() m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I) if m: out['klub_naziv'] = m.group(1).strip().rstrip('.') return out def _slugify_simple(s: str) -> str: import unicodedata s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower() return re.sub(r'[^a-z0-9]+', '-', s).strip('-') def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]: """Try to find and parse the athlete's federation profile page.""" fed = _sport_fed(sport) if sport else None if not fed: return None nat = (fed or {}).get('national') or {} full_name = (ime + ' ' + prezime).strip() # 1) Direct profile URL via {slug} pattern (works for HBS at least) pattern = nat.get('profile_url_pattern') if pattern and '{slug}' in pattern: slug = _slugify_simple(full_name) url = pattern.replace('{slug}', slug) body = _http_get(url, timeout=8) if body and prezime.lower() in body.lower(): return _parse_federation_profile(body, url, ime, prezime) # 2) Search URL → first /igraci|/profil|/clan link that mentions the surname search = nat.get('search_url') if search: body = _http_get(search.replace('{q}', urllib.parse.quote(full_name)), timeout=10) if body: for href_re in (r'href="([^"]?/igraci/[^"]+)"', r'href="([^"]?/igrac/[^"]+)"', r'href="([^"]?/sportasi/[^"]+)"', r'href="([^"]?/clanovi/[^"]+)"', r'href="([^"]?/profil/[^"]+)"'): for m in re.finditer(href_re, body, re.I): cand = m.group(1) if not cand.startswith('http'): cand = urllib.parse.urljoin(nat.get('url', search), cand) if _slugify_simple(prezime) in _slugify_simple(cand): b2 = _http_get(cand, timeout=8) if b2: return _parse_federation_profile(b2, cand, ime, prezime) return None def _propose_for_sportas(row: dict) -> dict: naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip() ime, prezime = (row.get('ime') or ''), (row.get('prezime') or '') sport = row.get('sport') sources, evidence = [], [] proposed: dict[str, Any] = {} # 1) HNS Semafor — only meaningful when sport is football OR row already # carries an HNS link. hns_doc: Optional[dict] = None if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row): hns_url = _hns_url_from_row(row) if hns_url: hns_doc = _hns_fetch_player(hns_url) if hns_doc: sources.append(hns_doc) evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '') # 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing # source_url/profile_url if it points at a known federation host. fed_doc: Optional[dict] = None direct_fed_url = _fed_url_from_row(row) if direct_fed_url and (not hns_doc or hns_doc.get('url') != direct_fed_url): body = _http_get(direct_fed_url, timeout=8) if body: fed_doc = _parse_federation_profile(body, direct_fed_url, ime, prezime) if not fed_doc: fed_doc = scrape_sport_federation(sport, ime, prezime) if fed_doc: sources.append(fed_doc) evidence.append(fed_doc.get('raw_text') or fed_doc.get('extract') or '') # Helper: pick from hns_doc first then fed_doc def _pick(field): if hns_doc and hns_doc.get(field): return hns_doc[field] if fed_doc and fed_doc.get(field): return fed_doc[field] return None if not row.get('profile_url'): v = _pick('url') or (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url')) if v: proposed['profile_url'] = v if not row.get('source_url'): v = (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url')) if v: proposed['source_url'] = v if not row.get('slika_url'): v = _pick('slika_url') if v: proposed['slika_url'] = v if not row.get('hns_igrac_id') and hns_doc and hns_doc.get('hns_igrac_id'): proposed['hns_igrac_id'] = hns_doc['hns_igrac_id'] if not row.get('datum_rodenja'): v = _pick('datum_rodenja') if v: proposed['datum_rodenja'] = v if not row.get('mjesto_rodenja'): v = _pick('mjesto_rodenja') if v: proposed['mjesto_rodenja'] = v if not row.get('broj_dresa') and hns_doc and hns_doc.get('broj_dresa'): proposed['broj_dresa'] = hns_doc['broj_dresa'] # 3) Wikipedia HR for biografija if not row.get('biografija'): wiki = _wiki_summary(naziv) if wiki: sources.append(wiki) evidence.append(wiki.get('extract') or '') # Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet if not row.get('biografija'): descr = _deepseek_describe(naziv, f'sportaš ({sport})' if sport else 'sportaš', evidence) if evidence else None if not descr: for s in sources: ext = s.get('extract') if ext and len(ext) >= 80: descr = ext; break if descr: proposed['biografija'] = descr.strip()[:2000] return {'proposed': proposed, 'sources': sources} # ─── Endpoints ────────────────────────────────────────────────────────── # ─── R4 — POST /v2/enrich/forensic/{finding_id} ───────────────────────── def _extract_pep_name(finding: dict) -> Optional[str]: """Pull the primary person name from a forensic_findings row.""" title = (finding.get('title') or '').strip() desc = (finding.get('description') or '').strip() payload = finding.get('raw_data') or {} if isinstance(payload, str): try: payload = json.loads(payload) except Exception: payload = {} if isinstance(payload, dict): for k in ('person_name', 'name', 'osoba'): v = payload.get(k) if v: return str(v).strip() # Try entities_involved.entity_name ents = finding.get('entities_involved') or [] if isinstance(ents, str): try: ents = json.loads(ents) except Exception: ents = [] if isinstance(ents, list): for e in ents: if isinstance(e, dict) and e.get('person_name'): return str(e['person_name']).strip() if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''): # Some entries store person names as entity_name when entity_type='person' if (e.get('entity_type') or '').lower() in ('person','osoba'): return str(e['entity_name']).strip() # Fallback: extract a "Ime Prezime" from the title m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc) if m: return f"{m.group(1)} {m.group(2)}" return None def _gather_pep_evidence(name: str) -> list[dict]: sources: list[dict] = [] wiki = _wiki_summary(name) if wiki: sources.append(wiki) # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs) ddg = 'https://html.duckduckgo.com/html/?q=' + urllib.parse.quote(f'"{name}" PGŽ Hrvatska') page = _http_get(ddg, timeout=8) if page: # First result block m = re.search(r']+class="result__a"[^>]+href="([^"]+)"[^>]>([^<]{6,200})', page) snippet_m = re.search(r']+class="result__snippet"[^>]>(.?)', page, re.S) if m: sources.append({ 'source': 'duckduckgo', 'url': html.unescape(m.group(1))[:500], 'title': html.unescape(m.group(2)).strip()[:300], 'extract': re.sub(r'<[^>]+>', ' ', snippet_m.group(1)).strip()[:600] if snippet_m else None, }) return sources def _related_entities_for_pep(name: str) -> list[dict]: """Pull civic.persons + their entity links so we have the structured graph.""" out: list[dict] = [] with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute("""SELECT id, name, function, party, county, city, oib, trust_tier FROM civic.persons WHERE upper(name) ILIKE upper(%s) ORDER BY oib NULLS LAST, id LIMIT 10""", ('%'+name+'%',)) for p in cur.fetchall(): p = dict(p) entry = { 'kind': 'person', 'person_id': p['id'], 'person_name': p['name'], 'function': p.get('function'), 'party': p.get('party'), 'county': p.get('county'), 'city': p.get('city'), 'oib': p.get('oib'), 'trust_tier': p.get('trust_tier'), 'entities': [], } if p.get('oib'): cur.execute("""SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib, e.entity_type, e.city, e.risk_score FROM civic.person_entity_links pel LEFT JOIN civic.entities e ON e.id = pel.entity_id WHERE pel.person_oib=%s LIMIT 30""", (p['oib'],)) for r in cur.fetchall(): entry['entities'].append(dict(r)) out.append(entry) return out @router.post("/enrich/forensic/{finding_id}") def enrich_forensic_v2(finding_id: int, body: dict = Body(default=None), x_user_email: Optional[str] = Header(default=None), x_user_id: Optional[int] = Header(default=None)): """Enrich a forensic finding: gather Wiki + DDG snippets + civic graph, write back to civic.forensic_findings.related_entities, and seal the payload hash on Polygon (or queue for sealing). """ body = body or {} explicit_name = (body.get('name') or '').strip() or None with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute("""SELECT id, finding_type, severity, title, description, entities_involved, raw_data, related_entities, enrichment_metadata FROM civic.forensic_findings WHERE id=%s""", (finding_id,)) finding = cur.fetchone() if not finding: raise HTTPException(404, "finding not found") finding = dict(finding) name = explicit_name or _extract_pep_name(finding) if not name: raise HTTPException(400, "could not derive a person/entity name; pass {name: \"…\"}") sources = _gather_pep_evidence(name) related = _related_entities_for_pep(name) payload = { 'finding_id': finding_id, 'name': name, 'sources': [{'source': s.get('source'), 'url': s.get('url'), 'title': s.get('title')} for s in sources], 'related_entities': related, 'enriched_at': datetime.now(timezone.utc).isoformat(), } # Persist back to the finding enrichment_meta = finding.get('enrichment_metadata') or {} if not isinstance(enrichment_meta, dict): enrichment_meta = {} history = enrichment_meta.get('history') or [] history.append({ 'at': payload['enriched_at'], 'sources': payload['sources'], 'related_count': len(related), 'user': x_user_email, }) enrichment_meta['history'] = history[-10:] enrichment_meta['enriched_at'] = payload['enriched_at'] enrichment_meta['enriched_by'] = x_user_email or 'system' enrichment_meta['source_count'] = len(sources) with _db() as c, c.cursor() as cur: cur.execute("""UPDATE civic.forensic_findings SET related_entities = %s::jsonb, enrichment_metadata = %s::jsonb WHERE id=%s RETURNING id""", (json.dumps(related, default=str, ensure_ascii=False), json.dumps(enrichment_meta, default=str, ensure_ascii=False), finding_id)) cur.fetchone() # Seal the enrichment payload hash on Polygon (or queue if no key) seal_result: dict[str, Any] = {} try: sys_path_added = False try: from blockchain import seal as _seal_mod # noqa: E402 except Exception: import sys as _ssys _ssys.path.insert(0, '/opt/pgz-sport') from blockchain import seal as _seal_mod # noqa: E402 sys_path_added = True del sys_path_added # silence linters h = _seal_mod.hash_payload(payload) seal_result = _seal_mod.seal_to_polygon( data_hash=h, ref_id=str(finding_id), action='forensic.enriched', ref_type='forensic_finding', payload=payload, user_id=x_user_id, user_email=x_user_email, ) except Exception as e: seal_result = {'error': f'{type(e).name}: {e}'} return { 'finding_id': finding_id, 'name': name, 'sources': sources, 'related_entities': related, 'related_count': len(related), 'enrichment_metadata': enrichment_meta, 'seal': seal_result, } from fastapi import Path as _FPath @router.post("/enrich/{kind:str}/{eid:int}") def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid: int = 0): row = _load_row(kind, eid) if kind == 'klub': res = _propose_for_klub(row) elif kind == 'savez': res = _propose_for_savez(row) else: res = _propose_for_sportas(row) if kind == 'klub': keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon', 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti'] elif kind == 'savez': keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka'] else: keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija', 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg', 'dominantna_noga','oib'] naziv = _display_name(kind, row) grad = row.get('grad') if kind == 'klub' else None primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url') filled = sum(1 for k in keys if row.get(k)) coverage = round(filled / len(keys) 100) missing = [k for k in keys if not row.get(k)] proposed = res['proposed'] current = {k: row.get(k) for k in proposed.keys()} meta = row.get('metadata') or {} if not isinstance(meta, dict): meta = {} return { 'kind': kind, 'id': eid, 'naziv': naziv, 'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys), 'missing_fields': missing, 'live_snippet': _fetch_title(primary) if primary else None, 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row), 'sport': row.get('sport'), 'sport_federation': (lambda f: { 'national': (f.get('national') or {}).get('name') if f else None, 'national_url': (f.get('national') or {}).get('url') if f else None, 'pgz': (f.get('pgz') or {}).get('name') if f else None, })(_sport_fed(row.get('sport'))), 'sources': res['sources'], 'current': current, 'proposed': proposed, 'last_enriched_at': meta.get('enriched_at'), 'last_enrichment_source': meta.get('enrichment_source'), 'enriched_at': int(time.time()), 'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply', } _TABLE_MAP = { 'klub': ('pgz_sport.klubovi', {'web','email','telefon','predsjednik','tajnik', 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}), 'savez': ('pgz_sport.savezi', {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}), 'sportas': ('pgz_sport.clanovi', {'biografija','profile_url','source_url','slika_url','hns_igrac_id', 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm', 'tezina_kg','dominantna_noga','oib'}), } def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]): if kind not in _TABLE_MAP: raise HTTPException(400, "kind must be klub|savez|sportas") table, allowed = _TABLE_MAP[kind] with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,)) before = cur.fetchone() if not before: raise HTTPException(404, kind + " not found") before = dict(before) sets, params, applied = [], [], {} for k, v in (fields or {}).items(): if k not in allowed: continue if v is None or str(v).strip() == '': continue if before.get(k): continue # never overwrite existing sets.append(f"{k} = %s") params.append(v); applied[k] = v meta_in = before.get('metadata') or {} if not isinstance(meta_in, dict): meta_in = {} now_iso = datetime.now(timezone.utc).isoformat() meta_in['enriched_at'] = now_iso meta_in['enrichment_source'] = [s.get('source') for s in (sources or []) if s.get('source')] history = meta_in.get('enrichment_history') or [] history.append({ 'at': now_iso, 'fields': list(applied.keys()), 'sources': meta_in['enrichment_source'], 'urls': [s.get('url') for s in (sources or []) if s.get('url')], 'user': user_email, }) meta_in['enrichment_history'] = history[-10:] sets.append("metadata = %s::jsonb") params.append(json.dumps(meta_in, ensure_ascii=False, default=str)) params.append(eid) try: cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING ", params) after = dict(cur.fetchone()) except psycopg2.errors.UniqueViolation as _uve: # Race condition — fetch existing row instead conn.rollback() cur.execute(f"SELECT FROM {table} WHERE id=%s", (eid,)) row = cur.fetchone() after = dict(row) if row else {} import logging as _lg _lg.getLogger("enrich").info(f"UniqueViolation race avoided table={table} id={eid}") cur.execute( """INSERT INTO pgz_sport.enrichment_log (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email) VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""", (kind, eid, ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None, (sources[0].get('url') if sources else None), list(applied.keys()) or None, json.dumps({k: before.get(k) for k in (list(applied.keys()) + ['metadata'])}, ensure_ascii=False, default=str), json.dumps({k: after.get(k) for k in (list(applied.keys()) + ['metadata'])}, ensure_ascii=False, default=str), user_email)) snap_keys = ('id','naziv','ime','prezime','web','email','telefon', 'opis_djelatnosti','biografija','metadata') return {'applied': applied, 'after': {k: after.get(k) for k in snap_keys if k in after}} @router.post("/enrich/{kind:str}/{eid:int}/apply") def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid: int = 0, body: dict = Body(default=None), x_user_email: Optional[str] = Header(default=None), x_user_id: Optional[int] = Header(default=None)): body = body or {} fields = body.get('fields') sources = body.get('sources') if not fields: row = _load_row(kind, eid) if kind == 'klub': res = _propose_for_klub(row) elif kind == 'savez': res = _propose_for_savez(row) else: res = _propose_for_sportas(row) fields = res['proposed'] sources = res['sources'] out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email) applied = out.get('applied') or {} # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events try: from audit_seal_router import audit_log as _audit_log if applied: _audit_log( action='enrich.apply', target_type=kind, target_id=eid, payload={'applied': applied, 'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]}, user_id=x_user_id, user_email=x_user_email, ) except Exception: pass return { 'status': 'success' if applied else 'no_changes', 'kind': kind, 'id': eid, 'applied_count': len(applied), 'applied_fields': list(applied.keys()), **out, } @router.get("/enrich/log") def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50): where, params = [], [] if kind: where.append("kind=%s"); params.append(kind) if target_id: where.append("target_id=%s"); params.append(target_id) sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at " "FROM pgz_sport.enrichment_log " + ("WHERE " + " AND ".join(where) + " " if where else "") + "ORDER BY id DESC LIMIT %s") params.append(min(int(limit or 50), 200)) with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute(sql, params) rows = [dict(r) for r in cur.fetchall()] for r in rows: if r.get('created_at'): r['created_at'] = r['created_at'].isoformat() return {'count': len(rows), 'rows': rows} # ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ─────────────────── @router.get("/search/suggest") def search_suggest(q: str = '', type: str = '', limit: int = 10): """ Autocomplete suggestions for the Mreža search inputs. type ∈ {person, club, company, ''} — empty means all. Returns: {query, results: [{id, label, type, sub}]} """ q = (q or '').strip() if len(q) < 2: return {'query': q, 'results': []} limit = max(1, min(50, int(limit))) out = [] with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: if type in ('', 'club'): cur.execute(""" SELECT id, naziv AS label, sport, grad FROM pgz_sport.klubovi WHERE naziv ILIKE %s AND aktivan=TRUE ORDER BY length(naziv), naziv LIMIT %s """, ('%'+q+'%', limit)) for r in cur.fetchall(): out.append({'id':'klub:'+str(r['id']), 'label': r['label'], 'type':'club', 'sub': (r.get('sport') or '')+' · '+(r.get('grad') or '')}) cur.execute(""" SELECT id, naziv AS label, sport FROM pgz_sport.savezi WHERE naziv ILIKE %s AND aktivan=TRUE ORDER BY length(naziv), naziv LIMIT %s """, ('%'+q+'%', limit)) for r in cur.fetchall(): out.append({'id':'savez:'+str(r['id']), 'label': r['label'], 'type':'savez', 'sub': r.get('sport') or 'savez'}) if type in ('', 'person'): cur.execute(""" SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime LIMIT %s """, ('%'+q+'%', limit)) for r in cur.fetchall(): out.append({'id':'sportas:'+str(r['id']), 'label': (r.get('ime') or '')+' '+(r.get('prezime') or ''), 'type':'person', 'sub': (r.get('sport') or 'sportaš')+(r.get('klub_naziv') and ' · '+r['klub_naziv'] or '')}) cur.execute(""" SELECT id, name AS label, function, oib, county FROM civic.persons WHERE name ILIKE %s ORDER BY oib NULLS LAST, length(name) LIMIT %s """, ('%'+q+'%', limit)) for r in cur.fetchall(): out.append({'id':'civic_person:'+str(r['id']), 'label': r['label'], 'type':'person', 'sub': (r.get('function') or 'civic')+' · '+(r.get('county') or '')}) if type in ('', 'company'): cur.execute(""" SELECT id, name AS label, oib, city, entity_type FROM civic.entities WHERE name ILIKE %s ORDER BY length(name) LIMIT %s """, ('%'+q+'%', limit)) for r in cur.fetchall(): out.append({'id':'civic_entity:'+str(r['id']), 'label': r['label'], 'type':'company', 'sub': (r.get('entity_type') or 'tvrtka')+' · '+(r.get('city') or '')}) return {'query': q, 'results': out[:limit2]} # ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ────────────── @router.post("/forensic/findings/{finding_id}/enrich") def enrich_forensic(finding_id: int): """ Look up the forensic finding, derive the PEP person name from entities_involved or title, hit Wikipedia HR for a summary, and persist the enriched payload into civic.forensic_findings.ai_analysis (or back into raw_data.enrichment). """ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute(""" SELECT id, finding_type, severity, title, description, entities_involved, raw_data, ai_analysis FROM civic.forensic_findings WHERE id=%s """, (finding_id,)) f = cur.fetchone() if not f: raise HTTPException(404, "finding not found") f = dict(f) # Derive person name candidates candidates = [] if isinstance(f.get('entities_involved'), (list, dict)): ei = f['entities_involved'] if isinstance(ei, dict): for k in ('person','name','osoba','PEP','pep'): if ei.get(k): candidates.append(str(ei[k])) # Also try persons: [...] list for p in (ei.get('persons') or ei.get('osobe') or []): if isinstance(p, dict) and p.get('name'): candidates.append(p['name']) elif isinstance(p, str): candidates.append(p) elif isinstance(ei, list): for it in ei: if isinstance(it, dict): for k in ('name','person','label'): if it.get(k): candidates.append(str(it[k])); break elif isinstance(it, str): candidates.append(it) if not candidates and f.get('title'): # Heuristic: extract first capitalised "Ime Prezime" pair m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', f['title']) if m: candidates.append(m.group(0)) wiki = None used_query = None for q in candidates[:3]: wiki = _wiki_summary(q) if wiki: used_query = q break # Build enrichment payload enrichment = { 'queried': candidates[:5], 'used_query': used_query, 'wiki': wiki, 'enriched_at': datetime.now(timezone.utc).isoformat(), } # Persist into raw_data.enrichment raw = f.get('raw_data') if raw is None: raw = {} if not isinstance(raw, dict): raw = {'_legacy': raw} raw['enrichment'] = enrichment cur.execute(""" UPDATE civic.forensic_findings SET raw_data = %s::jsonb, ai_analysis = COALESCE(ai_analysis, %s) WHERE id = %s """, (json.dumps(raw, default=str, ensure_ascii=False), (wiki or {}).get('extract'), finding_id)) c.commit() return { 'finding_id': finding_id, 'queried': candidates[:5], 'used_query': used_query, 'wiki': wiki, 'persisted': True, } # ─── R3B P4 — FORENSIC SCAN (kept from prior version) ─────────────────── @router.post("/forensic/scan") def forensic_scan(req: dict = Body(...)): name = (req.get('name') or '').strip() if len(name) < 3: raise HTTPException(400, "name must be at least 3 chars") with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute(""" SELECT id, name, function, party, county, city, oib, trust_tier FROM civic.persons WHERE upper(name) ILIKE upper(%s) ORDER BY oib NULLS LAST, id LIMIT 25 """, ('%' + name + '%',)) persons = [dict(r) for r in cur.fetchall()] for p in persons: p['links'] = []; p['findings'] = [] if p.get('oib'): cur.execute(""" SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib, e.entity_type, e.city, e.risk_score FROM civic.person_entity_links pel LEFT JOIN civic.entities e ON e.id = pel.entity_id WHERE pel.person_oib = %s LIMIT 50 """, (p['oib'],)) p['links'] = [dict(r) for r in cur.fetchall()] cur.execute(""" SELECT id, finding_type, severity, title, severity_score, created_at FROM civic.forensic_findings WHERE entities_involved::text ILIKE %s ORDER BY severity_score DESC, created_at DESC LIMIT 30 """, ('%' + p['oib'] + '%',)) p['findings'] = [dict(r) for r in cur.fetchall()] if not p['findings']: cur.execute(""" SELECT id, finding_type, severity, title, severity_score, created_at FROM civic.forensic_findings WHERE title ILIKE %s OR description ILIKE %s ORDER BY severity_score DESC, created_at DESC LIMIT 30 """, ('%' + p['name'] + '%', '%' + p['name'] + '%')) p['findings'] = [dict(r) for r in cur.fetchall()] total_links = total_findings = crit_findings = 0 for p in persons: total_links += len(p.get('links') or []) for f in p.get('findings') or []: total_findings += 1 if f.get('severity') in ('CRITICAL', 'HIGH'): crit_findings += 1 score = 0 if (p.get('function') or '').strip(): score += 30 if (p.get('party') or '').strip(): score += 15 score += min(40, len(p.get('links') or []) 5) score += min(40, len(p.get('findings') or []) * 10) score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL', 'HIGH')) p['risk_score'] = min(100, score) overall = max((p.get('risk_score', 0) for p in persons), default=0) return {'query': name, 'matched_persons': len(persons), 'overall_risk_score': overall, 'total_links': total_links, 'total_findings': total_findings, 'critical_findings': crit_findings, 'persons': persons, 'scanned_at': int(time.time())} # ─── SB-3 — Bulk enrichment ───────────────────────────────────────────── _BULK_KEY_MAP = { 'klub': ('pgz_sport.klubovi', ('oib','sport','grad','predsjednik','tajnik','web','email','telefon', 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')), 'savez': ('pgz_sport.savezi', ('oib','sport','predsjednik','tajnik','email','telefon','web', 'adresa','godina_osnutka')), 'sportas': ('pgz_sport.clanovi', ('sport','profile_url','slika_url','hns_igrac_id','biografija', 'datum_rodenja','mjesto_rodenja','broj_dresa')), } def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str: parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)" for k in keys] return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})" def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]: if kind not in _BULK_KEY_MAP: raise HTTPException(400, "kind must be klub|savez|sportas") table, keys = _BULK_KEY_MAP[kind] cov = _coverage_sql('', keys) extra_where = '' if kind == 'klub': extra_where = "AND aktivan = TRUE" elif kind == 'sportas': extra_where = "AND aktivan = TRUE" sql = (f"SELECT id FROM {table} " f"WHERE 1=1 {extra_where} " f"AND {cov} < %s " f"ORDER BY random() LIMIT %s") with _db() as c, c.cursor() as cur: cur.execute(sql, (coverage_max, limit)) return [r[0] for r in cur.fetchall()] @router.post("/enrich/bulk") def enrich_bulk(body: dict = Body(default=None), x_user_email: Optional[str] = Header(default=None), x_user_id: Optional[int] = Header(default=None)): """Run preview+apply over N random under-enriched rows of one kind. Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70} Returns aggregate stats. Synchronous (use polling, not SSE). """ body = body or {} kind = (body.get('kind') or '').strip().lower() if kind not in _BULK_KEY_MAP: raise HTTPException(400, "kind must be klub|savez|sportas") limit = max(1, min(int(body.get('limit') or 50), 200)) coverage_max = max(0, min(int(body.get('coverage_max') or 70), 100)) ids = _bulk_pick(kind, limit, coverage_max) items: list[dict] = [] fields_total = 0 started = time.time() for eid in ids: try: row = _load_row(kind, eid) if kind == 'klub': res = _propose_for_klub(row) elif kind == 'savez': res = _propose_for_savez(row) else: res = _propose_for_sportas(row) proposed = res.get('proposed') or {} srcs = res.get('sources') or [] if not proposed: items.append({'id': eid, 'applied': 0, 'fields': []}) continue out = _apply_to_db(kind, eid, proposed, srcs, x_user_email) applied = out.get('applied') or {} fields_total += len(applied) items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())}) try: from audit_seal_router import audit_log as _audit_log if applied: _audit_log(action='enrich.bulk.apply', target_type=kind, target_id=eid, payload={'applied': applied}, user_id=x_user_id, user_email=x_user_email) except Exception: pass except HTTPException as e: items.append({'id': eid, 'error': e.detail}) except Exception as e: items.append({'id': eid, 'error': f'{type(e).name}: {e}'}) return { 'status': 'success', 'kind': kind, 'requested': limit, 'processed': len(items), 'fields_total': fields_total, 'elapsed_s': round(time.time() - started, 1), 'items': items, } # ─── SB-4 — Worker status / control ───────────────────────────────────── _REDIS_KEYS = { 'heartbeat': 'cc:pgz-enricher:heartbeat', 'pause': 'cc:pgz-enricher:pause', 'run_now': 'cc:pgz-enricher:run_now', 'last_cycle': 'cc:pgz-enricher:last_cycle', 'confidence': 'cc:pgz-enricher:confidence', 'fields_24h': 'cc:pgz-enricher:fields_24h', } def _redis_client(): try: import redis except Exception: return None host = os.environ.get('REDIS_HOST', 'localhost') port = int(os.environ.get('REDIS_PORT', '6379')) pwd = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None # Try with password first (prod); fall back to anonymous (dev box) on AUTH failure. for p in (pwd, None): try: r = redis.Redis(host=host, port=port, password=p, decode_responses=True, socket_connect_timeout=2) r.ping() return r except Exception: continue return None @router.get("/enrich/worker/status") def enrich_worker_status(): r = _redis_client() out = {'available': bool(r)} if not r: return out try: hb = r.get(_REDIS_KEYS['heartbeat']) out['heartbeat'] = int(hb) if hb else None out['heartbeat_age_s'] = (int(time.time()) - int(hb)) if hb else None out['paused'] = (r.get(_REDIS_KEYS['pause']) or '0') == '1' out['run_now_pending'] = (r.get(_REDIS_KEYS['run_now']) or '0') == '1' last = r.get(_REDIS_KEYS['last_cycle']) if last: try: out['last_cycle'] = json.loads(last) except: out['last_cycle'] = last conf = r.get(_REDIS_KEYS['confidence']) out['confidence_threshold'] = float(conf) if conf else 0.7 f24 = r.get(_REDIS_KEYS['fields_24h']) out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0 except Exception as e: out['error'] = f'{type(e).name}: {e}' # Recent enrichment_log rows for live activity try: with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at FROM pgz_sport.enrichment_log ORDER BY id DESC LIMIT 25""") rows = [] for rr in cur.fetchall(): rr = dict(rr) if rr.get('created_at'): rr['created_at'] = rr['created_at'].isoformat() rows.append(rr) out['recent'] = rows except Exception: out['recent'] = [] return out @router.post("/enrich/worker/pause") def enrich_worker_pause(body: dict = Body(default=None)): body = body or {} pause = bool(body.get('paused', True)) r = _redis_client() if not r: raise HTTPException(503, 'redis unavailable') r.set(_REDIS_KEYS['pause'], '1' if pause else '0') return {'status': 'success', 'paused': pause} @router.post("/enrich/worker/run-now") def enrich_worker_run_now(): r = _redis_client() if not r: raise HTTPException(503, 'redis unavailable') r.set(_REDIS_KEYS['run_now'], '1') return {'status': 'success', 'queued': True} @router.post("/enrich/worker/confidence") def enrich_worker_confidence(body: dict = Body(...)): try: v = float(body.get('value')) except Exception: raise HTTPException(400, 'value must be number 0..1') if not (0.0 <= v <= 1.0): raise HTTPException(400, 'value out of range 0..1') r = _redis_client() if not r: raise HTTPException(503, 'redis unavailable') r.set(_REDIS_KEYS['confidence'], str(v)) return {'status': 'success', 'confidence_threshold': v}