(\d+)
', html_doc)
if m: out['broj_dresa'] = int(m.group(1))
m = re.search(r'
Optional[dict]:
body = _http_get(url, timeout=8)
if not body:
# Try Playwright fallback
if _HAS_PW and _pw_scraper is not None:
r = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
if r and r.get('html_len', 0) > 2000:
# We didn't store html in fetch_rendered — re-fetch text only is enough
# but we need html for parse. Do a simple HTTP retry with longer timeout.
body = _http_get(url, timeout=15)
return _parse_hns_player(body, url) if body else None
# ─── Generic sport-federation scraper ───────────────────────────────────
def _fed_url_from_row(row: dict) -> Optional[str]:
    """Return the row's own link when it already points at a federation site.

    The set of federation hosts is derived from every configured
    ``url`` / ``search_url`` / ``profile_url_pattern`` of the ``national``
    and ``pgz`` entries returned by ``_load_sport_feds()``. The row's
    ``source_url`` is checked first, then ``profile_url``; the first one
    whose hostname is in that set is returned, otherwise ``None``.
    """
    registry, _, _ = _load_sport_feds()
    url_fields = ('url', 'search_url', 'profile_url_pattern')
    # Templates carry placeholders; substitute dummies so urlparse sees a plain URL.
    placeholder_fillers = (('{q}', 'x'), ('{slug}', 'x'), ('{hns_pid}', '1'))

    known_hosts = set()
    for fed_entry in registry.values():
        if not isinstance(fed_entry, dict):
            continue
        for level in ('national', 'pgz'):
            level_cfg = fed_entry.get(level) or {}
            for field in url_fields:
                template = level_cfg.get(field)
                if not template:
                    continue
                try:
                    concrete = template
                    for token, filler in placeholder_fillers:
                        concrete = concrete.replace(token, filler)
                    host = urllib.parse.urlparse(concrete).hostname
                except Exception:
                    # Malformed config values are simply ignored.
                    continue
                if host:
                    known_hosts.add(host)

    for field in ('source_url', 'profile_url'):
        candidate = row.get(field)
        if not candidate:
            continue
        try:
            candidate_host = urllib.parse.urlparse(candidate).hostname or ''
        except Exception:
            continue
        if candidate_host in known_hosts:
            return candidate
    return None
def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]:
"""Best-effort parser for a generic sport-federation profile page.
Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub,
extract, raw_text}. Tolerant of varied page structures.
"""
if not html_doc: return None
host = urllib.parse.urlparse(url).hostname or ''
out: dict[str, Any] = {
'source': host,
'url': url,
}
# Title
m = re.search(r'
]*>([^<]+)', html_doc, re.I)
if m: out['title'] = html.unescape(m.group(1).strip())[:300]
# Meta description
m = re.search(r'
= 3:
name_tokens.append(re.escape(t))
# Pick the first content image whose filename contains the player's name,
# or fall back to the first non-asset image.
img_candidates = re.findall(r'
![]()
]+src=["\']([^"\']+)["\']', html_doc, re.I)
chosen_img = None
for src in img_candidates:
low = src.lower()
if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader',
'sprite', '/themes/', '/icons/', 'gdpr', 'banner',
'header', 'footer', 'placeholder', 'avatar-default')):
continue
if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')):
continue
# Prefer matches on player name in URL
if name_tokens and any(re.search(t, src, re.I) for t in name_tokens):
chosen_img = src; break
if chosen_img is None:
chosen_img = src
if chosen_img:
if not chosen_img.startswith('http'):
chosen_img = urllib.parse.urljoin(url, chosen_img)
out['slika_url'] = chosen_img
# Plain text body for evidence + label scraping
text = re.sub(r'', ' ', html_doc, flags=re.S | re.I)
text = re.sub(r'', ' ', text, flags=re.S | re.I)
text = re.sub(r'<[^>]+>', ' ', text)
text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
out['raw_text'] = text[:4000]
out['extract'] = (out.get('description')
or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500]
or text[:500])
# Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …")
m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I)
if m:
try:
from datetime import date as _date
d = re.split(r'[.\-/]', m.group(1))
out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat()
except Exception:
pass
if 'datum_rodenja' not in out:
m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I)
if m:
try:
from datetime import date as _date
out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat()
except Exception:
pass
m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text)
if m: out['mjesto_rodenja'] = m.group(1).strip()
m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I)
if m: out['klub_naziv'] = m.group(1).strip().rstrip('.')
return out
def _slugify_simple(s: str) -> str:
import unicodedata
s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower()
return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]:
    """Try to find and parse the athlete's federation profile page.

    Two strategies are attempted against the sport's ``national`` federation
    entry (from ``_sport_fed``):

    1. Build a direct URL from ``profile_url_pattern`` when it contains
       ``{slug}``, and accept the page only if it mentions the athlete.
    2. Query ``search_url`` and follow the first player/member-style link
       whose slug contains the athlete's (slugified) name.

    Returns the dict produced by ``_parse_federation_profile`` or ``None``
    when the sport has no federation entry, no usable name was supplied, or
    nothing matched.
    """
    fed = _sport_fed(sport) if sport else None
    if not fed:
        return None
    nat = fed.get('national') or {}

    ime, prezime = (ime or '').strip(), (prezime or '').strip()
    full_name = (ime + ' ' + prezime).strip()
    # The surname is the match key; fall back to the first name for
    # single-name athletes. An empty key would make every `in` test below
    # trivially true and accept an arbitrary page, so bail out instead.
    match_name = prezime or ime
    match_slug = _slugify_simple(match_name)
    if not full_name or not match_slug:
        return None

    # 1) Direct profile URL via {slug} pattern (works for HBS at least)
    pattern = nat.get('profile_url_pattern')
    if pattern and '{slug}' in pattern:
        url = pattern.replace('{slug}', _slugify_simple(full_name))
        body = _http_get(url, timeout=8)
        if body and match_name.lower() in body.lower():
            return _parse_federation_profile(body, url, ime, prezime)

    # 2) Search URL → first /igraci|/profil|/clan link that mentions the surname
    search = nat.get('search_url')
    if search:
        body = _http_get(search.replace('{q}', urllib.parse.quote(full_name)), timeout=10)
        if body:
            # Path segments are tried in priority order.
            for segment in ('igraci', 'igrac', 'sportasi', 'clanovi', 'profil'):
                href_re = r'href="([^"]*?/' + segment + r'/[^"]+)"'
                for m in re.finditer(href_re, body, re.I):
                    cand = m.group(1)
                    if not cand.startswith('http'):
                        cand = urllib.parse.urljoin(nat.get('url', search), cand)
                    if match_slug in _slugify_simple(cand):
                        b2 = _http_get(cand, timeout=8)
                        if b2:
                            return _parse_federation_profile(b2, cand, ime, prezime)
    return None
def _propose_for_sportas(row: dict) -> dict:
    """Collect external evidence for an athlete row and propose missing fields.

    Sources consulted, in order: the HNS player page linked from the row, a
    sport-federation profile (linked from the row or discovered by scraping),
    Croatian Wikipedia (only when ``biografija`` is empty) and kosarkapgz.com
    posts. Only fields that are empty in ``row`` are proposed.

    Returns ``{'proposed': {field: value}, 'sources': [source dicts]}``.
    """
    ime = row.get('ime') or ''
    prezime = row.get('prezime') or ''
    naziv = (ime + ' ' + prezime).strip()
    sport = row.get('sport')
    sources, evidence = [], []
    proposed: dict[str, Any] = {}

    def _record(doc, *text_keys):
        # Register a source and the first non-empty text it offers as evidence.
        sources.append(doc)
        snippet = ''
        for key in text_keys:
            if doc.get(key):
                snippet = doc[key]
                break
        evidence.append(snippet)

    # 1) HNS Semafor — only meaningful when sport is football OR row already
    #    carries an HNS link.
    hns_doc: Optional[dict] = None
    wants_hns = _normalize_sport(sport) == 'nogomet' or bool(_hns_url_from_row(row))
    if wants_hns:
        hns_link = _hns_url_from_row(row)
        if hns_link:
            hns_doc = _hns_fetch_player(hns_link)
            if hns_doc:
                _record(hns_doc, 'raw_text', 'extract')

    # 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing
    #    source_url/profile_url if it points at a known federation host.
    fed_doc: Optional[dict] = None
    row_fed_link = _fed_url_from_row(row)
    already_fetched = bool(hns_doc) and hns_doc.get('url') == row_fed_link
    if row_fed_link and not already_fetched:
        fed_html = _http_get(row_fed_link, timeout=8)
        if fed_html:
            fed_doc = _parse_federation_profile(fed_html, row_fed_link, ime, prezime)
    if not fed_doc:
        fed_doc = scrape_sport_federation(sport, ime, prezime)
    if fed_doc:
        _record(fed_doc, 'raw_text', 'extract')

    def _first_known(field):
        # HNS data wins over federation data when both carry the field.
        for doc in (hns_doc, fed_doc):
            if doc and doc.get(field):
                return doc[field]
        return None

    page_link = _first_known('url')
    for link_field in ('profile_url', 'source_url'):
        if page_link and not row.get(link_field):
            proposed[link_field] = page_link

    # (field, hns_only) — hns_only fields are never taken from the federation page.
    field_plan = (
        ('slika_url', False),
        ('hns_igrac_id', True),
        ('datum_rodenja', False),
        ('mjesto_rodenja', False),
        ('broj_dresa', True),
    )
    for field, hns_only in field_plan:
        if row.get(field):
            continue
        if hns_only:
            value = hns_doc.get(field) if hns_doc else None
        else:
            value = _first_known(field)
        if value:
            proposed[field] = value

    needs_bio = not row.get('biografija')

    # 3) Wikipedia HR for biografija
    if needs_bio:
        wiki = _wiki_summary(naziv)
        if wiki:
            _record(wiki, 'extract')

    # 4) kosarkapgz.com — news mentions for PGŽ basketball players
    if _kosarkapgz_is_basketball_pgz('', sport):
        query = (ime + ' ' + prezime).strip()
        kpgz = _kosarkapgz_search_posts(query, limit=5) if query else None
        if kpgz:
            _record(kpgz, 'raw_text')

    # Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet
    if needs_bio:
        descr = None
        if evidence:
            role = f'sportaš ({sport})' if sport else 'sportaš'
            descr = _deepseek_describe(naziv, role, evidence)
        if not descr:
            descr = next((src.get('extract') for src in sources
                          if src.get('extract') and len(src.get('extract')) >= 80), None)
        if descr:
            proposed['biografija'] = descr.strip()[:2000]

    return {'proposed': proposed, 'sources': sources}
# ─── Endpoints ──────────────────────────────────────────────────────────
# ─── R4 — POST /v2/enrich/forensic/{finding_id} ─────────────────────────
def _extract_pep_name(finding: dict) -> Optional[str]:
"""Pull the primary person name from a forensic_findings row."""
title = (finding.get('title') or '').strip()
desc = (finding.get('description') or '').strip()
payload = finding.get('raw_data') or {}
if isinstance(payload, str):
try: payload = json.loads(payload)
except Exception: payload = {}
if isinstance(payload, dict):
for k in ('person_name', 'name', 'osoba'):
v = payload.get(k)
if v: return str(v).strip()
# Try entities_involved.entity_name
ents = finding.get('entities_involved') or []
if isinstance(ents, str):
try: ents = json.loads(ents)
except Exception: ents = []
if isinstance(ents, list):
for e in ents:
if isinstance(e, dict) and e.get('person_name'):
return str(e['person_name']).strip()
if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''):
# Some entries store person names as entity_name when entity_type='person'
if (e.get('entity_type') or '').lower() in ('person','osoba'):
return str(e['entity_name']).strip()
# Fallback: extract a "Ime Prezime" from the title
m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc)
if m: return f"{m.group(1)} {m.group(2)}"
return None
def _gather_pep_evidence(name: str) -> list[dict]:
    """Collect public web evidence about a person.

    Returns a list that may contain the ``_wiki_summary(name)`` result and a
    ``{'source': 'duckduckgo', 'url', 'title', 'extract'}`` dict built from the
    first result of a DuckDuckGo HTML search for ``"<name>" PGŽ Hrvatska``.
    ``extract`` is ``None`` when no snippet element was found.

    NOTE(review): both anchor patterns below were reconstructed from a copy of
    this function in which the leading ``<a`` / trailing ``</a>`` text was
    lost. Confirm them against the version-controlled original.
    """
    sources: list[dict] = []
    wiki = _wiki_summary(name)
    if wiki:
        sources.append(wiki)

    # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs)
    ddg = 'https://html.duckduckgo.com/html/?q=' + urllib.parse.quote(f'"{name}" PGŽ Hrvatska')
    page = _http_get(ddg, timeout=8)
    if page:
        # First result block: link + title, then (independently) the first snippet.
        m = re.search(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>([^<]{6,200})', page)
        snippet_m = re.search(r'<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', page, re.S)
        if m:
            sources.append({
                'source': 'duckduckgo',
                'url': html.unescape(m.group(1))[:500],
                'title': html.unescape(m.group(2)).strip()[:300],
                # Inline markup inside the snippet is replaced by spaces.
                'extract': re.sub(r'<[^>]+>', ' ', snippet_m.group(1)).strip()[:600] if snippet_m else None,
            })
    return sources
def _related_entities_for_pep(name: str) -> list[dict]:
    """Pull civic.persons + their entity links so we have the structured graph.

    Up to 10 persons whose name contains ``name`` are returned, each as a
    ``kind='person'`` dict. For persons with an OIB, ``entities`` lists up to
    30 rows from ``civic.person_entity_links`` joined with ``civic.entities``.
    """
    persons_sql = """SELECT id, name, function, party, county, city, oib, trust_tier
                     FROM civic.persons
                     WHERE upper(name) ILIKE upper(%s)
                     ORDER BY oib NULLS LAST, id LIMIT 10"""
    links_sql = """SELECT pel.entity_id, pel.roles, e.name AS entity_name,
                          e.oib AS entity_oib, e.entity_type, e.city, e.risk_score
                   FROM civic.person_entity_links pel
                   LEFT JOIN civic.entities e ON e.id = pel.entity_id
                   WHERE pel.person_oib=%s LIMIT 30"""
    copied_fields = ('function', 'party', 'county', 'city', 'oib', 'trust_tier')

    graph: list[dict] = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(persons_sql, ('%' + name + '%',))
        # fetchall() materialises the persons, so the cursor can be reused below.
        for person_row in cur.fetchall():
            person = dict(person_row)
            node = {'kind': 'person',
                    'person_id': person['id'],
                    'person_name': person['name']}
            for field in copied_fields:
                node[field] = person.get(field)
            node['entities'] = []
            if person.get('oib'):
                cur.execute(links_sql, (person['oib'],))
                node['entities'].extend(dict(link) for link in cur.fetchall())
            graph.append(node)
    return graph
@router.post("/enrich/forensic/{finding_id}")
def enrich_forensic_v2(finding_id: int,
                       body: dict = Body(default=None),
                       x_user_email: Optional[str] = Header(default=None),
                       x_user_id: Optional[int] = Header(default=None)):
    """Enrich a forensic finding and seal the result.

    Steps: load the finding (404 if missing); take the person name from the
    request body's ``name`` or derive it via ``_extract_pep_name`` (400 if
    neither works); gather Wiki + DDG evidence and the civic graph; write
    ``related_entities`` and ``enrichment_metadata`` (last 10 history entries)
    back to ``civic.forensic_findings``; finally hand the payload hash to
    ``blockchain.seal.seal_to_polygon``. Sealing errors are reported in the
    response's ``seal.error`` instead of failing the request.
    """
    request_body = body or {}
    requested_name = (request_body.get('name') or '').strip() or None

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""SELECT id, finding_type, severity, title, description,
                              entities_involved, raw_data, related_entities, enrichment_metadata
                       FROM civic.forensic_findings WHERE id=%s""", (finding_id,))
        record = cur.fetchone()
        if not record:
            raise HTTPException(404, "finding not found")
        finding = dict(record)

    name = requested_name or _extract_pep_name(finding)
    if not name:
        raise HTTPException(400, "could not derive a person/entity name; pass {name: \"…\"}")

    sources = _gather_pep_evidence(name)
    related = _related_entities_for_pep(name)
    source_refs = [{'source': src.get('source'), 'url': src.get('url'), 'title': src.get('title')}
                   for src in sources]
    payload = {
        'finding_id': finding_id,
        'name': name,
        'sources': source_refs,
        'related_entities': related,
        'enriched_at': datetime.now(timezone.utc).isoformat(),
    }

    # Persist back to the finding
    enrichment_meta = finding.get('enrichment_metadata') or {}
    if not isinstance(enrichment_meta, dict):
        enrichment_meta = {}
    history = enrichment_meta.get('history') or []
    history.append({
        'at': payload['enriched_at'],
        'sources': payload['sources'],
        'related_count': len(related),
        'user': x_user_email,
    })
    enrichment_meta.update({
        'history': history[-10:],  # keep only the ten most recent runs
        'enriched_at': payload['enriched_at'],
        'enriched_by': x_user_email or 'system',
        'source_count': len(sources),
    })

    related_json = json.dumps(related, default=str, ensure_ascii=False)
    meta_json = json.dumps(enrichment_meta, default=str, ensure_ascii=False)
    with _db() as c, c.cursor() as cur:
        cur.execute("""UPDATE civic.forensic_findings
                       SET related_entities = %s::jsonb,
                           enrichment_metadata = %s::jsonb
                       WHERE id=%s
                       RETURNING id""",
                    (related_json, meta_json, finding_id))
        cur.fetchone()

    # Seal the enrichment payload hash on Polygon (or queue if no key)
    seal_result: dict[str, Any] = {}
    try:
        try:
            from blockchain import seal as _seal_mod  # noqa: E402
        except Exception:
            # Retry with the deployment directory on the import path.
            import sys as _ssys
            _ssys.path.insert(0, '/opt/pgz-sport')
            from blockchain import seal as _seal_mod  # noqa: E402
        payload_hash = _seal_mod.hash_payload(payload)
        seal_result = _seal_mod.seal_to_polygon(
            data_hash=payload_hash,
            ref_id=str(finding_id),
            action='forensic.enriched',
            ref_type='forensic_finding',
            payload=payload,
            user_id=x_user_id,
            user_email=x_user_email,
        )
    except Exception as e:
        seal_result = {'error': f'{type(e).__name__}: {e}'}

    return {
        'finding_id': finding_id,
        'name': name,
        'sources': sources,
        'related_entities': related,
        'related_count': len(related),
        'enrichment_metadata': enrichment_meta,
        'seal': seal_result,
    }
from fastapi import Path as _FPath
@router.post("/enrich/{kind:str}/{eid:int}")
def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                   eid: int = 0,
                   nocache: int = 0):
    """Enrichment preview with Redis cache (24h TTL) + slow-call telemetry.

    Pass ?nocache=1 to bypass cache. The response reports field coverage for
    the row, the proposed values next to the current ones, the sources they
    came from and the URL to apply them.
    """
    started_at = time.time()
    cache_key = f"enrich:v1:{kind}:{eid}"

    if not nocache:
        cached = _cache_get(cache_key)
        if cached is not None:
            cached['_cache'] = 'hit'
            cached['_cache_ttl_s'] = ENRICH_CACHE_TTL
            try:
                _enrich_slow_log(kind, eid, time.time() - started_at, cached=True)
            except Exception:
                pass
            return cached

    row = _load_row(kind, eid)
    proposer = {'klub': _propose_for_klub, 'savez': _propose_for_savez}.get(kind, _propose_for_sportas)
    res = proposer(row)

    # Fields that count towards the coverage percentage, per kind.
    coverage_fields = {
        'klub': ['oib', 'sport', 'grad', 'predsjednik', 'tajnik', 'web', 'email', 'telefon',
                 'sjediste', 'godina_osnutka', 'ciljevi', 'opis_djelatnosti'],
        'savez': ['oib', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon', 'web', 'adresa',
                  'godina_osnutka'],
    }
    keys = coverage_fields.get(kind) or [
        'sport', 'profile_url', 'slika_url', 'hns_igrac_id', 'biografija',
        'datum_rodenja', 'mjesto_rodenja', 'broj_dresa', 'visina_cm', 'tezina_kg',
        'dominantna_noga', 'oib']

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
               or row.get('scrape_url') or row.get('profile_url'))

    missing = [field for field in keys if not row.get(field)]
    filled = len(keys) - len(missing)
    coverage = round(filled / len(keys) * 100)

    proposed = res['proposed']
    current = {field: row.get(field) for field in proposed.keys()}

    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}

    def _federation_summary(fed):
        # Names/URL of the sport's federations; all None when the sport is unknown.
        if not fed:
            return {'national': None, 'national_url': None, 'pgz': None}
        return {
            'national': (fed.get('national') or {}).get('name'),
            'national_url': (fed.get('national') or {}).get('url'),
            'pgz': (fed.get('pgz') or {}).get('name'),
        }

    result = {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row),
        'sport': row.get('sport'),
        'sport_federation': _federation_summary(_sport_fed(row.get('sport'))),
        'sources': res['sources'],
        'current': current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
        '_cache': 'miss',
        '_cache_ttl_s': ENRICH_CACHE_TTL,
    }

    elapsed = time.time() - started_at
    # Caching and telemetry are best-effort; neither may fail the request.
    try:
        _cache_set(cache_key, result, ttl=ENRICH_CACHE_TTL)
    except Exception:
        pass
    try:
        if elapsed >= ENRICH_SLOW_LOG_THRESHOLD:
            _enrich_slow_log(kind, eid, elapsed, cached=False)
    except Exception:
        pass
    return result
_TABLE_MAP = {
'klub': ('pgz_sport.klubovi',
{'web','email','telefon','predsjednik','tajnik',
'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
'savez': ('pgz_sport.savezi',
{'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
'sportas': ('pgz_sport.clanovi',
{'biografija','profile_url','source_url','slika_url','hns_igrac_id',
'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
'tezina_kg','dominantna_noga','oib'}),
}
def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Write proposed values into empty columns of one row, with an audit trail.

    Only columns whitelisted in ``_TABLE_MAP[kind]`` are considered, blank
    values are skipped, and a column that already holds a truthy value is
    never overwritten. The row's ``metadata`` JSON gets ``enriched_at``,
    ``enrichment_source`` and a rolling 10-entry ``enrichment_history``.
    Every call inserts one row into ``pgz_sport.enrichment_log``.

    On a unique-constraint violation nothing is applied: the transaction is
    rolled back, an ``enrichment_block`` marker is merged into the row's
    metadata (best effort) and the log row carries the constraint name.

    Args:
        kind: ``klub`` | ``savez`` | ``sportas``.
        eid: Primary key of the target row.
        fields: Column -> proposed value (may be ``None``).
        sources: Source dicts (``source`` / ``url`` keys); non-dict entries
            are ignored.
        user_email: Recorded in the history entry and the log row.

    Returns:
        ``{'applied': {column: value}, 'after': {snapshot of the row}}``.

    Raises:
        HTTPException: 400 for an unknown kind or non-object ``fields``,
            404 when the row does not exist.
    """
    import logging

    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    # `fields` / `sources` can arrive straight from a request body, so validate
    # their shape up front instead of failing with AttributeError mid-transaction.
    if fields is not None and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if not isinstance(sources, (list, tuple)):
        sources = []
    sources = [s for s in sources if isinstance(s, dict)]

    table, allowed = _TABLE_MAP[kind]
    log = logging.getLogger("enrich")

    # Use a manual transaction so SELECT ... FOR UPDATE actually holds the row
    # lock until we COMMIT. _db() returns autocommit=True connections, so we
    # flip it off locally for this function only.
    c = _db()
    c.autocommit = False
    error_msg = None  # populated on UniqueViolation, recorded in enrichment_log
    try:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
            before = cur.fetchone()
            if not before:
                raise HTTPException(404, kind + " not found")
            before = dict(before)

            sets, params, applied = [], [], {}
            for k, v in (fields or {}).items():
                if k not in allowed:
                    continue  # whitelist also keeps the f-string SQL below safe
                if v is None or str(v).strip() == '':
                    continue
                if before.get(k):
                    continue  # never overwrite existing
                sets.append(f"{k} = %s")
                params.append(v)
                applied[k] = v

            meta_in = before.get('metadata') or {}
            if not isinstance(meta_in, dict):
                meta_in = {}
            now_iso = datetime.now(timezone.utc).isoformat()
            meta_in['enriched_at'] = now_iso
            # str() keeps the ','.join() for the log row safe for odd payloads.
            meta_in['enrichment_source'] = [str(s['source']) for s in sources if s.get('source')]
            history = meta_in.get('enrichment_history') or []
            history.append({
                'at': now_iso,
                'fields': list(applied.keys()),
                'sources': meta_in['enrichment_source'],
                'urls': [s.get('url') for s in sources if s.get('url')],
                'user': user_email,
            })
            meta_in['enrichment_history'] = history[-10:]
            sets.append("metadata = %s::jsonb")
            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
            params.append(eid)

            try:
                cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
                after = dict(cur.fetchone())
            except psycopg2.errors.UniqueViolation as _uve:
                # The UPDATE rolled back — pretend nothing was applied so the
                # log row, the API response, and the row state all agree.
                c.rollback()
                failed_fields = list(applied.keys())
                applied = {}  # truth-in-reporting
                constraint = getattr(getattr(_uve, 'diag', None), 'constraint_name', None)
                error_msg = f"unique_violation: {constraint or 'unknown'}"
                log.warning(
                    f"UniqueViolation table={table} id={eid} blocked_fields={failed_fields} constraint={constraint}")
                # Fetch current row state for the audit log.
                cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
                row = cur.fetchone()
                after = dict(row) if row else {}
                # Park a do-not-retry marker on the row so workers stop hammering it.
                # A savepoint isolates the marker: if its UPDATE fails, only that
                # statement is undone and the audit INSERT below can still run
                # (a failed statement otherwise aborts the whole transaction).
                cur.execute("SAVEPOINT enrichment_block")
                try:
                    block_meta = {
                        'reason': 'unique_violation',
                        'constraint': constraint,
                        'fields': failed_fields,
                        'at': now_iso,
                    }
                    cur.execute(
                        f"UPDATE {table} "
                        f"SET metadata = COALESCE(metadata,'{{}}'::jsonb) || %s::jsonb "
                        f"WHERE id=%s",
                        (json.dumps({'enrichment_block': block_meta}, default=str), eid))
                    cur.execute("RELEASE SAVEPOINT enrichment_block")
                except Exception as _be:
                    # Marker is best-effort; never let it block the apply path.
                    cur.execute("ROLLBACK TO SAVEPOINT enrichment_block")
                    log.warning(f"enrichment_block write failed table={table} id={eid}: {_be}")

            # Always log — on success error_msg is NULL, on UniqueViolation it
            # carries the constraint name and applied is empty.
            logged_keys = list(applied.keys()) + ['metadata']
            cur.execute(
                """INSERT INTO pgz_sport.enrichment_log
                   (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email, error)
                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s,%s)""",
                (kind, eid,
                 ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None,
                 (sources[0].get('url') if sources else None),
                 list(applied.keys()) or None,
                 json.dumps({k: before.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 json.dumps({k: after.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 user_email,
                 error_msg))
        c.commit()
    except Exception:
        # Covers HTTPException too: undo any partial work, then re-raise unchanged.
        try:
            c.rollback()
        except Exception:
            pass
        raise
    finally:
        try:
            c.close()
        except Exception:
            pass

    snap_keys = ('id', 'naziv', 'ime', 'prezime', 'web', 'email', 'telefon',
                 'opis_djelatnosti', 'biografija', 'metadata')
    return {'applied': applied,
            'after': {k: after.get(k) for k in snap_keys if k in after}}
@router.post("/enrich/{kind:str}/{eid:int}/apply")
def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                 eid: int = 0,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None),
                 x_user_id: Optional[int] = Header(default=None)):
    """Apply enrichment values to one row.

    Uses ``fields`` / ``sources`` from the request body when ``fields`` is
    non-empty; otherwise recomputes the proposal for the row. After the write
    an audit entry is recorded (only when something was applied) and the
    cached preview is dropped — both best-effort.
    """
    request_body = body or {}
    fields = request_body.get('fields')
    sources = request_body.get('sources')

    if not fields:
        # Nothing supplied by the caller: fall back to a fresh proposal.
        row = _load_row(kind, eid)
        proposer = {'klub': _propose_for_klub, 'savez': _propose_for_savez}.get(kind, _propose_for_sportas)
        proposal = proposer(row)
        fields, sources = proposal['proposed'], proposal['sources']

    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    applied = out.get('applied') or {}

    # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
    try:
        from audit_seal_router import audit_log as _audit_log
        if applied:
            source_urls = [src.get('url') for src in (sources or []) if isinstance(src, dict)]
            _audit_log(
                action='enrich.apply',
                target_type=kind,
                target_id=eid,
                payload={'applied': applied, 'sources': source_urls},
                user_id=x_user_id,
                user_email=x_user_email,
            )
    except Exception:
        pass

    # Invalidate cache so next preview reflects the new DB state
    try:
        _cache_delete(f"enrich:v1:{kind}:{eid}")
    except Exception:
        pass

    response = {
        'status': 'success' if applied else 'no_changes',
        'kind': kind,
        'id': eid,
        'applied_count': len(applied),
        'applied_fields': list(applied.keys()),
    }
    response.update(out)
    return response
@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """List recent enrichment log rows, newest first.

    Args:
        kind: Optional filter on the ``kind`` column.
        target_id: Optional filter on ``target_id`` (a falsy value means no filter).
        limit: Maximum rows; 0/None falls back to 50 and the value is clamped
            to 1..200.

    Returns:
        ``{'count': n, 'rows': [...]}`` with ``created_at`` as ISO strings.
        ``error`` is the value ``_apply_to_db`` stores for rejected applies
        (NULL on success).
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, error, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    # Lower bound matters: PostgreSQL rejects a negative LIMIT with an error.
    params.append(max(1, min(int(limit or 50), 200)))
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(sql, params)
        rows = [dict(r) for r in cur.fetchall()]
    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}
# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete suggestions for the Mreža search inputs.

    type ∈ {person, club, company, ''} — empty means all.
    Queries shorter than 2 characters return no results. ``limit`` is clamped
    to 1..50 and applied per table; the combined list is cut to ``2 * limit``.
    Returns: {query, results: [{id, label, type, sub}]}
    """
    term = (q or '').strip()
    if len(term) < 2:
        return {'query': term, 'results': []}
    per_table = max(1, min(50, int(limit)))
    like = '%' + term + '%'
    wanted = type
    results = []

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:

        def _matches(sql):
            # Every suggest query takes the same (pattern, limit) parameters.
            cur.execute(sql, (like, per_table))
            return cur.fetchall()

        if wanted == '' or wanted == 'club':
            for rec in _matches("""
                    SELECT id, naziv AS label, sport, grad
                    FROM pgz_sport.klubovi
                    WHERE naziv ILIKE %s AND aktivan=TRUE
                    ORDER BY length(naziv), naziv LIMIT %s
                    """):
                results.append({
                    'id': 'klub:' + str(rec['id']),
                    'label': rec['label'],
                    'type': 'club',
                    'sub': (rec.get('sport') or '') + ' · ' + (rec.get('grad') or ''),
                })
            for rec in _matches("""
                    SELECT id, naziv AS label, sport
                    FROM pgz_sport.savezi
                    WHERE naziv ILIKE %s AND aktivan=TRUE
                    ORDER BY length(naziv), naziv LIMIT %s
                    """):
                results.append({
                    'id': 'savez:' + str(rec['id']),
                    'label': rec['label'],
                    'type': 'savez',
                    'sub': rec.get('sport') or 'savez',
                })

        if wanted == '' or wanted == 'person':
            for rec in _matches("""
                    SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
                    FROM pgz_sport.clanovi c
                    LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                    WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
                    ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
                    LIMIT %s
                    """):
                club_suffix = ' · ' + rec['klub_naziv'] if rec.get('klub_naziv') else ''
                results.append({
                    'id': 'sportas:' + str(rec['id']),
                    'label': (rec.get('ime') or '') + ' ' + (rec.get('prezime') or ''),
                    'type': 'person',
                    'sub': (rec.get('sport') or 'sportaš') + club_suffix,
                })
            for rec in _matches("""
                    SELECT id, name AS label, function, oib, county
                    FROM civic.persons
                    WHERE name ILIKE %s
                    ORDER BY oib NULLS LAST, length(name) LIMIT %s
                    """):
                results.append({
                    'id': 'civic_person:' + str(rec['id']),
                    'label': rec['label'],
                    'type': 'person',
                    'sub': (rec.get('function') or 'civic') + ' · ' + (rec.get('county') or ''),
                })

        if wanted == '' or wanted == 'company':
            for rec in _matches("""
                    SELECT id, name AS label, oib, city, entity_type
                    FROM civic.entities
                    WHERE name ILIKE %s
                    ORDER BY length(name) LIMIT %s
                    """):
                results.append({
                    'id': 'civic_entity:' + str(rec['id']),
                    'label': rec['label'],
                    'type': 'company',
                    'sub': (rec.get('entity_type') or 'tvrtka') + ' · ' + (rec.get('city') or ''),
                })

    return {'query': term, 'results': results[:per_table * 2]}
# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Enrich a forensic finding with a Wikipedia summary of the person involved.

    Name candidates are taken from ``entities_involved`` (dict, list, or a
    JSON string of either) and, when that yields nothing, from the first
    capitalised "Ime Prezime" pair in the title. The first of up to three
    candidates for which ``_wiki_summary`` returns something is used.

    The result is stored under ``raw_data.enrichment``; ``ai_analysis`` is set
    to the wiki extract only when it was previously NULL (COALESCE).

    Raises HTTPException(404) when the finding does not exist.
    """
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, finding_type, severity, title, description, entities_involved,
                   raw_data, ai_analysis
            FROM civic.forensic_findings WHERE id=%s
        """, (finding_id,))
        f = cur.fetchone()
        if not f:
            raise HTTPException(404, "finding not found")
        f = dict(f)

        # Derive person name candidates
        candidates = []
        ei = f.get('entities_involved')
        if isinstance(ei, str):
            # The column may hold serialized JSON (as _extract_pep_name also
            # assumes); decode it so the dict/list branches below can see it.
            try:
                ei = json.loads(ei)
            except (TypeError, ValueError):
                ei = None
        if isinstance(ei, dict):
            for k in ('person', 'name', 'osoba', 'PEP', 'pep'):
                if ei.get(k):
                    candidates.append(str(ei[k]))
            # Also try persons: [...] list
            for p in (ei.get('persons') or ei.get('osobe') or []):
                if isinstance(p, dict) and p.get('name'):
                    candidates.append(str(p['name']))
                elif isinstance(p, str):
                    candidates.append(p)
        elif isinstance(ei, list):
            for it in ei:
                if isinstance(it, dict):
                    for k in ('name', 'person', 'label'):
                        if it.get(k):
                            candidates.append(str(it[k]))
                            break
                elif isinstance(it, str):
                    candidates.append(it)

        # Drop blanks and repeats (order kept) so the three lookups below are
        # not spent on the same name twice.
        seen = set()
        unique_candidates = []
        for cand in candidates:
            cand = cand.strip()
            if cand and cand not in seen:
                seen.add(cand)
                unique_candidates.append(cand)
        candidates = unique_candidates

        if not candidates and f.get('title'):
            # Heuristic: extract first capitalised "Ime Prezime" pair
            m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', f['title'])
            if m:
                candidates.append(m.group(0))

        wiki = None
        used_query = None
        for q in candidates[:3]:
            wiki = _wiki_summary(q)
            if wiki:
                used_query = q
                break

        # Build enrichment payload
        enrichment = {
            'queried': candidates[:5],
            'used_query': used_query,
            'wiki': wiki,
            'enriched_at': datetime.now(timezone.utc).isoformat(),
        }

        # Persist into raw_data.enrichment
        raw = f.get('raw_data')
        if raw is None:
            raw = {}
        if not isinstance(raw, dict):
            # Preserve a non-object payload instead of discarding it.
            raw = {'_legacy': raw}
        raw['enrichment'] = enrichment
        cur.execute("""
            UPDATE civic.forensic_findings
            SET raw_data = %s::jsonb,
                ai_analysis = COALESCE(ai_analysis, %s)
            WHERE id = %s
        """, (json.dumps(raw, default=str, ensure_ascii=False),
              (wiki or {}).get('extract'),
              finding_id))
        c.commit()

    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }
# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """Scan civic data for persons matching a name and score their exposure.

    For each of up to 25 matching persons the response carries their entity
    links (by OIB), related findings (by OIB in ``entities_involved``, falling
    back to a name match in title/description) and a ``risk_score`` capped at
    100. Raises HTTPException(400) when ``name`` has fewer than 3 characters.
    """
    name = (req.get('name') or '').strip()
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")

    severe_levels = ('CRITICAL', 'HIGH')

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, name, function, party, county, city, oib, trust_tier
            FROM civic.persons
            WHERE upper(name) ILIKE upper(%s)
            ORDER BY oib NULLS LAST, id LIMIT 25
        """, ('%' + name + '%',))
        persons = [dict(rec) for rec in cur.fetchall()]

        for person in persons:
            person['links'] = []
            person['findings'] = []
            oib = person.get('oib')
            if oib:
                cur.execute("""
                    SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                           e.entity_type, e.city, e.risk_score
                    FROM civic.person_entity_links pel
                    LEFT JOIN civic.entities e ON e.id = pel.entity_id
                    WHERE pel.person_oib = %s LIMIT 50
                """, (person['oib'],))
                person['links'] = [dict(rec) for rec in cur.fetchall()]
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE entities_involved::text ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, ('%' + person['oib'] + '%',))
                person['findings'] = [dict(rec) for rec in cur.fetchall()]
            if not person['findings']:
                # No OIB hit: fall back to a textual match on the person's name.
                name_like = '%' + person['name'] + '%'
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE title ILIKE %s OR description ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (name_like, name_like))
                person['findings'] = [dict(rec) for rec in cur.fetchall()]

    total_links = 0
    total_findings = 0
    crit_findings = 0
    for person in persons:
        links = person.get('links') or []
        findings = person.get('findings') or []
        severe = sum(1 for finding in findings if finding.get('severity') in severe_levels)

        total_links += len(links)
        total_findings += len(findings)
        crit_findings += severe

        # Score: public function 30, party 15, 5 per link (max 40),
        # 10 per finding (max 40), plus 20 per CRITICAL/HIGH finding.
        score = 30 if (person.get('function') or '').strip() else 0
        if (person.get('party') or '').strip():
            score += 15
        score += min(40, len(links) * 5)
        score += min(40, len(findings) * 10)
        score += 20 * severe
        person['risk_score'] = min(100, score)

    overall = max((person.get('risk_score', 0) for person in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall, 'total_links': total_links,
            'total_findings': total_findings, 'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}
# ─── SB-3 — Bulk enrichment ─────────────────────────────────────────────
_BULK_KEY_MAP = {
'klub': ('pgz_sport.klubovi',
('oib','sport','grad','predsjednik','tajnik','web','email','telefon',
'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')),
'savez': ('pgz_sport.savezi',
('oib','sport','predsjednik','tajnik','email','telefon','web',
'adresa','godina_osnutka')),
'sportas': ('pgz_sport.clanovi',
('sport','profile_url','slika_url','hns_igrac_id','biografija',
'datum_rodenja','mjesto_rodenja','broj_dresa')),
}
def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str:
parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)"
for k in keys]
return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})"
def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]:
    """Pick up to `limit` random row ids of `kind` with coverage below `coverage_max`.

    Coverage is the percentage produced by _coverage_sql over the columns
    registered for `kind` in _BULK_KEY_MAP. Rows of kind 'klub' and 'sportas'
    are additionally restricted to ``aktivan = TRUE``; 'savez' has no such
    filter.

    Raises:
        HTTPException(400): `kind` is not a key of _BULK_KEY_MAP.
    """
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, keys = _BULK_KEY_MAP[kind]
    active_filter = "AND aktivan = TRUE" if kind in ('klub', 'sportas') else ''
    # Table name and column list come only from _BULK_KEY_MAP (never from the
    # request); the two user-controlled numbers are bound as parameters.
    query = " ".join([
        f"SELECT id FROM {table}",
        f"WHERE 1=1 {active_filter}",
        f"AND {_coverage_sql('', keys)} < %s",
        "ORDER BY random() LIMIT %s",
    ])
    with _db() as conn, conn.cursor() as cursor:
        cursor.execute(query, (coverage_max, limit))
        picked = []
        for record in cursor.fetchall():
            picked.append(record[0])
        return picked
@router.post("/enrich/bulk")
def enrich_bulk(body: dict = Body(default=None),
                x_user_email: Optional[str] = Header(default=None),
                x_user_id: Optional[int] = Header(default=None)):
    """Run preview+apply over N random under-enriched rows of one kind.

    Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70}
    `limit` is clamped to 1..200 and `coverage_max` to 0..100; a missing or
    falsy value falls back to the default shown above.
    Returns aggregate stats. Synchronous (use polling, not SSE).

    Per-row failures do not abort the run: they are reported as
    {'id', 'error'} entries in `items`.

    Raises:
        HTTPException(400): `kind` is unknown, or `limit` / `coverage_max`
            cannot be converted to an integer.
    """
    body = body or {}
    # str() keeps a non-string kind (e.g. a JSON number) on the 400 path below
    # instead of failing on .strip() with an unhandled AttributeError.
    kind = str(body.get('kind') or '').strip().lower()
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    try:
        limit = max(1, min(int(body.get('limit') or 50), 200))
        coverage_max = max(0, min(int(body.get('coverage_max') or 70), 100))
    except (TypeError, ValueError, OverflowError):
        # Malformed client input is a bad request, not a server error.
        raise HTTPException(400, "limit and coverage_max must be integers")
    ids = _bulk_pick(kind, limit, coverage_max)
    items: list[dict] = []
    fields_total = 0
    started = time.time()
    for eid in ids:
        try:
            row = _load_row(kind, eid)
            if kind == 'klub':
                res = _propose_for_klub(row)
            elif kind == 'savez':
                res = _propose_for_savez(row)
            else:
                res = _propose_for_sportas(row)
            proposed = res.get('proposed') or {}
            srcs = res.get('sources') or []
            if not proposed:
                # Nothing to write for this row; still report it as processed.
                items.append({'id': eid, 'applied': 0, 'fields': []})
                continue
            out = _apply_to_db(kind, eid, proposed, srcs, x_user_email)
            applied = out.get('applied') or {}
            fields_total += len(applied)
            items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())})
            # Audit logging is best-effort: a missing audit module or a failed
            # audit write must not turn an already-applied row into an error.
            try:
                from audit_seal_router import audit_log as _audit_log
                if applied:
                    _audit_log(action='enrich.bulk.apply',
                               target_type=kind, target_id=eid,
                               payload={'applied': applied},
                               user_id=x_user_id, user_email=x_user_email)
            except Exception:
                pass
        except HTTPException as e:
            items.append({'id': eid, 'error': e.detail})
        except Exception as e:
            items.append({'id': eid, 'error': f'{type(e).__name__}: {e}'})
    return {
        'status': 'success',
        'kind': kind,
        'requested': limit,
        'processed': len(items),
        'fields_total': fields_total,
        'elapsed_s': round(time.time() - started, 1),
        'items': items,
    }
# ─── SB-4 — Worker status / control ─────────────────────────────────────
_REDIS_KEYS = {
'heartbeat': 'cc:pgz-enricher:heartbeat',
'pause': 'cc:pgz-enricher:pause',
'run_now': 'cc:pgz-enricher:run_now',
'last_cycle': 'cc:pgz-enricher:last_cycle',
'confidence': 'cc:pgz-enricher:confidence',
'fields_24h': 'cc:pgz-enricher:fields_24h',
}
def _redis_client():
try:
import redis
except Exception:
return None
host = os.environ.get('REDIS_HOST', 'localhost')
port = int(os.environ.get('REDIS_PORT', '6379'))
pwd = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None
# Try with password first (prod); fall back to anonymous (dev box) on AUTH failure.
for p in (pwd, None):
try:
r = redis.Redis(host=host, port=port, password=p,
decode_responses=True, socket_connect_timeout=2)
r.ping()
return r
except Exception:
continue
return None
@router.get("/enrich/worker/status")
def enrich_worker_status():
    """Report worker state from Redis plus recent enrichment_log rows.

    When Redis is unavailable the response is just {'available': False}.
    Otherwise it carries heartbeat, heartbeat_age_s, paused, run_now_pending,
    last_cycle (when stored), confidence_threshold (default 0.7) and
    fields_24h (default 0), followed by 'recent': up to 25 newest
    pgz_sport.enrichment_log rows with created_at as an ISO string.

    A failure while reading Redis is reported in 'error' (fields read before
    the failure are kept); a failure while reading the log yields an empty
    'recent' list.
    """
    r = _redis_client()
    out = {'available': bool(r)}
    if not r:
        return out
    try:
        hb = r.get(_REDIS_KEYS['heartbeat'])
        # Parse once; a non-numeric heartbeat raises here and lands in 'error'.
        hb_ts = int(hb) if hb else None
        out['heartbeat'] = hb_ts
        out['heartbeat_age_s'] = (int(time.time()) - hb_ts) if hb_ts is not None else None
        out['paused'] = (r.get(_REDIS_KEYS['pause']) or '0') == '1'
        out['run_now_pending'] = (r.get(_REDIS_KEYS['run_now']) or '0') == '1'
        last = r.get(_REDIS_KEYS['last_cycle'])
        if last:
            try:
                out['last_cycle'] = json.loads(last)
            except Exception:
                # Not parseable as JSON: expose the raw stored value instead.
                # (Exception, not a bare except, so KeyboardInterrupt and
                # SystemExit still propagate.)
                out['last_cycle'] = last
        conf = r.get(_REDIS_KEYS['confidence'])
        out['confidence_threshold'] = float(conf) if conf else 0.7
        f24 = r.get(_REDIS_KEYS['fields_24h'])
        out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0
    except Exception as e:
        out['error'] = f'{type(e).__name__}: {e}'
    # Recent enrichment_log rows for live activity
    try:
        with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at
FROM pgz_sport.enrichment_log
ORDER BY id DESC LIMIT 25""")
            rows = []
            for rr in cur.fetchall():
                rr = dict(rr)
                if rr.get('created_at'):
                    rr['created_at'] = rr['created_at'].isoformat()
                rows.append(rr)
            out['recent'] = rows
    except Exception:
        # The activity feed is optional; the status payload is still returned.
        out['recent'] = []
    return out
@router.post("/enrich/worker/pause")
def enrich_worker_pause(body: dict = Body(default=None)):
    """Set or clear the worker pause flag in Redis.

    Body: {paused: <truthy|falsy>}. A missing body or a missing 'paused' key
    means "pause". The flag is stored as '1' (paused) or '0' (running).

    Raises:
        HTTPException(503): Redis is unavailable.
    """
    payload = body if body else {}
    should_pause = bool(payload.get('paused', True))
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    flag = '1' if should_pause else '0'
    client.set(_REDIS_KEYS['pause'], flag)
    return {'status': 'success', 'paused': should_pause}
@router.post("/enrich/worker/run-now")
def enrich_worker_run_now():
    """Write '1' to the run_now Redis key.

    Raises:
        HTTPException(503): Redis is unavailable.
    """
    client = _redis_client()
    if client:
        client.set(_REDIS_KEYS['run_now'], '1')
        return {'status': 'success', 'queued': True}
    raise HTTPException(503, 'redis unavailable')
@router.post("/enrich/worker/confidence")
def enrich_worker_confidence(body: dict = Body(...)):
    """Store the confidence threshold in Redis.

    Body: {value: <number in 0..1>}. The value is stored as str(float).

    Raises:
        HTTPException(400): 'value' is missing, not convertible to float, or
            outside 0..1.
        HTTPException(503): Redis is unavailable.
    """
    try:
        threshold = float(body.get('value'))
    except Exception:
        raise HTTPException(400, 'value must be number 0..1')
    # Keep the negated chained comparison: unlike `< 0 or > 1` it also
    # rejects NaN, for which every comparison is False.
    within_bounds = 0.0 <= threshold <= 1.0
    if not within_bounds:
        raise HTTPException(400, 'value out of range 0..1')
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    client.set(_REDIS_KEYS['confidence'], str(threshold))
    return {'status': 'success', 'confidence_threshold': threshold}