Files
pgz-sport/_backups/sprint_1777940323/enrich_router.py
T
damir c38f15a566 R7+: 5x P0 demo fixes — HNS direct link, avatar cache, logo home, klub→sportaši, smarter enrichment
1) HNS direct link u research_links: za sportaš s profile_url/source_url
   (npr. https://semafor.hns.family/igraci/X/...) generira [DIRECT] link na vrhu liste,
   umjesto generic Google search. _research_links sada prima row dict.

2) Avatar cache buster: applyMeToHeader dodaje ?t=Date.now() na sve avatar img tagove.
   Avatar upload handler dodatno persistira novi avatar_url u localStorage.pgz_user
   tako da preživi page refresh + cross-page navigacije.

3) Logo home link: <div class='logo'> → <a href='/' class='logo'> u app.html i sport2.html.
   Klik na PGŽ SPORT logo vodi na public portal.

4) Klub → Sportaši drill-down: u klub Info tabu dodan button
   '👥 Vidi sportaše ovog kluba (N)' koji prebacuje na k-clan tab.
   Plus '🌐 Službena stranica' link kad klub ima web.

5) Smarter klub enrichment:
   - URL validacija (skip placeholder strings poput 'godisnjak_zspgz_2025')
   - Domain candidate guesser (slug → 16 candidate URLs s common HR TLD-ovima i sport prefix-ima)
   - Parallel HEAD probe (8 threads, 10s budget) — first 200 + name token match wins
   - Subpage scrape (/kontakt, /uprava, /o-nama, /o-klubu, /predsjednik) za richer evidence
   - HNK Orijent (id 3766) test: pogađa https://www.orijent.hr/, predlaže web+email+telefon+opis

E2E verified:
- 9/9 sidebar URL-ova → 200
- /users/me/gdpr-export → 200 (28KB JSON)
- /users/me/request-deletion → 200 (DB row pgz_sport.gdpr_erasure_requests)
- /enrich/klub/3766 → 4 proposed fields (web, email, telefon, opis)
- HNS sportaš research_links: HNS profil DIRECT link na vrhu

Backend: routers/enrich_router.py
Frontend: static/app.html, static/sport2.html
Backups: _backups/sprint_1777940670/

Tag: R7-demo-ready
2026-05-05 02:24:30 +02:00

1691 lines
72 KiB
Python

"""
enrich_router.py — v3 enrichment + forensic scan
Author: dradulic@outlook.com / damir@rinet.one
Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)
POST /v2/enrich/{kind}/{eid}
Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
primary club URL if any), regex-extract candidate fields (web/email/
telefon), optionally synthesise descriptions via DeepSeek, and return
a *preview* shape with `proposed` updates the operator can apply.
POST /v2/enrich/{kind}/{eid}/apply
Body shapes:
None / {} → re-run preview, apply every proposed field
{"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced)
Performs UPDATE on the matching table, sets metadata.enriched_at and
metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
returns the after snapshot.
GET /v2/enrich/log?kind=&target_id=&limit=
Read recent enrichment-log entries.
POST /v2/forensic/scan
Search civic.persons by name, return entity links + findings + risk score.
Kinds: klub | savez | sportas
"""
from __future__ import annotations
import os, re, json, time, html, urllib.parse, urllib.request
from datetime import datetime, timezone
from typing import Any, Optional
import psycopg2, psycopg2.extras
from fastapi import APIRouter, HTTPException, Header, Body
# Router object for this module (presumably the endpoints further down the
# file register on it — not visible in this part of the file).
router = APIRouter()
# Database endpoint: PG_HOST/PG_PORT from the environment, with defaults.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))
# When PG_HOST is a loopback address, DB_HOST/DB_PORT are used instead.
if _pgh in ('localhost', '127.0.0.1'):
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))
# Keyword arguments passed to psycopg2.connect() by _db().
# NOTE(review): the PG_PASS default is a credential committed to source —
# consider requiring the environment variable and rotating this value.
DB = dict(host=_pgh, port=_pgp,
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))
# User-Agent header sent with outbound HTTP requests.
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
TIMEOUT = 6 # seconds — fail-soft
# Optional JS-aware fallback (Playwright). Lazy-loaded, never required.
import sys as _sys
_sys.path.insert(0, '/opt/pgz-sport')
try:
    from enrichment import playwright_scraper as _pw_scraper
    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
    # Any import problem simply disables the Playwright fallback.
    _pw_scraper = None
    _HAS_PW = False
# DeepSeek credentials/endpoint; an empty key disables description synthesis
# (see _deepseek_describe).
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                              'https://api.deepseek.com/v1/chat/completions')
# ─── DB helpers ──────────────────────────────────────────────────────────
def _db():
    """Open a new psycopg2 connection built from ``DB`` with autocommit on."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
def _fetch_one(sql, p):
    """Run ``sql`` with parameters ``p`` and return the first row as a dict.

    Returns None when the query yields no row.

    The connection is closed explicitly: psycopg2's ``with connection`` block
    only ends the transaction and leaves the connection open, so the previous
    form leaked one connection per call until garbage collection.
    """
    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            r = cur.fetchone()
    finally:
        conn.close()
    return dict(r) if r else None
# ─── HTTP helpers ────────────────────────────────────────────────────────
def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch ``url`` and return up to the first 150000 bytes as text.

    Fail-soft: returns None for an empty or non-HTTP(S) URL and for any
    network/HTTP error.

    The body is decoded as UTF-8.  Because the read is capped, the cut may
    land inside a multibyte character; in that case only the incomplete tail
    is dropped instead of re-decoding the whole page as Latin-1 (which would
    mangle every č/ć/š/ž/đ).  Bodies that are genuinely not UTF-8 still fall
    back to Latin-1.
    """
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(150000)
    except Exception:
        # Deliberate best-effort: every fetch problem means "no document".
        return None
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError as err:
        # A UTF-8 sequence is at most 4 bytes, so an error that starts in the
        # last 3 bytes is a truncated character, not a wrong encoding.
        if err.start >= len(data) - 3:
            return data[:err.start].decode('utf-8')
        return data.decode('latin-1', 'ignore')
def _strip_tags(s: str) -> str:
    """Reduce an HTML fragment to plain text.

    Script and style elements are removed with their contents, remaining tags
    become spaces, HTML entities are unescaped and whitespace runs collapse to
    a single space.  Returns '' for empty input.
    """
    if not s:
        return ''
    text = s
    for element_re in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
        text = re.sub(element_re, ' ', text, flags=re.S | re.I)
    text = html.unescape(re.sub(r'<[^>]+>', ' ', text))
    return re.sub(r'\s+', ' ', text).strip()
def _extract_meta(html_doc: str, url: str) -> dict:
    """Pull the page title and description out of an HTML document.

    Returns {} for an empty document.  Otherwise the dict holds ``url`` and
    ``fetched_at`` (epoch seconds) plus ``title`` (max 300 chars) and
    ``description`` (max 600 chars) when found.  The description comes from
    ``<meta name="description">`` or, failing that, from
    ``<meta property="og:description">``.

    The content value is matched up to the *same* quote character that opened
    it, so an apostrophe inside a double-quoted value (or the reverse) no
    longer truncates the description.
    """
    if not html_doc:
        return {}
    out = {'url': url, 'fetched_at': int(time.time())}
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m:
        out['title'] = html.unescape(m.group(1).strip())[:300]
    # (?!\1). consumes any character except the opening quote captured in \1.
    description_patterns = (
        r'<meta\s+name=["\']description["\']\s+content=(["\'])((?:(?!\1).)+)\1',
        r'<meta\s+property=["\']og:description["\']\s+content=(["\'])((?:(?!\1).)+)\1',
    )
    for pattern in description_patterns:
        m = re.search(pattern, html_doc, re.I | re.S)
        if m:
            out['description'] = html.unescape(m.group(2).strip())[:600]
            break
    return out
def _fetch_title(url, timeout=5):
    """Fetch ``url`` and return its title/description metadata.

    On a failed fetch returns ``{'url': url, 'error': 'fetch failed'}``, or
    None when ``url`` itself is empty.
    """
    page = _http_get(url, timeout=timeout)
    if page:
        return _extract_meta(page, url)
    if url:
        return {'url': url, 'error': 'fetch failed'}
    return None
# ─── Field extractors ───────────────────────────────────────────────────
RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
# Croatian phone numbers: international form (+385 / 00385 / 385) or local
# form with a leading 0.  The "00" prefix is part of the first alternative so
# that "00385 …" is matched whole instead of being cut short by the local
# alternative's length limit.
RE_PHONE = re.compile(r'(?:(?:\+|00)?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
RE_URL = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)


def _find_email(text: str) -> Optional[str]:
    """Return the first plausible e-mail address in ``text``, lower-cased.

    Addresses containing a known placeholder/tooling fragment (example
    domains, asset names such as ``logo@2x.png``, ``noreply@`` …) are skipped.
    Returns None when nothing acceptable is found.
    """
    if not text:
        return None
    bad = ('@example.', '@test.', '@email.', 'wixpress.com',
           'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
    for m in RE_EMAIL.finditer(text):
        e = m.group(0).lower().rstrip('.,;:)')
        if not any(b in e for b in bad):
            return e
    return None


def _find_phone(text: str) -> Optional[str]:
    """Return the first phone-like number in ``text``, or None.

    Numbers written with a ``+385`` or ``00385`` prefix are normalised to
    ``'+385 <rest>'``; other matches are returned with whitespace runs
    collapsed to single spaces.  A candidate must contain 8–13 digits,
    counted on the normalised form so the ``00`` exit code does not count
    against the limit.
    """
    if not text:
        return None
    for m in RE_PHONE.finditer(text):
        cleaned = re.sub(r'\s+', ' ', m.group(0)).strip()
        candidate = cleaned
        for prefix in ('+385', '00385'):
            if cleaned.startswith(prefix):
                # Drop any separator mix (" ", "-", "/") after the country code.
                candidate = '+385 ' + cleaned[len(prefix):].lstrip(' -/')
                break
        digits = re.sub(r'\D', '', candidate)
        if 8 <= len(digits) <= 13:
            return candidate
    return None
def _find_official_web(text: str, hint: str = '') -> Optional[str]:
    """Pick the most likely official website URL mentioned in ``text``.

    URLs on aggregator/social/registry hosts are ignored.  When ``hint`` (the
    entity name) is given, the first URL whose host contains the first 8
    alphanumeric characters of the hint wins; otherwise the first remaining
    URL is returned.  Returns None when no candidate is left.
    """
    if not text:
        return None
    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
               'rinet.one', 'pgz.hr')
    candidates: list[str] = []
    for match in RE_URL.finditer(text):
        link = match.group(0).rstrip('.,;:)\'"')
        try:
            host = urllib.parse.urlparse(link).hostname or ''
        except Exception:
            continue
        if host and not any(b in host for b in blocked):
            candidates.append(link)
    if not candidates:
        return None
    if hint:
        slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8]
        if slug:
            for link in candidates:
                host = urllib.parse.urlparse(link).hostname or ''
                if slug in host.replace('-', '').replace('.', ''):
                    return link
    return candidates[0]
# ─── External sources ────────────────────────────────────────────────────
def _wiki_variants(query: str) -> list[str]:
    """Build the list of Wikipedia HR page titles to try for ``query``.

    The summary REST API needs an exact title and clubs are often filed under
    their abbreviation, so "<sport adjective> klub <name>" additionally yields
    "<ABBR> <name>" (e.g. "Košarkaški klub KVARNER 2010" → "KK Kvarner 2010").
    The stripped query itself always comes first; duplicates are dropped.
    """
    if not query:
        return []
    sport_to_abbr = {
        'košarkaški': 'KK', 'kosarkaski': 'KK',
        'nogometni': 'NK', 'rukometni': 'RK',
        'odbojkaški': 'OK', 'odbojkaski': 'OK',
        'vaterpolski': 'VK', 'plivacki': 'PK', 'plivački': 'PK',
        'boćarski': 'BK', 'bocarski': 'BK',
    }
    raw = query.strip()
    variants = [raw] if raw else []
    words = raw.split()
    if len(words) >= 3 and words[1].lower() == 'klub':
        abbr = sport_to_abbr.get(words[0].lower())
        if abbr is not None:
            # ALL-CAPS name words are title-cased; everything else is kept.
            tail = [w.capitalize() if w.isupper() else w for w in words[2:]]
            short = abbr + ' ' + ' '.join(tail)
            if short not in variants:
                variants.append(short)
    return variants
def _wiki_summary(query: str) -> Optional[dict]:
    """Return the first usable Wikipedia HR summary for ``query``.

    Each title variant is tried in order; responses that fail to fetch or
    parse, disambiguation/no-extract pages and pages without an extract are
    skipped.  Returns None when no variant produces a summary.
    """
    unusable_types = ('disambiguation', 'no-extract')
    for variant in _wiki_variants(query):
        slug = urllib.parse.quote(variant.replace(' ', '_'), safe='')
        raw = _http_get('https://hr.wikipedia.org/api/rest_v1/page/summary/' + slug, timeout=5)
        if not raw:
            continue
        try:
            d = json.loads(raw)
        except Exception:
            continue
        if d.get('type') in unusable_types or not d.get('extract'):
            continue
        desktop = d.get('content_urls', {}).get('desktop', {})
        return {
            'source': 'wikipedia.hr',
            'url': desktop.get('page'),
            'title': d.get('title'),
            'extract': d.get('extract'),
            'description': d.get('description'),
            'matched_variant': variant,
        }
    return None
def _sport_pgz_search(query: str) -> Optional[dict]:
    """Search sport-pgz.hr for ``query`` and describe the first hit.

    Returns a source dict (``source``, ``url``, ``title`` and, when the hit
    page could be fetched, ``extract`` and ``raw_text``).  When the search
    page cannot be fetched or parsed, the optional Playwright scraper is
    consulted if available; otherwise None is returned.
    """
    if not query:
        return None

    def _js_fallback():
        # JS-rendered fallback, only when the optional scraper was importable.
        if _HAS_PW and _pw_scraper is not None:
            return _pw_scraper.scrape_sport_pgz_klub(query)
        return None

    listing = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
    if not listing:
        return _js_fallback()
    found = re.search(
        r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
        listing, re.S | re.I)
    if not found:
        found = re.search(
            r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>',
            listing, re.I)
    if not found:
        return _js_fallback()
    target = found.group(1)
    link_title = html.unescape(found.group(2).strip())
    page = _http_get(target, timeout=6)
    if not page:
        return {'source': 'sport-pgz.hr', 'url': target, 'title': link_title}
    text = _strip_tags(page)[:4000]
    meta = _extract_meta(page, target)
    return {
        'source': 'sport-pgz.hr',
        'url': target,
        'title': meta.get('title') or link_title,
        'extract': meta.get('description') or text[:500],
        'raw_text': text,
    }
def _fetch_primary_site(url: str) -> Optional[dict]:
    """Fetch the entity's own website and wrap it as a source dict.

    Returns None when the page cannot be fetched.  ``source`` is the host
    name (or the URL itself when no host can be parsed), ``extract`` is the
    meta description or the first 500 characters of text, and ``raw_text``
    holds up to 8000 characters of tag-stripped text.
    """
    page = _http_get(url, timeout=6)
    if not page:
        return None
    plain = _strip_tags(page)
    meta = _extract_meta(page, url)
    host = urllib.parse.urlparse(url).hostname
    return {
        'source': host or url,
        'url': url,
        'title': meta.get('title'),
        'extract': meta.get('description') or plain[:500],
        'raw_text': plain[:8000],
    }
# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a short Croatian description based on ``evidence``.

    Optional and fail-soft: returns None when no API key is configured, when
    there is no non-blank evidence, or when the request or response handling
    fails in any way.  At most 6000 characters of evidence are sent.
    """
    if not DEEPSEEK_KEY or not evidence:
        return None
    joined = "\n---\n".join(e for e in evidence if e)[:6000]
    if not joined.strip():
        return None
    prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
              f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
              f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")
    messages = [
        {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."},
        {"role": "user", "content": prompt},
    ]
    payload = {
        "model": "deepseek-chat",
        "messages": messages,
        "max_tokens": 280, "temperature": 0.3,
    }
    headers = {'Authorization': 'Bearer ' + DEEPSEEK_KEY,
               'Content-Type': 'application/json',
               'User-Agent': UA}
    req = urllib.request.Request(
        DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'),
        headers=headers, method='POST')
    try:
        with urllib.request.urlopen(req, timeout=20) as resp:
            d = json.loads(resp.read().decode('utf-8'))
        first_choice = d.get('choices', [{}])[0]
        answer = first_choice.get('message', {}).get('content', '').strip()
        return answer or None
    except Exception:
        return None
# ─── Row loaders & display name ─────────────────────────────────────────
def _load_row(kind: str, eid: int) -> dict:
    """Load the klub / savez / sportas row with id ``eid``.

    For a sportas the club's sport is used when the athlete's own ``sport``
    is empty.

    Raises:
        HTTPException 400: ``kind`` is not one of klub|savez|sportas.
        HTTPException 404: no row with that id exists.
    """
    if kind not in ('klub', 'savez', 'sportas'):
        raise HTTPException(400, "kind must be klub|savez|sportas")
    if kind == 'klub':
        sql = """SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
sjediste, godina_osnutka, savez_id, scrape_url, source_url,
metadata
FROM pgz_sport.klubovi WHERE id=%s"""
    elif kind == 'savez':
        sql = """SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
adresa, godina_osnutka, source_url, metadata
FROM pgz_sport.savezi WHERE id=%s"""
    else:
        sql = """SELECT c.id, c.ime, c.prezime, c.sport, c.klub_id, c.profile_url,
c.slika_url, c.source_url, c.source, c.source_id,
c.hns_igrac_id, c.biografija,
c.datum_rodenja, c.mjesto_rodenja, c.broj_dresa,
c.visina_cm, c.tezina_kg, c.dominantna_noga, c.oib,
c.vanjski_id, c.metadata,
k.sport AS klub_sport, k.naziv AS klub_naziv
FROM pgz_sport.clanovi c
LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
WHERE c.id=%s"""
    row = _fetch_one(sql, (eid,))
    if not row:
        raise HTTPException(404, kind + " not found")
    # Fall back to klub.sport when c.sport is empty
    if kind == 'sportas' and not row.get('sport') and row.get('klub_sport'):
        row['sport'] = row['klub_sport']
    return row
def _display_name(kind: str, row: dict) -> str:
    """Return the human-readable name of a row.

    Athletes ('sportas') are shown as "ime prezime" (missing parts are
    skipped); every other kind uses ``naziv``.  Never returns None.
    """
    if kind != 'sportas':
        return row.get('naziv', '') or ''
    first = row.get('ime') or ''
    last = row.get('prezime') or ''
    return (first + ' ' + last).strip()
# ─── Sport federations map (loaded once, refresh on file mtime) ─────────
_SPORT_FED_PATH = '/opt/pgz-sport/data/sport_federations.json'
_SPORT_FED_CACHE: dict[str, Any] = {'mtime': 0, 'data': {}, 'aliases': {}, 'media': []}


def _load_sport_feds() -> tuple[dict, dict, list]:
    """Return (feds, aliases, local_media) — refreshed when the JSON changes.

    The file is re-read only when its mtime differs from the cached one.
    Fail-soft behaviour:
      * file missing or not stat-able → three empty containers;
      * file unreadable, invalid JSON, or a top level that is not an object
        → whatever is currently cached (previously a non-object top level
        raised TypeError on ``raw.pop('_meta', None)``).
    """
    def _cached() -> tuple[dict, dict, list]:
        return (_SPORT_FED_CACHE['data'],
                _SPORT_FED_CACHE['aliases'],
                _SPORT_FED_CACHE['media'])

    try:
        st = os.stat(_SPORT_FED_PATH)
    except OSError:
        # Covers FileNotFoundError as well as permission/path problems.
        return ({}, {}, [])
    if st.st_mtime != _SPORT_FED_CACHE['mtime']:
        try:
            with open(_SPORT_FED_PATH, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except Exception:
            return _cached()
        if not isinstance(raw, dict):
            return _cached()
        # Underscore-prefixed keys are bookkeeping, not sports.
        aliases = raw.pop('_aliases', {})
        media = raw.pop('_local_media_pgz', [])
        raw.pop('_meta', None)
        if not isinstance(aliases, dict):
            aliases = {}
        if not isinstance(media, list):
            media = []
        _SPORT_FED_CACHE.update(mtime=st.st_mtime, data=raw, aliases=aliases, media=media)
    return _cached()


def _normalize_sport(sport: Optional[str]) -> Optional[str]:
    """Map a free-form sport name to its key in the federations map.

    The name is stripped and lower-cased, then alias links are followed until
    a non-alias name is reached.  Returns None when the result is not a known
    sport.  Alias chains that loop back on themselves stop at the first
    repeated name instead of spinning forever.
    """
    if not sport:
        return None
    s = sport.strip().lower()
    feds, aliases, _ = _load_sport_feds()
    visited = {s}
    while s in aliases:
        nxt = aliases[s]
        if nxt in visited:
            break
        visited.add(nxt)
        s = nxt
    return s if s in feds else None


def _sport_fed(sport: Optional[str]) -> Optional[dict]:
    """Resolve sport → federations entry (or None)."""
    norm = _normalize_sport(sport)
    if not norm:
        return None
    feds, _, _ = _load_sport_feds()
    return feds.get(norm)
def _research_links(naziv, kind, grad=None, sport: Optional[str] = None):
    """Build the list of manual-research links for an entity.

    Every entry is ``{'label', 'icon', 'url'}``.  The query is the name plus
    the town when given.  Google, Wikipedia HR and sport-pgz.hr are always
    present; further links depend on ``kind``:
      * klub    – Sportilus and the court register;
      * sportas – national and PGŽ federation links for the sport when it is
                  mapped, otherwise HNS Semafor and transfermarkt, followed by
                  the local-media search links;
      * savez   – the sport-pgz.hr federation list.
    """
    def _link(label, icon, url):
        return {'label': label, 'icon': icon, 'url': url}

    name = (naziv or '').strip()
    query = (name + ' ' + grad) if grad else name
    qenc = urllib.parse.quote(query)
    links = [
        _link('Google', '🔍', 'https://www.google.com/search?q=' + qenc),
        _link('Wikipedia HR', '📚', 'https://hr.wikipedia.org/w/index.php?search=' + qenc),
        _link('sport-pgz.hr', '🏅', 'https://sport-pgz.hr/?s=' + qenc),
    ]
    if kind == 'klub':
        links.append(_link('Sportilus', '', 'https://www.sportilus.com/?s=' + qenc))
        links.append(_link('Sudski registar', '', 'https://sudreg.pravosudje.hr/registar/oc/index.html'))
    # Sport-specific federation links (replace static HNS/transfermarkt for sportas)
    fed = _sport_fed(sport) if sport else None
    if kind == 'sportas':
        national = fed.get('national') if fed else None
        if isinstance(national, dict):
            target = (national.get('search_url') or national.get('url') or '').replace('{q}', qenc)
            if target:
                links.append(_link(national.get('name', 'Nacionalni savez'), '🏆', target))
        regional = fed.get('pgz') if fed else None
        if isinstance(regional, dict):
            template = regional.get('search_url') or regional.get('url') or ''
            if template:
                links.append(_link(regional.get('name', 'PGŽ savez'), '🏟', template.replace('{q}', qenc)))
        if not fed:
            # No mapping for this sport → keep transfermarkt as legacy fallback
            links.append(_link('HNS Semafor', '', 'https://semafor.hns.family/?s=' + qenc))
            links.append(_link('transfermarkt', '', 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc))
        # Local PGŽ media for any sportas
        _, _, media = _load_sport_feds()
        for outlet in media:
            target = (outlet.get('search_url') or '').replace('{q}', qenc)
            if target:
                links.append(_link(outlet.get('name', 'Lokalni medij'), '📰', target))
    if kind == 'savez':
        links.append(_link('sport-pgz.hr savezi', '🏅', 'https://sport-pgz.hr/savezi'))
    return links
# ─── Proposal pipelines ─────────────────────────────────────────────────
def _name_tokens(naziv: str) -> list[str]:
    """Significant tokens from an entity name (≥4 chars, de-accented).

    Generic words (klub, savez, sport adjectives, local place names …) are
    dropped, unless that would leave nothing — then all tokens are kept.
    """
    import unicodedata
    stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni',
            'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'}
    folded = unicodedata.normalize('NFKD', naziv or '')
    folded = folded.encode('ascii', 'ignore').decode('ascii').lower()
    words = []
    for piece in re.split(r'[^a-z0-9]+', folded):
        if len(piece) >= 4:
            words.append(piece)
    distinctive = [w for w in words if w not in stop]
    return distinctive if distinctive else words
def _is_relevant(source: dict, tokens: list[str]) -> bool:
    """Tell whether a source actually mentions the entity.

    The source's title, extract and raw text are lower-cased and de-accented;
    the source is relevant when any token occurs in that text.  With no
    tokens every source counts as relevant.
    """
    if not tokens:
        return True
    import unicodedata
    parts = [source.get(key) or '' for key in ('title', 'extract', 'raw_text')]
    haystack = unicodedata.normalize('NFKD', ' '.join(parts).lower())
    haystack = haystack.encode('ascii', 'ignore').decode('ascii')
    for token in tokens:
        if token in haystack:
            return True
    return False
def _propose_for_klub(row: dict) -> dict:
    """Collect web evidence for a club and propose values for empty fields.

    Sources are queried in this order: the club's own site (web /
    web_stranica / source_url / scrape_url), Wikipedia HR, sport-pgz.hr.
    ``web``, ``email`` and ``telefon`` are proposed only from sources that
    mention the club name; ``opis_djelatnosti`` comes from DeepSeek or, failing
    that, from the first extract of at least 80 characters.  Only fields that
    are empty in ``row`` are proposed.

    Returns ``{'proposed': {...}, 'sources': [...]}``.
    """
    def _body(src: dict) -> str:
        return src.get('raw_text') or src.get('extract') or ''

    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
    sources = []
    for doc in (_fetch_primary_site(primary) if primary else None,
                _wiki_summary(naziv),
                _sport_pgz_search(naziv)):
        if doc:
            sources.append(doc)
    evidence = [_body(s) for s in sources]
    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    relevant_blob = '\n\n'.join(_body(s) for s in relevant)
    proposed: dict[str, Any] = {}
    # web/email/telefon: ONLY from sources actually mentioning the entity
    if not row.get('web'):
        web = _find_official_web(relevant_blob, naziv)
        if web:
            proposed['web'] = web
    if not row.get('email'):
        email = _find_email(relevant_blob)
        if email:
            proposed['email'] = email
    if not row.get('telefon'):
        phone = _find_phone(relevant_blob)
        if phone:
            proposed['telefon'] = phone
    if not row.get('opis_djelatnosti'):
        descr_evidence = [_body(s) for s in relevant] or evidence
        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
        if not descr:
            for s in (relevant or sources):
                if s.get('extract') and len(s['extract']) >= 80:
                    descr = s['extract']
                    break
        if descr:
            proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}
def _propose_for_savez(row: dict) -> dict:
    """Collect web evidence for a federation and propose empty contact fields.

    Sources are queried in this order: the federation's own site (web /
    source_url), Wikipedia HR, sport-pgz.hr.  ``web``, ``email`` and
    ``telefon`` are proposed only when empty in ``row`` and only from sources
    that mention the federation name.

    Returns ``{'proposed': {...}, 'sources': [...]}``.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('source_url')
    # The former ``evidence`` list was filled but never read; only ``sources``
    # is needed here because no description is synthesised for a savez.
    sources = []
    pdoc = _fetch_primary_site(primary) if primary else None
    if pdoc:
        sources.append(pdoc)
    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
    spz = _sport_pgz_search(naziv)
    if spz:
        sources.append(spz)
    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)
    proposed: dict[str, Any] = {}
    if not row.get('web'):
        u = _find_official_web(relevant_blob, naziv)
        if u:
            proposed['web'] = u
    if not row.get('email'):
        e = _find_email(relevant_blob)
        if e:
            proposed['email'] = e
    if not row.get('telefon'):
        t = _find_phone(relevant_blob)
        if t:
            proposed['telefon'] = t
    return {'proposed': proposed, 'sources': sources}
# ─── HNS Semafor parsing ────────────────────────────────────────────────
_HNS_BASE = 'https://semafor.hns.family'


def _slugify(name: str) -> str:
    """Lower-case, de-accent and hyphenate ``name`` for use in a URL path."""
    import unicodedata
    s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower()
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')


def _hns_url_from_row(row: dict) -> Optional[str]:
    """Try to build a semafor.hns.family /igraci/ URL for this row.

    Strategies, first success wins:
      1. ``profile_url`` / ``source_url`` already pointing at an /igraci/ page;
      2. the ``hns_igrac_id`` column;
      3. ``vanjski_id`` JSONB keys ``hns_comet`` / ``hns_pid`` (with optional
         ``hns_slug``);
      4. ``source`` starting with ``hns_`` together with ``source_id``.

    An id that is not numeric is skipped (in every strategy) rather than
    raising.  Returns None when no strategy applies.
    """
    def _player_url(player_id, slug) -> Optional[str]:
        try:
            return f'{_HNS_BASE}/igraci/{int(player_id)}/{slug}/'
        except (TypeError, ValueError):
            return None

    # 1) Already-set columns
    for k in ('profile_url', 'source_url'):
        u = row.get(k)
        if u and 'semafor.hns.family/igraci/' in u:
            return u
    name_slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
    # 2) hns_igrac_id column
    pid = row.get('hns_igrac_id')
    if pid:
        url = _player_url(pid, name_slug)
        if url:
            return url
    # 3) vanjski_id JSONB → hns_comet
    vid = row.get('vanjski_id') or {}
    if isinstance(vid, dict):
        comet = vid.get('hns_comet') or vid.get('hns_pid')
        if comet:
            url = _player_url(comet, vid.get('hns_slug') or name_slug)
            if url:
                return url
    # 4) source='hns_semafor' + source_id
    if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
        url = _player_url(row['source_id'], name_slug)
        if url:
            return url
    return None
def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
    """Extract structured fields from a semafor.hns.family player page.

    Uses BeautifulSoup when it can be imported and otherwise delegates to
    ``_parse_hns_player_regex``.  Returns None for an empty document.

    On the BeautifulSoup path the result always has ``source``, ``url``,
    ``raw_text`` (max 4000 chars) and ``extract``, and may additionally have
    ``hns_igrac_id``, ``title``, ``slika_url``, ``broj_dresa``,
    ``datum_rodenja`` (ISO date string), ``mjesto_rodenja``,
    ``trenutni_klub``, ``hns_klub_id`` and ``description`` when the matching
    markup is present.
    """
    if not html_doc: return None
    try:
        from bs4 import BeautifulSoup
    except Exception:
        # bs4 is optional; fall back to the regex-only parser.
        return _parse_hns_player_regex(html_doc, url)
    soup = BeautifulSoup(html_doc, 'html.parser')
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
    # hns_igrac_id from URL
    m = re.search(r'/igraci/(\d+)/', url)
    if m: out['hns_igrac_id'] = int(m.group(1))
    title = soup.find('title')
    if title: out['title'] = title.get_text(strip=True)[:300]
    # Photo
    photo = soup.find('div', class_='photo')
    if photo:
        img = photo.find('img')
        if img and img.get('src'):
            src = img['src']
            # Relative image paths are resolved against the page URL.
            if not src.startswith('http'):
                src = urllib.parse.urljoin(url, src)
            out['slika_url'] = src
    # Player number (jersey)
    pn = soup.find('div', class_='playerName')
    if pn:
        h3 = pn.find('h3')
        if h3:
            t = h3.get_text(strip=True)
            if t.isdigit():
                out['broj_dresa'] = int(t)
    # Date of birth, expected as D.M.YYYY at the start of the <h4> text
    li = soup.find('li', class_='dob')
    if li:
        h4 = li.find('h4')
        if h4:
            t = h4.get_text(' ', strip=True)
            mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
            if mm:
                from datetime import date as _date
                try:
                    out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat()
                except Exception:
                    # Impossible calendar dates are ignored.
                    pass
    # Place of birth
    li = soup.find('li', class_='pob')
    if li:
        h4 = li.find('h4')
        if h4:
            out['mjesto_rodenja'] = h4.get_text(strip=True)
    # Current club (info only — we don't reassign klub_id from here)
    klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if klub_link:
        h4 = klub_link.find('h4')
        if h4:
            out['trenutni_klub'] = h4.get_text(strip=True)
        m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '')
        if m: out['hns_klub_id'] = int(m.group(1))
    # Description (meta)
    meta_d = soup.find('meta', attrs={'name': 'description'})
    if meta_d and meta_d.get('content'):
        out['description'] = meta_d['content'][:600]
    # Make a clean text blob for relevance / DeepSeek
    text = soup.get_text(' ', strip=True)
    out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000]
    # extract: meta description, else the first 500 chars of text, else None.
    out['extract'] = (out.get('description')
                      or (out['raw_text'][:500] if out.get('raw_text') else None))
    return out
def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
    """BS4-free fallback parser for a semafor.hns.family player page.

    Returns None for an empty document.  Otherwise the dict always has
    ``source``, ``url``, ``raw_text`` (max 4000 chars) and ``extract``, and
    may have ``hns_igrac_id``, ``title``, ``slika_url``, ``datum_rodenja``
    (ISO date string), ``mjesto_rodenja``, ``broj_dresa`` and ``description``.

    ``title``, ``raw_text`` and ``extract`` are filled in the same way as the
    BeautifulSoup parser does, so callers get usable evidence text on this
    path too.
    """
    if not html_doc:
        return None
    from datetime import date as _date
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
    m = re.search(r'/igraci/(\d+)/', url)
    if m:
        out['hns_igrac_id'] = int(m.group(1))
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m:
        out['title'] = html.unescape(m.group(1).strip())[:300]
    m = re.search(r'<div class="photo"><img src="([^"]+)"', html_doc)
    if m:
        src = m.group(1)
        # Relative image paths are resolved against the page URL.
        if not src.startswith('http'):
            src = urllib.parse.urljoin(url, src)
        out['slika_url'] = src
    m = re.search(r'<li class="dob">.*?<h4>(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S)
    if m:
        try:
            out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
        except ValueError:
            # Impossible calendar dates are ignored.
            pass
    m = re.search(r'<li class="pob"><i></i><h4>([^<]+)</h4>', html_doc)
    if m:
        out['mjesto_rodenja'] = m.group(1).strip()
    m = re.search(r'<div class="playerName"><h3>(\d+)</h3>', html_doc)
    if m:
        out['broj_dresa'] = int(m.group(1))
    m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html_doc)
    if m:
        out['description'] = m.group(1)[:600]
    # Plain-text blob for relevance checks / description synthesis.
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', html_doc, flags=re.S | re.I)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.S | re.I)
    text = html.unescape(re.sub(r'<[^>]+>', ' ', text))
    out['raw_text'] = re.sub(r'\s+', ' ', text).strip()[:4000]
    out['extract'] = out.get('description') or (out['raw_text'][:500] or None)
    return out
def _hns_fetch_player(url: str) -> Optional[dict]:
    """Download and parse an HNS Semafor player page.

    When the plain fetch fails and the optional Playwright scraper reports a
    rendered page of more than 2000 characters, the plain fetch is retried
    once with a longer timeout (the rendered HTML itself is not returned by
    the scraper).  Returns None when no HTML could be obtained.
    """
    page = _http_get(url, timeout=8)
    if not page and _HAS_PW and _pw_scraper is not None:
        rendered = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
        if rendered and rendered.get('html_len', 0) > 2000:
            page = _http_get(url, timeout=15)
    if not page:
        return None
    return _parse_hns_player(page, url)
# ─── Generic sport-federation scraper ───────────────────────────────────
def _fed_url_from_row(row: dict) -> Optional[str]:
    """Return the row's ``source_url`` / ``profile_url`` if it is a federation link.

    The set of federation hosts is derived from every ``url``, ``search_url``
    and ``profile_url_pattern`` of the national and PGŽ entries in the
    federations map.  ``source_url`` is checked before ``profile_url``.
    Returns None when neither points at a known federation host.
    """
    feds, _, _ = _load_sport_feds()
    # Template placeholders are filled with dummies so the URL parses.
    placeholders = (('{q}', 'x'), ('{slug}', 'x'), ('{hns_pid}', '1'))
    known_hosts = set()
    for entry in feds.values():
        if not isinstance(entry, dict):
            continue
        for scope in ('national', 'pgz'):
            section = entry.get(scope) or {}
            for key in ('url', 'search_url', 'profile_url_pattern'):
                template = section.get(key)
                if not template:
                    continue
                try:
                    sample = template
                    for token, stand_in in placeholders:
                        sample = sample.replace(token, stand_in)
                    host = urllib.parse.urlparse(sample).hostname
                except Exception:
                    continue
                if host:
                    known_hosts.add(host)
    for column in ('source_url', 'profile_url'):
        candidate = row.get(column)
        if not candidate:
            continue
        try:
            host = urllib.parse.urlparse(candidate).hostname or ''
        except Exception:
            continue
        if host in known_hosts:
            return candidate
    return None
def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]:
    """Best-effort parser for a generic sport-federation profile page.

    Returns None for an empty document.  Otherwise the dict always has
    ``source`` (host name), ``url``, ``raw_text`` (max 4000 chars) and
    ``extract``, and may have ``title``, ``description``, ``slika_url``,
    ``datum_rodenja`` (ISO date string), ``mjesto_rodenja`` and
    ``klub_naziv``.  Tolerant of varied page structures.

    The birth labels accept the spellings "rođenja", "rodjenja" and
    "rodenja"; the old character class ``[đdj]?`` could not match the
    two-letter "dj" form.
    """
    if not html_doc:
        return None
    from datetime import date as _date
    prezime = prezime or ''
    host = urllib.parse.urlparse(url).hostname or ''
    out: dict[str, Any] = {
        'source': host,
        'url': url,
    }
    # Title
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m:
        out['title'] = html.unescape(m.group(1).strip())[:300]
    # Meta description
    m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    if m:
        out['description'] = html.unescape(m.group(1).strip())[:600]
    name_tokens = [re.escape(t) for t in (ime, prezime) if t and len(t) >= 3]
    # Pick the first content image whose filename contains the player's name,
    # or fall back to the first non-asset image.
    asset_markers = ('logo', 'icon', 'admin-ajax', 'spinner', 'loader',
                     'sprite', '/themes/', '/icons/', 'gdpr', 'banner',
                     'header', 'footer', 'placeholder', 'avatar-default')
    chosen_img = None
    for src in re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', html_doc, re.I):
        low = src.lower()
        if any(b in low for b in asset_markers):
            continue
        if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')):
            continue
        # Prefer matches on player name in URL
        if name_tokens and any(re.search(t, src, re.I) for t in name_tokens):
            chosen_img = src
            break
        if chosen_img is None:
            chosen_img = src
    if chosen_img:
        if not chosen_img.startswith('http'):
            chosen_img = urllib.parse.urljoin(url, chosen_img)
        out['slika_url'] = chosen_img
    # Plain text body for evidence + label scraping
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', html_doc, flags=re.S | re.I)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
    out['raw_text'] = text[:4000]
    # Extract: meta description, else 500 chars starting 30 chars before the
    # surname's first occurrence (or from the start when it does not occur).
    start = max(0, (text.find(prezime) if prezime else -1) - 30)
    out['extract'] = (out.get('description')
                      or text[start:start + 500]
                      or text[:500])
    # Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …")
    born = r'ro(?:đ|dj|d)enja'
    m = re.search(r'Datum\s+' + born + r'[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I)
    if m:
        try:
            d = re.split(r'[.\-/]', m.group(1))
            out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat()
        except ValueError:
            # Impossible calendar dates are ignored.
            pass
    if 'datum_rodenja' not in out:
        # Only the year is known → recorded as 1 January of that year.
        m = re.search(r'Godina\s+' + born + r'[:\s]+(\d{4})', text, re.I)
        if m:
            try:
                out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat()
            except ValueError:
                pass
    m = re.search(r'Mjesto\s+' + born + r'[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text)
    if m:
        out['mjesto_rodenja'] = m.group(1).strip()
    m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I)
    if m:
        out['klub_naziv'] = m.group(1).strip().rstrip('.')
    return out
def _slugify_simple(s: str) -> str:
    """Lower-case, de-accent and hyphenate ``s``; '' for empty or None input."""
    import unicodedata
    decomposed = unicodedata.normalize('NFKD', s or '')
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    hyphenated = re.sub(r'[^a-z0-9]+', '-', ascii_only.lower())
    return hyphenated.strip('-')
def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]:
    """Try to find and parse the athlete's federation profile page.

    Two strategies against the national federation of ``sport``:
      1. the ``profile_url_pattern`` with ``{slug}`` replaced by the slugified
         full name — accepted when the page mentions the surname;
      2. the ``search_url`` — the first /igraci, /igrac, /sportasi, /clanovi
         or /profil link whose URL contains the slugified surname.

    Returns the parsed profile dict, or None when the sport has no federation
    mapping, nothing matches, or the surname is empty.  Both acceptance checks
    rely on the surname; with an empty one they would accept any page, so no
    lookup is attempted in that case.
    """
    fed = _sport_fed(sport) if sport else None
    if not fed:
        return None
    ime, prezime = ime or '', prezime or ''
    surname_slug = _slugify_simple(prezime)
    if not surname_slug:
        return None
    nat = fed.get('national') or {}
    full_name = (ime + ' ' + prezime).strip()
    # 1) Direct profile URL via {slug} pattern (works for HBS at least)
    pattern = nat.get('profile_url_pattern')
    if pattern and '{slug}' in pattern:
        url = pattern.replace('{slug}', _slugify_simple(full_name))
        body = _http_get(url, timeout=8)
        if body and prezime.lower() in body.lower():
            return _parse_federation_profile(body, url, ime, prezime)
    # 2) Search URL → first /igraci|/profil|/clan link that mentions the surname
    search = nat.get('search_url')
    if not search:
        return None
    body = _http_get(search.replace('{q}', urllib.parse.quote(full_name)), timeout=10)
    if not body:
        return None
    base = nat.get('url', search)
    for href_re in (r'href="([^"]*?/igraci/[^"]+)"',
                    r'href="([^"]*?/igrac/[^"]+)"',
                    r'href="([^"]*?/sportasi/[^"]+)"',
                    r'href="([^"]*?/clanovi/[^"]+)"',
                    r'href="([^"]*?/profil/[^"]+)"'):
        for m in re.finditer(href_re, body, re.I):
            cand = m.group(1)
            if not cand.startswith('http'):
                cand = urllib.parse.urljoin(base, cand)
            if surname_slug not in _slugify_simple(cand):
                continue
            b2 = _http_get(cand, timeout=8)
            if b2:
                return _parse_federation_profile(b2, cand, ime, prezime)
    return None
def _propose_for_sportas(row: dict) -> dict:
    """Collect web evidence for an athlete and propose values for empty fields.

    Sources, in order: the HNS Semafor player page (when an HNS URL can be
    built for the row), a federation profile (a direct link already on the
    row, else a scrape by sport and name), and Wikipedia HR (only when
    ``biografija`` is empty).  Only fields that are empty in ``row`` are
    proposed; HNS data takes precedence over federation data.

    Returns ``{'proposed': {...}, 'sources': [...]}``.
    """
    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    ime, prezime = (row.get('ime') or ''), (row.get('prezime') or '')
    sport = row.get('sport')
    sources, evidence = [], []
    proposed: dict[str, Any] = {}
    # 1) HNS Semafor — only meaningful when sport is football OR row already
    # carries an HNS link.
    hns_doc: Optional[dict] = None
    if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row):
        hns_url = _hns_url_from_row(row)
        if hns_url:
            hns_doc = _hns_fetch_player(hns_url)
            if hns_doc:
                sources.append(hns_doc)
                evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')
    # 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing
    # source_url/profile_url if it points at a known federation host.
    fed_doc: Optional[dict] = None
    direct_fed_url = _fed_url_from_row(row)
    # Skip the direct link when it is the very page already parsed as hns_doc.
    if direct_fed_url and (not hns_doc or hns_doc.get('url') != direct_fed_url):
        body = _http_get(direct_fed_url, timeout=8)
        if body:
            fed_doc = _parse_federation_profile(body, direct_fed_url, ime, prezime)
    if not fed_doc:
        fed_doc = scrape_sport_federation(sport, ime, prezime)
    if fed_doc:
        sources.append(fed_doc)
        evidence.append(fed_doc.get('raw_text') or fed_doc.get('extract') or '')
    # Helper: pick from hns_doc first then fed_doc
    def _pick(field):
        if hns_doc and hns_doc.get(field): return hns_doc[field]
        if fed_doc and fed_doc.get(field): return fed_doc[field]
        return None
    if not row.get('profile_url'):
        v = _pick('url') or (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
        if v: proposed['profile_url'] = v
    if not row.get('source_url'):
        v = (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
        if v: proposed['source_url'] = v
    if not row.get('slika_url'):
        v = _pick('slika_url')
        if v: proposed['slika_url'] = v
    # hns_igrac_id and broj_dresa are taken from the HNS page only.
    if not row.get('hns_igrac_id') and hns_doc and hns_doc.get('hns_igrac_id'):
        proposed['hns_igrac_id'] = hns_doc['hns_igrac_id']
    if not row.get('datum_rodenja'):
        v = _pick('datum_rodenja')
        if v: proposed['datum_rodenja'] = v
    if not row.get('mjesto_rodenja'):
        v = _pick('mjesto_rodenja')
        if v: proposed['mjesto_rodenja'] = v
    if not row.get('broj_dresa') and hns_doc and hns_doc.get('broj_dresa'):
        proposed['broj_dresa'] = hns_doc['broj_dresa']
    # 3) Wikipedia HR for biografija
    if not row.get('biografija'):
        wiki = _wiki_summary(naziv)
        if wiki:
            sources.append(wiki)
            evidence.append(wiki.get('extract') or '')
    # Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet
    if not row.get('biografija'):
        descr = _deepseek_describe(naziv, f'sportaš ({sport})' if sport else 'sportaš', evidence) if evidence else None
        if not descr:
            # First source extract of at least 80 characters, in source order.
            for s in sources:
                ext = s.get('extract')
                if ext and len(ext) >= 80:
                    descr = ext; break
        if descr:
            proposed['biografija'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}
# ─── Endpoints ──────────────────────────────────────────────────────────
# ─── R4 — POST /v2/enrich/forensic/{finding_id} ─────────────────────────
def _extract_pep_name(finding: dict) -> Optional[str]:
"""Pull the primary person name from a forensic_findings row."""
title = (finding.get('title') or '').strip()
desc = (finding.get('description') or '').strip()
payload = finding.get('raw_data') or {}
if isinstance(payload, str):
try: payload = json.loads(payload)
except Exception: payload = {}
if isinstance(payload, dict):
for k in ('person_name', 'name', 'osoba'):
v = payload.get(k)
if v: return str(v).strip()
# Try entities_involved.entity_name
ents = finding.get('entities_involved') or []
if isinstance(ents, str):
try: ents = json.loads(ents)
except Exception: ents = []
if isinstance(ents, list):
for e in ents:
if isinstance(e, dict) and e.get('person_name'):
return str(e['person_name']).strip()
if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''):
# Some entries store person names as entity_name when entity_type='person'
if (e.get('entity_type') or '').lower() in ('person','osoba'):
return str(e['entity_name']).strip()
# Fallback: extract a "Ime Prezime" from the title
m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc)
if m: return f"{m.group(1)} {m.group(2)}"
return None
def _gather_pep_evidence(name: str) -> list[dict]:
    """Collect public-web evidence documents about ``name``.

    The result holds the ``_wiki_summary`` document (when one is returned)
    followed by at most one ``duckduckgo`` entry built from the first result
    link found on the DuckDuckGo HTML results page.
    """
    found: list[dict] = []

    summary = _wiki_summary(name)
    if summary:
        found.append(summary)

    # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs)
    query = urllib.parse.quote(f'"{name}" PGŽ Hrvatska')
    page = _http_get('https://html.duckduckgo.com/html/?q=' + query, timeout=8)
    if not page:
        return found

    # Only the first result link / first snippet on the page are used.
    link = re.search(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>([^<]{6,200})</a>', page)
    snippet = re.search(r'<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', page, re.S)
    if link is None:
        return found

    if snippet:
        # Drop inline markup from the snippet before truncating it.
        extract = re.sub(r'<[^>]+>', ' ', snippet.group(1)).strip()[:600]
    else:
        extract = None
    found.append({
        'source': 'duckduckgo',
        'url': html.unescape(link.group(1))[:500],
        'title': html.unescape(link.group(2)).strip()[:300],
        'extract': extract,
    })
    return found
def _related_entities_for_pep(name: str) -> list[dict]:
    """Look up civic.persons rows matching ``name`` plus their entity links.

    Up to 10 persons are matched by substring (rows with an OIB first). For
    each person that has an OIB, up to 30 rows of civic.person_entity_links
    (left-joined to civic.entities) are attached under ``entities``.
    """
    person_sql = """SELECT id, name, function, party, county, city, oib, trust_tier
                    FROM civic.persons
                    WHERE upper(name) ILIKE upper(%s)
                    ORDER BY oib NULLS LAST, id LIMIT 10"""
    links_sql = """SELECT pel.entity_id, pel.roles, e.name AS entity_name,
                          e.oib AS entity_oib, e.entity_type, e.city, e.risk_score
                   FROM civic.person_entity_links pel
                   LEFT JOIN civic.entities e ON e.id = pel.entity_id
                   WHERE pel.person_oib=%s LIMIT 30"""
    results: list[dict] = []
    with _db() as conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(person_sql, ('%' + name + '%',))
        # Materialise the person rows first: the same cursor is reused below.
        people = [dict(rec) for rec in cur.fetchall()]
        for person in people:
            linked: list[dict] = []
            if person.get('oib'):
                cur.execute(links_sql, (person['oib'],))
                linked = [dict(rec) for rec in cur.fetchall()]
            results.append({
                'kind': 'person',
                'person_id': person['id'],
                'person_name': person['name'],
                'function': person.get('function'),
                'party': person.get('party'),
                'county': person.get('county'),
                'city': person.get('city'),
                'oib': person.get('oib'),
                'trust_tier': person.get('trust_tier'),
                'entities': linked,
            })
    return results
@router.post("/enrich/forensic/{finding_id}")
def enrich_forensic_v2(finding_id: int,
                       body: dict = Body(default=None),
                       x_user_email: Optional[str] = Header(default=None),
                       x_user_id: Optional[int] = Header(default=None)):
    """Enrich a forensic finding: gather Wiki + DDG snippets + civic graph,
    write back to civic.forensic_findings.related_entities, and seal the
    payload hash on Polygon (or queue for sealing).

    Args:
        finding_id: id of the civic.forensic_findings row.
        body: optional JSON object; ``name`` overrides the derived name.
        x_user_email / x_user_id: caller identity recorded in the enrichment
            history and passed to the seal call.

    Raises:
        HTTPException 404: the finding does not exist.
        HTTPException 400: no name was supplied and none could be derived.

    Returns:
        Dict with the name used, sources, related entities, the stored
        enrichment metadata and the seal result (``{'error': ...}`` when
        sealing failed; sealing failures never fail the request).
    """
    body = body or {}
    # str() so a non-string "name" in the body cannot crash on .strip()
    explicit_name = str(body.get('name') or '').strip() or None
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""SELECT id, finding_type, severity, title, description,
                              entities_involved, raw_data, related_entities, enrichment_metadata
                       FROM civic.forensic_findings WHERE id=%s""", (finding_id,))
        finding = cur.fetchone()
    if not finding:
        raise HTTPException(404, "finding not found")
    finding = dict(finding)

    name = explicit_name or _extract_pep_name(finding)
    if not name:
        raise HTTPException(400, "could not derive a person/entity name; pass {name: \"\"}")

    sources = _gather_pep_evidence(name)
    related = _related_entities_for_pep(name)
    payload = {
        'finding_id': finding_id,
        'name': name,
        'sources': [{'source': s.get('source'), 'url': s.get('url'),
                     'title': s.get('title')} for s in sources],
        'related_entities': related,
        'enriched_at': datetime.now(timezone.utc).isoformat(),
    }

    # Persist back to the finding
    enrichment_meta = finding.get('enrichment_metadata') or {}
    if not isinstance(enrichment_meta, dict):
        enrichment_meta = {}
    history = enrichment_meta.get('history')
    if not isinstance(history, list):
        # A malformed stored value must not break .append() below.
        history = []
    history.append({
        'at': payload['enriched_at'],
        'sources': payload['sources'],
        'related_count': len(related),
        'user': x_user_email,
    })
    enrichment_meta['history'] = history[-10:]  # keep only the 10 newest entries
    enrichment_meta['enriched_at'] = payload['enriched_at']
    enrichment_meta['enriched_by'] = x_user_email or 'system'
    enrichment_meta['source_count'] = len(sources)
    with _db() as c, c.cursor() as cur:
        cur.execute("""UPDATE civic.forensic_findings
                       SET related_entities = %s::jsonb,
                           enrichment_metadata = %s::jsonb
                       WHERE id=%s
                       RETURNING id""",
                    (json.dumps(related, default=str, ensure_ascii=False),
                     json.dumps(enrichment_meta, default=str, ensure_ascii=False),
                     finding_id))
        cur.fetchone()
        # Explicit commit, as enrich_forensic does; redundant but harmless if
        # the _db() context manager also commits on exit.
        c.commit()

    # Seal the enrichment payload hash on Polygon (or queue if no key)
    seal_result: dict[str, Any] = {}
    try:
        try:
            from blockchain import seal as _seal_mod  # noqa: E402
        except Exception:
            import sys as _ssys
            # Add the deployment root once; repeated inserts would grow
            # sys.path on every request whose first import attempt fails.
            if '/opt/pgz-sport' not in _ssys.path:
                _ssys.path.insert(0, '/opt/pgz-sport')
            from blockchain import seal as _seal_mod  # noqa: E402
        h = _seal_mod.hash_payload(payload)
        seal_result = _seal_mod.seal_to_polygon(
            data_hash=h,
            ref_id=str(finding_id),
            action='forensic.enriched',
            ref_type='forensic_finding',
            payload=payload,
            user_id=x_user_id,
            user_email=x_user_email,
        )
    except Exception as e:
        # Best-effort: report the failure in the response instead of raising.
        seal_result = {'error': f'{type(e).__name__}: {e}'}

    return {
        'finding_id': finding_id,
        'name': name,
        'sources': sources,
        'related_entities': related,
        'related_count': len(related),
        'enrichment_metadata': enrichment_meta,
        'seal': seal_result,
    }
from fastapi import Path as _FPath
@router.post("/enrich/{kind:str}/{eid:int}")
def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid: int = 0):
    """Preview enrichment for one klub / savez / sportas row without writing.

    Loads the row, runs the matching ``_propose_for_*`` function and returns
    the proposed values next to the current ones, together with field
    coverage statistics, research links and federation info.
    """
    row = _load_row(kind, eid)
    if kind == 'klub':
        res = _propose_for_klub(row)
    elif kind == 'savez':
        res = _propose_for_savez(row)
    else:
        res = _propose_for_sportas(row)

    # Columns counted towards the coverage percentage, per entity kind.
    tracked_by_kind = {
        'klub': ['oib', 'sport', 'grad', 'predsjednik', 'tajnik', 'web', 'email', 'telefon',
                 'sjediste', 'godina_osnutka', 'ciljevi', 'opis_djelatnosti'],
        'savez': ['oib', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon', 'web',
                  'adresa', 'godina_osnutka'],
    }
    sportas_tracked = ['sport', 'profile_url', 'slika_url', 'hns_igrac_id', 'biografija',
                       'datum_rodenja', 'mjesto_rodenja', 'broj_dresa', 'visina_cm', 'tezina_kg',
                       'dominantna_noga', 'oib']
    tracked = tracked_by_kind.get(kind, sportas_tracked)

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    # First non-empty URL-like column is used for the live title snippet.
    primary = None
    for url_column in ('web', 'web_stranica', 'source_url', 'scrape_url', 'profile_url'):
        primary = row.get(url_column)
        if primary:
            break

    missing = [column for column in tracked if not row.get(column)]
    filled = len(tracked) - len(missing)
    coverage = round(filled / len(tracked) * 100)

    proposed = res['proposed']
    current = {column: row.get(column) for column in proposed}
    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}

    live_snippet = _fetch_title(primary) if primary else None
    research_links = _research_links(naziv, kind, grad, sport=row.get('sport'))
    fed = _sport_fed(row.get('sport'))
    if fed:
        national = fed.get('national') or {}
        federation = {
            'national': national.get('name'),
            'national_url': national.get('url'),
            'pgz': (fed.get('pgz') or {}).get('name'),
        }
    else:
        federation = {'national': None, 'national_url': None, 'pgz': None}

    return {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(tracked),
        'missing_fields': missing,
        'live_snippet': live_snippet,
        'research_links': research_links,
        'sport': row.get('sport'),
        'sport_federation': federation,
        'sources': res['sources'],
        'current': current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
    }
_TABLE_MAP = {
'klub': ('pgz_sport.klubovi',
{'web','email','telefon','predsjednik','tajnik',
'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
'savez': ('pgz_sport.savezi',
{'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
'sportas': ('pgz_sport.clanovi',
{'biografija','profile_url','source_url','slika_url','hns_igrac_id',
'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
'tezina_kg','dominantna_noga','oib'}),
}
def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Write proposed values into empty columns of one row and log the change.

    Only columns whitelisted in ``_TABLE_MAP[kind]`` are considered, blank
    values are skipped, and a column that already holds a truthy value is
    never overwritten. The row's ``metadata`` is always updated with the
    enrichment timestamp, source names and a rolling history (last 10
    entries), and one row is inserted into pgz_sport.enrichment_log.

    Args:
        kind: 'klub', 'savez' or 'sportas'.
        eid: primary key of the target row.
        fields: ``{column: value}`` proposals (may come from the request body).
        sources: source documents; entries that are not dicts are ignored.
        user_email: recorded in the history entry and the log row.

    Raises:
        HTTPException 400: unknown ``kind`` or ``fields`` is not an object.
        HTTPException 404: the target row does not exist.

    Returns:
        ``{'applied': {column: value}, 'after': snapshot of selected columns}``.
    """
    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    if fields and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    table, allowed = _TABLE_MAP[kind]

    # `sources` can be client-supplied; anything that is not a dict would
    # crash on .get(), so it is dropped up front.
    src_docs = [s for s in (sources or []) if isinstance(s, dict)]
    src_names = [s.get('source') for s in src_docs if s.get('source')]
    src_urls = [s.get('url') for s in src_docs if s.get('url')]

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        # Lock the row so the "never overwrite" check and the UPDATE see the same state.
        cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
        before = cur.fetchone()
        if not before:
            raise HTTPException(404, kind + " not found")
        before = dict(before)

        sets, params, applied = [], [], {}
        for k, v in (fields or {}).items():
            if k not in allowed:
                continue  # whitelist: k is interpolated into the SQL text below
            if v is None or str(v).strip() == '':
                continue
            if before.get(k):
                continue  # never overwrite existing
            sets.append(f"{k} = %s")
            params.append(v)
            applied[k] = v

        meta_in = before.get('metadata') or {}
        if not isinstance(meta_in, dict):
            meta_in = {}
        now_iso = datetime.now(timezone.utc).isoformat()
        meta_in['enriched_at'] = now_iso
        meta_in['enrichment_source'] = src_names
        history = meta_in.get('enrichment_history')
        if not isinstance(history, list):
            history = []
        history.append({
            'at': now_iso,
            'fields': list(applied.keys()),
            'sources': src_names,
            'urls': src_urls,
            'user': user_email,
        })
        meta_in['enrichment_history'] = history[-10:]
        sets.append("metadata = %s::jsonb")
        params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
        params.append(eid)
        cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
        after = dict(cur.fetchone())

        logged_keys = list(applied.keys()) + ['metadata']
        cur.execute(
            """INSERT INTO pgz_sport.enrichment_log
               (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
               VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
            (kind, eid,
             ','.join(str(n) for n in src_names)[:120] if src_names else None,
             (src_docs[0].get('url') if src_docs else None),
             list(applied.keys()) or None,
             json.dumps({k: before.get(k) for k in logged_keys},
                        ensure_ascii=False, default=str),
             json.dumps({k: after.get(k) for k in logged_keys},
                        ensure_ascii=False, default=str),
             user_email))
        # Explicit commit, as enrich_forensic does; redundant but harmless if
        # the _db() context manager also commits on exit.
        c.commit()

    snap_keys = ('id', 'naziv', 'ime', 'prezime', 'web', 'email', 'telefon',
                 'opis_djelatnosti', 'biografija', 'metadata')
    return {'applied': applied,
            'after': {k: after.get(k) for k in snap_keys if k in after}}
@router.post("/enrich/{kind:str}/{eid:int}/apply")
def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                 eid: int = 0,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None),
                 x_user_id: Optional[int] = Header(default=None)):
    """Apply enrichment values to one klub / savez / sportas row.

    When the body carries no ``fields``, the proposals are recomputed with
    the matching ``_propose_for_*`` function and applied as-is. The write
    itself is delegated to ``_apply_to_db``; a successful write is also
    reported to the audit log on a best-effort basis.

    Raises:
        HTTPException 400: ``fields`` is not an object or ``sources`` is not
            a list.

    Returns:
        Dict with ``status`` ('success' or 'no_changes'), the applied field
        names/count and everything returned by ``_apply_to_db``.
    """
    body = body or {}
    fields = body.get('fields')
    sources = body.get('sources')
    # Reject malformed client input explicitly instead of failing deep inside
    # the write path with an AttributeError.
    if fields is not None and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if sources is not None and not isinstance(sources, list):
        raise HTTPException(400, "sources must be a list")

    if not fields:
        row = _load_row(kind, eid)
        if kind == 'klub':
            res = _propose_for_klub(row)
        elif kind == 'savez':
            res = _propose_for_savez(row)
        else:
            res = _propose_for_sportas(row)
        fields = res['proposed']
        sources = res['sources']

    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    applied = out.get('applied') or {}

    # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
    if applied:
        try:
            from audit_seal_router import audit_log as _audit_log
            _audit_log(
                action='enrich.apply',
                target_type=kind,
                target_id=eid,
                payload={'applied': applied,
                         'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]},
                user_id=x_user_id,
                user_email=x_user_email,
            )
        except Exception:
            # Auditing is best-effort and must not fail the request, but the
            # failure should at least be visible in the logs.
            import logging
            logging.getLogger(__name__).warning(
                "audit log write failed for enrich.apply %s:%s", kind, eid, exc_info=True)

    return {
        'status': 'success' if applied else 'no_changes',
        'kind': kind,
        'id': eid,
        'applied_count': len(applied),
        'applied_fields': list(applied.keys()),
        **out,
    }
@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """List the newest rows of pgz_sport.enrichment_log.

    Args:
        kind: optional filter on the ``kind`` column.
        target_id: optional filter on ``target_id`` (a falsy value means no
            filter).
        limit: number of rows, clamped to 1..200; 0 falls back to 50.

    Returns:
        ``{'count': n, 'rows': [...]}`` with ``created_at`` as ISO strings.
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    # Lower bound added: a negative LIMIT is rejected by PostgreSQL and would
    # surface as a 500.
    params.append(max(1, min(int(limit or 50), 200)))
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(sql, params)
        rows = [dict(r) for r in cur.fetchall()]
    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}
# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete suggestions for the Mreža search inputs.
    type ∈ {person, club, company, ''} — empty means all.
    Returns: {query, results: [{id, label, type, sub}]}

    Queries shorter than 2 characters return no results. ``limit`` is
    clamped to 1..50 and applied per table; the combined list is cut to
    ``limit * 2`` entries in query order (clubs, federations, athletes,
    civic persons, civic entities).
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))
    # Escape LIKE metacharacters so a typed % or _ matches literally instead
    # of acting as a wildcard (backslash is PostgreSQL's default LIKE escape).
    like = '%' + q.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_') + '%'
    out = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        if type in ('', 'club'):
            cur.execute("""
                SELECT id, naziv AS label, sport, grad
                FROM pgz_sport.klubovi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (like, limit))
            for r in cur.fetchall():
                out.append({'id': 'klub:' + str(r['id']), 'label': r['label'], 'type': 'club',
                            'sub': (r.get('sport') or '') + ' · ' + (r.get('grad') or '')})
            cur.execute("""
                SELECT id, naziv AS label, sport
                FROM pgz_sport.savezi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (like, limit))
            for r in cur.fetchall():
                out.append({'id': 'savez:' + str(r['id']), 'label': r['label'], 'type': 'savez',
                            'sub': r.get('sport') or 'savez'})
        if type in ('', 'person'):
            cur.execute("""
                SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
                FROM pgz_sport.clanovi c
                LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
                ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
                LIMIT %s
            """, (like, limit))
            for r in cur.fetchall():
                club_suffix = ' · ' + r['klub_naziv'] if r.get('klub_naziv') else ''
                out.append({'id': 'sportas:' + str(r['id']),
                            'label': (r.get('ime') or '') + ' ' + (r.get('prezime') or ''),
                            'type': 'person',
                            'sub': (r.get('sport') or 'sportaš') + club_suffix})
            cur.execute("""
                SELECT id, name AS label, function, oib, county
                FROM civic.persons
                WHERE name ILIKE %s
                ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """, (like, limit))
            for r in cur.fetchall():
                out.append({'id': 'civic_person:' + str(r['id']),
                            'label': r['label'], 'type': 'person',
                            'sub': (r.get('function') or 'civic') + ' · ' + (r.get('county') or '')})
        if type in ('', 'company'):
            cur.execute("""
                SELECT id, name AS label, oib, city, entity_type
                FROM civic.entities
                WHERE name ILIKE %s
                ORDER BY length(name) LIMIT %s
            """, (like, limit))
            for r in cur.fetchall():
                out.append({'id': 'civic_entity:' + str(r['id']),
                            'label': r['label'], 'type': 'company',
                            'sub': (r.get('entity_type') or 'tvrtka') + ' · ' + (r.get('city') or '')})
    return {'query': q, 'results': out[:limit * 2]}
# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Look up the forensic finding, derive the PEP person name from
    entities_involved or title, hit Wikipedia HR for a summary, and persist
    the enriched payload into raw_data.enrichment; ai_analysis is filled with
    the Wikipedia extract only when it is currently NULL.

    At most the first three distinct name candidates are tried.

    Raises:
        HTTPException 404: the finding does not exist.
    """
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, finding_type, severity, title, description, entities_involved,
                   raw_data, ai_analysis
            FROM civic.forensic_findings WHERE id=%s
        """, (finding_id,))
        f = cur.fetchone()
        if not f:
            raise HTTPException(404, "finding not found")
        f = dict(f)

        # Derive person name candidates
        ei = f.get('entities_involved')
        if isinstance(ei, str):
            # Accept a JSON-encoded column value, as _extract_pep_name does.
            try:
                ei = json.loads(ei)
            except ValueError:
                ei = None
        found = []
        if isinstance(ei, dict):
            for k in ('person', 'name', 'osoba', 'PEP', 'pep'):
                if ei.get(k):
                    found.append(ei[k])
            # Also try persons: [...] list
            persons = ei.get('persons') or ei.get('osobe') or []
            if not isinstance(persons, list):
                # A scalar here must not be iterated character by character.
                persons = [persons]
            for p in persons:
                if isinstance(p, dict) and p.get('name'):
                    found.append(p['name'])
                elif isinstance(p, str):
                    found.append(p)
        elif isinstance(ei, list):
            for it in ei:
                if isinstance(it, dict):
                    for k in ('name', 'person', 'label'):
                        if it.get(k):
                            found.append(it[k])
                            break
                elif isinstance(it, str):
                    found.append(it)
        # Normalise to stripped strings and drop duplicates (order kept) so
        # the three lookup attempts below are not spent on the same name.
        candidates = list(dict.fromkeys(
            text for text in (str(item).strip() for item in found) if text))

        if not candidates and f.get('title'):
            # Heuristic: extract first capitalised "Ime Prezime" pair
            m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', f['title'])
            if m:
                candidates.append(m.group(0))

        wiki = None
        used_query = None
        for q in candidates[:3]:
            wiki = _wiki_summary(q)
            if wiki:
                used_query = q
                break

        # Build enrichment payload
        enrichment = {
            'queried': candidates[:5],
            'used_query': used_query,
            'wiki': wiki,
            'enriched_at': datetime.now(timezone.utc).isoformat(),
        }
        # Persist into raw_data.enrichment
        raw = f.get('raw_data')
        if raw is None:
            raw = {}
        if not isinstance(raw, dict):
            raw = {'_legacy': raw}  # keep a non-object value instead of discarding it
        raw['enrichment'] = enrichment
        cur.execute("""
            UPDATE civic.forensic_findings
            SET raw_data = %s::jsonb,
                ai_analysis = COALESCE(ai_analysis, %s)
            WHERE id = %s
        """, (json.dumps(raw, default=str, ensure_ascii=False),
              (wiki or {}).get('extract'),
              finding_id))
        c.commit()
    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }
# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """Scan civic data for persons matching ``req['name']``.

    For each of up to 25 matching civic.persons rows, attaches entity links
    and forensic findings (both looked up by OIB; when no findings are
    attached, a title/description search by the person's name is used
    instead) and computes a 0..100 ``risk_score``.

    Raises:
        HTTPException 400: ``name`` is shorter than 3 characters.

    Returns:
        Aggregate counters, the highest per-person risk score and the
        per-person details.
    """
    # str() so a non-string "name" in the body cannot crash on .strip()
    name = str(req.get('name') or '').strip()
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, name, function, party, county, city, oib, trust_tier
            FROM civic.persons
            WHERE upper(name) ILIKE upper(%s)
            ORDER BY oib NULLS LAST, id LIMIT 25
        """, ('%' + name + '%',))
        persons = [dict(r) for r in cur.fetchall()]
        for p in persons:
            p['links'] = []
            p['findings'] = []
            if p.get('oib'):
                cur.execute("""
                    SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                           e.entity_type, e.city, e.risk_score
                    FROM civic.person_entity_links pel
                    LEFT JOIN civic.entities e ON e.id = pel.entity_id
                    WHERE pel.person_oib = %s LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]
                # NULLS LAST: PostgreSQL sorts NULLs first under DESC, which
                # would rank unscored findings above the most severe ones.
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE entities_involved::text ILIKE %s
                    ORDER BY severity_score DESC NULLS LAST, created_at DESC LIMIT 30
                """, ('%' + str(p['oib']) + '%',))
                p['findings'] = [dict(r) for r in cur.fetchall()]
            if not p['findings']:
                # Fallback: search the finding text for the person's name.
                name_like = '%' + p['name'] + '%'
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE title ILIKE %s OR description ILIKE %s
                    ORDER BY severity_score DESC NULLS LAST, created_at DESC LIMIT 30
                """, (name_like, name_like))
                p['findings'] = [dict(r) for r in cur.fetchall()]

    total_links = total_findings = crit_findings = 0
    for p in persons:
        links = p.get('links') or []
        findings = p.get('findings') or []
        severe = sum(1 for f in findings if f.get('severity') in ('CRITICAL', 'HIGH'))
        total_links += len(links)
        total_findings += len(findings)
        crit_findings += severe
        # Score: public function 30, party 15, links 5 each (max 40),
        # findings 10 each (max 40), plus 20 per CRITICAL/HIGH finding.
        score = 0
        if (p.get('function') or '').strip():
            score += 30
        if (p.get('party') or '').strip():
            score += 15
        score += min(40, len(links) * 5)
        score += min(40, len(findings) * 10)
        score += 20 * severe
        p['risk_score'] = min(100, score)
    overall = max((p.get('risk_score', 0) for p in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall, 'total_links': total_links,
            'total_findings': total_findings, 'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}
# ─── SB-3 — Bulk enrichment ─────────────────────────────────────────────
_BULK_KEY_MAP = {
'klub': ('pgz_sport.klubovi',
('oib','sport','grad','predsjednik','tajnik','web','email','telefon',
'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')),
'savez': ('pgz_sport.savezi',
('oib','sport','predsjednik','tajnik','email','telefon','web',
'adresa','godina_osnutka')),
'sportas': ('pgz_sport.clanovi',
('sport','profile_url','slika_url','hns_igrac_id','biografija',
'datum_rodenja','mjesto_rodenja','broj_dresa')),
}
def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str:
parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)"
for k in keys]
return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})"
def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]:
    """Pick random row ids of ``kind`` whose field coverage is below a threshold.

    Args:
        kind: 'klub', 'savez' or 'sportas'.
        limit: maximum number of ids to return.
        coverage_max: rows with coverage (0..100) strictly below this value
            qualify.

    Raises:
        HTTPException 400: unknown ``kind``.
    """
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, columns = _BULK_KEY_MAP[kind]
    coverage_expr = _coverage_sql('', columns)
    # Clubs and athletes are restricted to active rows; federations are not.
    active_filter = "AND aktivan = TRUE" if kind in ('klub', 'sportas') else ''
    query = (f"SELECT id FROM {table} WHERE 1=1 {active_filter} "
             f"AND {coverage_expr} < %s ORDER BY random() LIMIT %s")
    with _db() as conn, conn.cursor() as cur:
        cur.execute(query, (coverage_max, limit))
        picked = cur.fetchall()
    return [record[0] for record in picked]
@router.post("/enrich/bulk")
def enrich_bulk(body: dict = Body(default=None),
                x_user_email: Optional[str] = Header(default=None),
                x_user_id: Optional[int] = Header(default=None)):
    """Run preview+apply over N random under-enriched rows of one kind.
    Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70}
    Returns aggregate stats. Synchronous (use polling, not SSE).

    ``limit`` is clamped to 1..200 and ``coverage_max`` to 0..100; a missing
    or falsy value falls back to the default shown above. A failure on one
    row is recorded in that row's item and does not stop the run.

    Raises:
        HTTPException 400: unknown ``kind`` or non-integer ``limit`` /
            ``coverage_max``.
    """
    body = body or {}
    kind = str(body.get('kind') or '').strip().lower()
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    try:
        limit = int(body.get('limit') or 50)
        coverage_max = int(body.get('coverage_max') or 70)
    except (TypeError, ValueError, OverflowError):
        # Malformed numbers are a client error, not a server fault.
        raise HTTPException(400, "limit and coverage_max must be integers") from None
    limit = max(1, min(limit, 200))
    coverage_max = max(0, min(coverage_max, 100))

    # Resolve the audit hook once instead of once per row; auditing is
    # optional, so a failed import only disables it.
    try:
        from audit_seal_router import audit_log as _audit_log
    except Exception:
        _audit_log = None

    ids = _bulk_pick(kind, limit, coverage_max)
    items: list[dict] = []
    fields_total = 0
    started = time.time()
    for eid in ids:
        try:
            row = _load_row(kind, eid)
            if kind == 'klub':
                res = _propose_for_klub(row)
            elif kind == 'savez':
                res = _propose_for_savez(row)
            else:
                res = _propose_for_sportas(row)
            proposed = res.get('proposed') or {}
            srcs = res.get('sources') or []
            if not proposed:
                items.append({'id': eid, 'applied': 0, 'fields': []})
                continue
            out = _apply_to_db(kind, eid, proposed, srcs, x_user_email)
            applied = out.get('applied') or {}
            fields_total += len(applied)
            items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())})
            if applied and _audit_log is not None:
                try:
                    _audit_log(action='enrich.bulk.apply',
                               target_type=kind, target_id=eid,
                               payload={'applied': applied},
                               user_id=x_user_id, user_email=x_user_email)
                except Exception:
                    # Best-effort: the row is already written; just make the
                    # audit failure visible.
                    import logging
                    logging.getLogger(__name__).warning(
                        "audit log write failed for enrich.bulk.apply %s:%s",
                        kind, eid, exc_info=True)
        except HTTPException as e:
            items.append({'id': eid, 'error': e.detail})
        except Exception as e:
            items.append({'id': eid, 'error': f'{type(e).__name__}: {e}'})
    return {
        'status': 'success',
        'kind': kind,
        'requested': limit,
        'processed': len(items),
        'fields_total': fields_total,
        'elapsed_s': round(time.time() - started, 1),
        'items': items,
    }
# ─── SB-4 — Worker status / control ─────────────────────────────────────
_REDIS_KEYS = {
'heartbeat': 'cc:pgz-enricher:heartbeat',
'pause': 'cc:pgz-enricher:pause',
'run_now': 'cc:pgz-enricher:run_now',
'last_cycle': 'cc:pgz-enricher:last_cycle',
'confidence': 'cc:pgz-enricher:confidence',
'fields_24h': 'cc:pgz-enricher:fields_24h',
}
def _redis_client():
    """Return a connected Redis client, or ``None`` when Redis is unusable.

    Connection settings come from REDIS_HOST / REDIS_PORT / REDIS_PASS
    (surrounding quotes and whitespace are stripped from the password).
    ``None`` is returned when the ``redis`` package cannot be imported or no
    connection attempt answers a PING.
    """
    try:
        import redis
    except Exception:
        return None
    host = os.environ.get('REDIS_HOST', 'localhost')
    port = int(os.environ.get('REDIS_PORT', '6379'))
    pwd = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None
    # Try with password first (prod); fall back to anonymous (dev box) on AUTH failure.
    # Without a configured password there is only one distinct attempt, so
    # an unreachable server is not waited on twice.
    attempts = (pwd, None) if pwd else (None,)
    for p in attempts:
        try:
            r = redis.Redis(host=host, port=port, password=p,
                            decode_responses=True, socket_connect_timeout=2)
            r.ping()
            return r
        except Exception:
            continue
    return None
@router.get("/enrich/worker/status")
def enrich_worker_status():
    """Report the enrichment worker state stored in Redis plus recent log rows.

    Returns ``{'available': False}`` when Redis is unusable. Otherwise the
    result carries heartbeat, pause / run-now flags, last cycle, confidence
    threshold (0.7 when unset) and the 24h field counter, with ``error`` set
    if reading Redis failed, and ``recent`` holding the 25 newest
    enrichment_log rows (an empty list when that query fails).
    """
    r = _redis_client()
    out = {'available': bool(r)}
    if not r:
        return out
    try:
        hb = r.get(_REDIS_KEYS['heartbeat'])
        out['heartbeat'] = int(hb) if hb else None
        out['heartbeat_age_s'] = (int(time.time()) - int(hb)) if hb else None
        out['paused'] = (r.get(_REDIS_KEYS['pause']) or '0') == '1'
        out['run_now_pending'] = (r.get(_REDIS_KEYS['run_now']) or '0') == '1'
        last = r.get(_REDIS_KEYS['last_cycle'])
        if last:
            try:
                out['last_cycle'] = json.loads(last)
            except (TypeError, ValueError):
                # Not valid JSON: expose the raw stored value instead.
                out['last_cycle'] = last
        conf = r.get(_REDIS_KEYS['confidence'])
        out['confidence_threshold'] = float(conf) if conf else 0.7
        f24 = r.get(_REDIS_KEYS['fields_24h'])
        out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0
    except Exception as e:
        out['error'] = f'{type(e).__name__}: {e}'
    # Recent enrichment_log rows for live activity
    try:
        with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at
                           FROM pgz_sport.enrichment_log
                           ORDER BY id DESC LIMIT 25""")
            rows = []
            for rr in cur.fetchall():
                rr = dict(rr)
                if rr.get('created_at'):
                    rr['created_at'] = rr['created_at'].isoformat()
                rows.append(rr)
            out['recent'] = rows
    except Exception:
        out['recent'] = []
    return out
@router.post("/enrich/worker/pause")
def enrich_worker_pause(body: dict = Body(default=None)):
    """Set or clear the worker pause flag in Redis.

    The body's ``paused`` value (default ``True``) is interpreted by
    truthiness and stored as '1' or '0'.

    Raises:
        HTTPException 503: Redis is unavailable.
    """
    options = body or {}
    pause = bool(options.get('paused', True))
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    flag = '1' if pause else '0'
    client.set(_REDIS_KEYS['pause'], flag)
    return {'status': 'success', 'paused': pause}
@router.post("/enrich/worker/run-now")
def enrich_worker_run_now():
    """Set the worker's run-now flag in Redis to '1'.

    Raises:
        HTTPException 503: Redis is unavailable.
    """
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    client.set(_REDIS_KEYS['run_now'], '1')
    response = {'status': 'success', 'queued': True}
    return response
@router.post("/enrich/worker/confidence")
def enrich_worker_confidence(body: dict = Body(...)):
    """Store the worker confidence threshold in Redis.

    The body's ``value`` must convert to a float within 0..1.

    Raises:
        HTTPException 400: ``value`` is not a number or lies outside 0..1.
        HTTPException 503: Redis is unavailable.
    """
    try:
        v = float(body.get('value'))
    except Exception:
        raise HTTPException(400, 'value must be number 0..1')
    in_range = 0.0 <= v <= 1.0  # False for NaN as well
    if not in_range:
        raise HTTPException(400, 'value out of range 0..1')
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    client.set(_REDIS_KEYS['confidence'], str(v))
    return {'status': 'success', 'confidence_threshold': v}