+
+
+
+
+
+
+
+
+
+
diff --git a/_backups/sprint_1777940323/enrich_router.py b/_backups/sprint_1777940323/enrich_router.py
new file mode 100644
index 0000000..ccc0e7c
--- /dev/null
+++ b/_backups/sprint_1777940323/enrich_router.py
@@ -0,0 +1,1690 @@
+"""
+enrich_router.py — v3 enrichment + forensic scan
+Author: dradulic@outlook.com / damir@rinet.one
+Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)
+
+POST /v2/enrich/{kind}/{eid}
+ Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
+ primary club URL if any), regex-extract candidate fields (web/email/
+ telefon), optionally synthesise descriptions via DeepSeek, and return
+ a *preview* shape with `proposed` updates the operator can apply.
+
+POST /v2/enrich/{kind}/{eid}/apply
+ Body shapes:
+ None / {} → re-run preview, apply every proposed field
+ {"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced)
+ Performs UPDATE on the matching table, sets metadata.enriched_at and
+ metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
+ returns the after snapshot.
+
+GET /v2/enrich/log?kind=&target_id=&limit=
+ Read recent enrichment-log entries.
+
+POST /v2/forensic/scan
+ Search civic.persons by name, return entity links + findings + risk score.
+
+Kinds: klub | savez | sportas
+"""
+from __future__ import annotations
+import os, re, json, time, html, urllib.parse, urllib.request
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+import psycopg2, psycopg2.extras
+from fastapi import APIRouter, HTTPException, Header, Body
+
+router = APIRouter()
+
+_pgh = os.environ.get('PG_HOST', '10.10.0.2')
+_pgp = int(os.environ.get('PG_PORT', '6432'))
+if _pgh in ('localhost', '127.0.0.1'):
+ _pgh = os.environ.get('DB_HOST', '10.10.0.2')
+ _pgp = int(os.environ.get('DB_PORT', '6432'))
+DB = dict(host=_pgh, port=_pgp,
+ dbname=os.environ.get('PG_DB', 'rinet_v3'),
+ user=os.environ.get('PG_USER', 'rinet'),
+ password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))
+
+UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
+TIMEOUT = 6 # seconds — fail-soft
+
+# Optional JS-aware fallback (Playwright). Lazy-loaded, never required.
+import sys as _sys
+_sys.path.insert(0, '/opt/pgz-sport')
+try:
+ from enrichment import playwright_scraper as _pw_scraper
+ _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
+except Exception:
+ _pw_scraper = None
+ _HAS_PW = False
+
+DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
+DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
+ 'https://api.deepseek.com/v1/chat/completions')
+
+
+# ─── DB helpers ──────────────────────────────────────────────────────────
+def _db():
+ c = psycopg2.connect(**DB); c.autocommit = True; return c
+
+def _fetch_one(sql, p):
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute(sql, p); r = cur.fetchone()
+ return dict(r) if r else None
+
+
+# ─── HTTP helpers ────────────────────────────────────────────────────────
+def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
+ if not url: return None
+ if not url.startswith('http'): return None
+ try:
+ req = urllib.request.Request(url, headers={
+ 'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
+ with urllib.request.urlopen(req, timeout=timeout) as r:
+ data = r.read(150000)
+ try: return data.decode('utf-8')
+ except: return data.decode('latin-1', 'ignore')
+ except Exception:
+ return None
+
+
+def _strip_tags(s: str) -> str:
+ if not s: return ''
+ s = re.sub(r'', ' ', s, flags=re.S | re.I)
+ s = re.sub(r'', ' ', s, flags=re.S | re.I)
+ s = re.sub(r'<[^>]+>', ' ', s)
+ s = html.unescape(s)
+ s = re.sub(r'\s+', ' ', s).strip()
+ return s
+
+
+def _extract_meta(html_doc: str, url: str) -> dict:
+ if not html_doc: return {}
+ out = {'url': url, 'fetched_at': int(time.time())}
+ m = re.search(r']*>([^<]+)', html_doc, re.I)
+ if m: out['title'] = html.unescape(m.group(1).strip())[:300]
+ m = re.search(r')\]]+', re.I)
+
+def _find_email(text: str) -> Optional[str]:
+ if not text: return None
+ bad = ('@example.', '@test.', '@email.', 'wixpress.com',
+ 'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
+ seen = set()
+ for m in RE_EMAIL.finditer(text):
+ e = m.group(0).lower().rstrip('.,;:)')
+ if any(b in e for b in bad): continue
+ if e in seen: continue
+ seen.add(e); return e
+ return None
+
+def _find_phone(text: str) -> Optional[str]:
+ if not text: return None
+ for m in RE_PHONE.finditer(text):
+ raw = m.group(0).strip()
+ digits = re.sub(r'\D', '', raw)
+ if not (8 <= len(digits) <= 13): continue
+ cleaned = re.sub(r'\s+', ' ', raw).strip()
+ if raw.startswith('+385'): return '+385 ' + raw[4:].lstrip().lstrip('-/')
+ if raw.startswith('00385'): return '+385 ' + raw[5:].lstrip().lstrip('-/')
+ return cleaned
+ return None
+
+def _find_official_web(text: str, hint: str = '') -> Optional[str]:
+ if not text: return None
+ blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
+ 'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
+ 'sportilus.com', 'transfermarkt.com', 'wikidata.org',
+ 'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
+ 'rinet.one', 'pgz.hr')
+ candidates: list[str] = []
+ for m in RE_URL.finditer(text):
+ u = m.group(0).rstrip('.,;:)\'"')
+ try:
+ host = urllib.parse.urlparse(u).hostname or ''
+ except Exception:
+ continue
+ if not host or any(b in host for b in blocked): continue
+ candidates.append(u)
+ if not candidates: return None
+ if hint:
+ slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8]
+ for u in candidates:
+ host = urllib.parse.urlparse(u).hostname or ''
+ if slug and slug in host.replace('-', '').replace('.', ''):
+ return u
+ return candidates[0]
+
+
+# ─── External sources ────────────────────────────────────────────────────
+def _wiki_variants(query: str) -> list[str]:
+ """Generate sensible Wikipedia HR title variants for a query.
+
+ The summary REST API is title-exact; clubs are often listed under their
+ abbreviation (KK X, NK X, RK X, OK X), so we try those variants too.
+ """
+ if not query: return []
+ out, seen = [], set()
+ raw = query.strip()
+ def _push(v):
+ if v and v not in seen: seen.add(v); out.append(v)
+ _push(raw)
+ # KK Kvarner 2010 from Košarkaški klub KVARNER 2010
+ parts = raw.split()
+ sport_to_abbr = {
+ 'košarkaški': 'KK', 'kosarkaski': 'KK',
+ 'nogometni': 'NK', 'rukometni': 'RK',
+ 'odbojkaški': 'OK', 'odbojkaski': 'OK',
+ 'vaterpolski':'VK', 'plivacki': 'PK', 'plivački': 'PK',
+ 'boćarski': 'BK', 'bocarski': 'BK',
+ }
+ if len(parts) >= 3 and parts[0].lower() in sport_to_abbr and parts[1].lower() == 'klub':
+ _push(sport_to_abbr[parts[0].lower()] + ' ' + ' '.join(p.capitalize() if p.isupper() else p for p in parts[2:]))
+ return out
+
+def _wiki_summary(query: str) -> Optional[dict]:
+ for variant in _wiki_variants(query):
+ title = urllib.parse.quote(variant.replace(' ', '_'), safe='')
+ body = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{title}', timeout=5)
+ if not body: continue
+ try:
+ d = json.loads(body)
+ except Exception:
+ continue
+ if d.get('type') in ('disambiguation', 'no-extract'): continue
+ if not d.get('extract'): continue
+ return {
+ 'source': 'wikipedia.hr',
+ 'url': d.get('content_urls', {}).get('desktop', {}).get('page'),
+ 'title': d.get('title'),
+ 'extract': d.get('extract'),
+ 'description': d.get('description'),
+ 'matched_variant': variant,
+ }
+ return None
+
+
+def _sport_pgz_search(query: str) -> Optional[dict]:
+ if not query: return None
+ page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
+ if not page:
+ # Plain HTTP failed → try JS-rendered fallback if available.
+ if _HAS_PW and _pw_scraper is not None:
+ return _pw_scraper.scrape_sport_pgz_klub(query)
+ return None
+ m = re.search(r']*>.*?]*rel=["\']bookmark["\'][^>]*>([^<]+)',
+ page, re.S | re.I)
+ if not m:
+ m = re.search(r']*>([^<]{6,180})', page, re.I)
+ if not m:
+ # Search page rendered but yielded nothing parseable — try JS fallback.
+ if _HAS_PW and _pw_scraper is not None:
+ return _pw_scraper.scrape_sport_pgz_klub(query)
+ return None
+ hit = m.group(1)
+ body = _http_get(hit, timeout=6)
+ if not body:
+ return {'source': 'sport-pgz.hr', 'url': hit, 'title': html.unescape(m.group(2).strip())}
+ text = _strip_tags(body)[:4000]
+ meta = _extract_meta(body, hit)
+ return {
+ 'source': 'sport-pgz.hr',
+ 'url': hit,
+ 'title': meta.get('title') or html.unescape(m.group(2).strip()),
+ 'extract': meta.get('description') or text[:500],
+ 'raw_text': text,
+ }
+
+
+def _fetch_primary_site(url: str) -> Optional[dict]:
+ body = _http_get(url, timeout=6)
+ if not body: return None
+ text = _strip_tags(body)
+ meta = _extract_meta(body, url)
+ return {
+ 'source': urllib.parse.urlparse(url).hostname or url,
+ 'url': url,
+ 'title': meta.get('title'),
+ 'extract': meta.get('description') or text[:500],
+ 'raw_text': text[:8000],
+ }
+
+
+# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
+def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
+ if not DEEPSEEK_KEY or not evidence: return None
+ joined = "\n---\n".join(e for e in evidence if e)[:6000]
+ if not joined.strip(): return None
+ prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
+ f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
+ f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")
+ payload = {
+ "model": "deepseek-chat",
+ "messages": [
+ {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."},
+ {"role": "user", "content": prompt},
+ ],
+ "max_tokens": 280, "temperature": 0.3,
+ }
+ req = urllib.request.Request(
+ DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'),
+ headers={'Authorization': 'Bearer ' + DEEPSEEK_KEY,
+ 'Content-Type': 'application/json',
+ 'User-Agent': UA}, method='POST')
+ try:
+ with urllib.request.urlopen(req, timeout=20) as r:
+ d = json.loads(r.read().decode('utf-8'))
+ text = d.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
+ return text or None
+ except Exception:
+ return None
+
+
+# ─── Row loaders & display name ─────────────────────────────────────────
+def _load_row(kind: str, eid: int) -> dict:
+ if kind == 'klub':
+ row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
+ web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
+ sjediste, godina_osnutka, savez_id, scrape_url, source_url,
+ metadata
+ FROM pgz_sport.klubovi WHERE id=%s""", (eid,))
+ elif kind == 'savez':
+ row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
+ adresa, godina_osnutka, source_url, metadata
+ FROM pgz_sport.savezi WHERE id=%s""", (eid,))
+ elif kind == 'sportas':
+ row = _fetch_one("""SELECT c.id, c.ime, c.prezime, c.sport, c.klub_id, c.profile_url,
+ c.slika_url, c.source_url, c.source, c.source_id,
+ c.hns_igrac_id, c.biografija,
+ c.datum_rodenja, c.mjesto_rodenja, c.broj_dresa,
+ c.visina_cm, c.tezina_kg, c.dominantna_noga, c.oib,
+ c.vanjski_id, c.metadata,
+ k.sport AS klub_sport, k.naziv AS klub_naziv
+ FROM pgz_sport.clanovi c
+ LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
+ WHERE c.id=%s""", (eid,))
+ # Fall back to klub.sport when c.sport is empty
+ if row and not row.get('sport') and row.get('klub_sport'):
+ row['sport'] = row['klub_sport']
+ else:
+ raise HTTPException(400, "kind must be klub|savez|sportas")
+ if not row:
+ raise HTTPException(404, kind + " not found")
+ return row
+
+
+def _display_name(kind: str, row: dict) -> str:
+ if kind == 'sportas':
+ return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
+ return row.get('naziv', '') or ''
+
+
+# ─── Sport federations map (loaded once, refresh on file mtime) ─────────
+_SPORT_FED_PATH = '/opt/pgz-sport/data/sport_federations.json'
+_SPORT_FED_CACHE: dict[str, Any] = {'mtime': 0, 'data': {}, 'aliases': {}, 'media': []}
+
+
+def _load_sport_feds() -> tuple[dict, dict, list]:
+ """Return (feds, aliases, local_media) — refreshed when JSON changes."""
+ try:
+ st = os.stat(_SPORT_FED_PATH)
+ except FileNotFoundError:
+ return ({}, {}, [])
+ if st.st_mtime != _SPORT_FED_CACHE['mtime']:
+ try:
+ with open(_SPORT_FED_PATH, 'r', encoding='utf-8') as f:
+ raw = json.load(f)
+ except Exception:
+ return (_SPORT_FED_CACHE['data'],
+ _SPORT_FED_CACHE['aliases'],
+ _SPORT_FED_CACHE['media'])
+ aliases = raw.pop('_aliases', {}) if isinstance(raw, dict) else {}
+ media = raw.pop('_local_media_pgz', []) if isinstance(raw, dict) else []
+ raw.pop('_meta', None)
+ _SPORT_FED_CACHE.update(mtime=st.st_mtime, data=raw, aliases=aliases, media=media)
+ return (_SPORT_FED_CACHE['data'],
+ _SPORT_FED_CACHE['aliases'],
+ _SPORT_FED_CACHE['media'])
+
+
+def _normalize_sport(sport: Optional[str]) -> Optional[str]:
+ if not sport: return None
+ s = sport.strip().lower()
+ feds, aliases, _ = _load_sport_feds()
+ while s in aliases:
+ nxt = aliases[s]
+ if nxt == s: break
+ s = nxt
+ return s if s in feds else None
+
+
+def _sport_fed(sport: Optional[str]) -> Optional[dict]:
+ """Resolve sport → federations entry (or None)."""
+ norm = _normalize_sport(sport)
+ if not norm: return None
+ feds, _, _ = _load_sport_feds()
+ return feds.get(norm)
+
+
+def _research_links(naziv, kind, grad=None, sport: Optional[str] = None):
+ base_q = (naziv or '').strip()
+ q = (base_q + ' ' + grad) if grad else base_q
+ qenc = urllib.parse.quote(q)
+ out = [
+ {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
+ {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
+ {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
+ ]
+ if kind == 'klub':
+ out.append({'label': 'Sportilus', 'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
+ out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
+
+ # Sport-specific federation links (replace static HNS/transfermarkt for sportas)
+ fed = _sport_fed(sport) if sport else None
+ if kind == 'sportas':
+ if fed and isinstance(fed.get('national'), dict):
+ nat = fed['national']
+ search = (nat.get('search_url') or nat.get('url') or '').replace('{q}', qenc)
+ if search:
+ out.append({'label': nat.get('name', 'Nacionalni savez'),
+ 'icon': '🏆', 'url': search})
+ if fed and isinstance(fed.get('pgz'), dict):
+ pgz = fed['pgz']
+ url = pgz.get('search_url') or pgz.get('url') or ''
+ if url:
+ out.append({'label': pgz.get('name', 'PGŽ savez'),
+ 'icon': '🏟', 'url': url.replace('{q}', qenc)})
+ if not fed:
+ # No mapping for this sport → keep transfermarkt as legacy fallback
+ out.append({'label': 'HNS Semafor', 'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc})
+ out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
+ # Local PGŽ media for any sportas
+ _, _, media = _load_sport_feds()
+ for m in media:
+ url = (m.get('search_url') or '').replace('{q}', qenc)
+ if url:
+ out.append({'label': m.get('name', 'Lokalni medij'),
+ 'icon': '📰', 'url': url})
+ if kind == 'savez':
+ out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
+ return out
+
+
+# ─── Proposal pipelines ─────────────────────────────────────────────────
+def _name_tokens(naziv: str) -> list[str]:
+ """Significant tokens from entity name (≥4 chars, deaccented)."""
+ import unicodedata
+ s = unicodedata.normalize('NFKD', naziv or '').encode('ascii', 'ignore').decode('ascii').lower()
+ toks = [t for t in re.split(r'[^a-z0-9]+', s) if len(t) >= 4]
+ stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni',
+ 'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'}
+ return [t for t in toks if t not in stop] or toks
+
+
+def _is_relevant(source: dict, tokens: list[str]) -> bool:
+ """A source is 'relevant' only if the page actually mentions the entity name."""
+ if not tokens: return True
+ import unicodedata
+ blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '')
+ blob = unicodedata.normalize('NFKD', blob.lower()).encode('ascii', 'ignore').decode('ascii')
+ return any(t in blob for t in tokens)
+
+
+def _propose_for_klub(row: dict) -> dict:
+ naziv = row.get('naziv') or ''
+ primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
+ sources, evidence = [], []
+ pdoc = _fetch_primary_site(primary) if primary else None
+ if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '')
+ wiki = _wiki_summary(naziv)
+ if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
+ spz = _sport_pgz_search(naziv)
+ if spz: sources.append(spz); evidence.append(spz.get('raw_text') or spz.get('extract') or '')
+
+ tokens = _name_tokens(naziv)
+ relevant = [s for s in sources if _is_relevant(s, tokens)]
+ relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)
+
+ proposed: dict[str, Any] = {}
+ # web/email/telefon: ONLY from sources actually mentioning the entity
+ if not row.get('web'):
+ u = _find_official_web(relevant_blob, naziv)
+ if u: proposed['web'] = u
+ if not row.get('email'):
+ e = _find_email(relevant_blob)
+ if e: proposed['email'] = e
+ if not row.get('telefon'):
+ t = _find_phone(relevant_blob)
+ if t: proposed['telefon'] = t
+ if not row.get('opis_djelatnosti'):
+ descr_evidence = [(s.get('raw_text') or s.get('extract') or '') for s in relevant] or evidence
+ descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
+ if not descr:
+ for s in (relevant or sources):
+ if s.get('extract') and len(s['extract']) >= 80:
+ descr = s['extract']; break
+ if descr: proposed['opis_djelatnosti'] = descr.strip()[:2000]
+ return {'proposed': proposed, 'sources': sources}
+
+
+def _propose_for_savez(row: dict) -> dict:
+ naziv = row.get('naziv') or ''
+ primary = row.get('web') or row.get('source_url')
+ sources, evidence = [], []
+ pdoc = _fetch_primary_site(primary) if primary else None
+ if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or '')
+ wiki = _wiki_summary(naziv)
+ if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
+ spz = _sport_pgz_search(naziv)
+ if spz: sources.append(spz); evidence.append(spz.get('raw_text') or '')
+
+ tokens = _name_tokens(naziv)
+ relevant = [s for s in sources if _is_relevant(s, tokens)]
+ relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)
+
+ proposed: dict[str, Any] = {}
+ if not row.get('web'):
+ u = _find_official_web(relevant_blob, naziv)
+ if u: proposed['web'] = u
+ if not row.get('email'):
+ e = _find_email(relevant_blob)
+ if e: proposed['email'] = e
+ if not row.get('telefon'):
+ t = _find_phone(relevant_blob)
+ if t: proposed['telefon'] = t
+ return {'proposed': proposed, 'sources': sources}
+
+
+# ─── HNS Semafor parsing ────────────────────────────────────────────────
+_HNS_BASE = 'https://semafor.hns.family'
+
+def _slugify(name: str) -> str:
+ import unicodedata
+ s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower()
+ return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
+
+def _hns_url_from_row(row: dict) -> Optional[str]:
+ """Try to build a semafor.hns.family /igraci/ URL for this row."""
+ # 1) Already-set columns
+ for k in ('profile_url', 'source_url'):
+ u = row.get(k)
+ if u and 'semafor.hns.family/igraci/' in (u or ''):
+ return u
+ # 2) hns_igrac_id column
+ pid = row.get('hns_igrac_id')
+ if pid:
+ slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
+ return f'{_HNS_BASE}/igraci/{int(pid)}/{slug}/'
+ # 3) vanjski_id JSONB → hns_comet
+ vid = row.get('vanjski_id') or {}
+ if isinstance(vid, dict):
+ comet = vid.get('hns_comet') or vid.get('hns_pid')
+ slug = vid.get('hns_slug') or _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
+ if comet:
+ try:
+ return f'{_HNS_BASE}/igraci/{int(comet)}/{slug}/'
+ except Exception:
+ pass
+ # 4) source='hns_semafor' + source_id
+ if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
+ try:
+ slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
+ return f'{_HNS_BASE}/igraci/{int(row["source_id"])}/{slug}/'
+ except Exception:
+ pass
+ return None
+
+
+def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
+ """Extract structured fields from a semafor.hns.family player page."""
+ if not html_doc: return None
+ try:
+ from bs4 import BeautifulSoup
+ except Exception:
+ return _parse_hns_player_regex(html_doc, url)
+ soup = BeautifulSoup(html_doc, 'html.parser')
+ out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
+
+ # hns_igrac_id from URL
+ m = re.search(r'/igraci/(\d+)/', url)
+ if m: out['hns_igrac_id'] = int(m.group(1))
+
+ title = soup.find('title')
+ if title: out['title'] = title.get_text(strip=True)[:300]
+
+ # Photo
+ photo = soup.find('div', class_='photo')
+ if photo:
+ img = photo.find('img')
+ if img and img.get('src'):
+ src = img['src']
+ if not src.startswith('http'):
+ src = urllib.parse.urljoin(url, src)
+ out['slika_url'] = src
+
+ # Player number (jersey)
+ pn = soup.find('div', class_='playerName')
+ if pn:
+ h3 = pn.find('h3')
+ if h3:
+ t = h3.get_text(strip=True)
+ if t.isdigit():
+ out['broj_dresa'] = int(t)
+
+ # Datum rodjenja
+ li = soup.find('li', class_='dob')
+ if li:
+ h4 = li.find('h4')
+ if h4:
+ t = h4.get_text(' ', strip=True)
+ mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
+ if mm:
+ from datetime import date as _date
+ try:
+ out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat()
+ except Exception:
+ pass
+
+ # Mjesto rodjenja
+ li = soup.find('li', class_='pob')
+ if li:
+ h4 = li.find('h4')
+ if h4:
+ out['mjesto_rodenja'] = h4.get_text(strip=True)
+
+ # Trenutni klub (info only — we don't reassign klub_id from here)
+ klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
+ if klub_link:
+ h4 = klub_link.find('h4')
+ if h4:
+ out['trenutni_klub'] = h4.get_text(strip=True)
+ m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '')
+ if m: out['hns_klub_id'] = int(m.group(1))
+
+ # Description (meta)
+ meta_d = soup.find('meta', attrs={'name': 'description'})
+ if meta_d and meta_d.get('content'):
+ out['description'] = meta_d['content'][:600]
+
+ # Make a clean text blob for relevance / DeepSeek
+ text = soup.get_text(' ', strip=True)
+ out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000]
+ out['extract'] = (out.get('description')
+ or (out['raw_text'][:500] if out.get('raw_text') else None))
+ return out
+
+
+def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
+ """BS4-free fallback parser."""
+ out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
+ m = re.search(r'/igraci/(\d+)/', url)
+ if m: out['hns_igrac_id'] = int(m.group(1))
+ m = re.search(r'
.*?
(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S)
+ if m:
+ from datetime import date as _date
+ try:
+ out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
+ except Exception:
+ pass
+ m = re.search(r'
([^<]+)
', html_doc)
+ if m: out['mjesto_rodenja'] = m.group(1).strip()
+ m = re.search(r'
(\d+)
', html_doc)
+ if m: out['broj_dresa'] = int(m.group(1))
+ m = re.search(r' Optional[dict]:
+ body = _http_get(url, timeout=8)
+ if not body:
+ # Try Playwright fallback
+ if _HAS_PW and _pw_scraper is not None:
+ r = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
+ if r and r.get('html_len', 0) > 2000:
+ # We didn't store html in fetch_rendered — re-fetch text only is enough
+ # but we need html for parse. Do a simple HTTP retry with longer timeout.
+ body = _http_get(url, timeout=15)
+ return _parse_hns_player(body, url) if body else None
+
+
+# ─── Generic sport-federation scraper ───────────────────────────────────
+def _fed_url_from_row(row: dict) -> Optional[str]:
+ """If the row already points to a federation profile (source_url /
+ profile_url on a known fed host), return it."""
+ feds, _, _ = _load_sport_feds()
+ fed_hosts = set()
+ for entry in feds.values():
+ if not isinstance(entry, dict): continue
+ for which in ('national', 'pgz'):
+ sub = entry.get(which) or {}
+ for k in ('url', 'search_url', 'profile_url_pattern'):
+ v = sub.get(k)
+ if v:
+ try:
+ h = urllib.parse.urlparse(v.replace('{q}', 'x').replace('{slug}', 'x').replace('{hns_pid}', '1')).hostname
+ if h: fed_hosts.add(h)
+ except Exception:
+ pass
+ for k in ('source_url', 'profile_url'):
+ u = row.get(k)
+ if not u: continue
+ try:
+ h = urllib.parse.urlparse(u).hostname or ''
+ except Exception:
+ continue
+ if h in fed_hosts:
+ return u
+ return None
+
+
+def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]:
+ """Best-effort parser for a generic sport-federation profile page.
+
+ Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub,
+ extract, raw_text}. Tolerant of varied page structures.
+ """
+ if not html_doc: return None
+ host = urllib.parse.urlparse(url).hostname or ''
+ out: dict[str, Any] = {
+ 'source': host,
+ 'url': url,
+ }
+ # Title
+ m = re.search(r']*>([^<]+)', html_doc, re.I)
+ if m: out['title'] = html.unescape(m.group(1).strip())[:300]
+ # Meta description
+ m = re.search(r'= 3:
+ name_tokens.append(re.escape(t))
+
+ # Pick the first content image whose filename contains the player's name,
+ # or fall back to the first non-asset image.
+ img_candidates = re.findall(r']+src=["\']([^"\']+)["\']', html_doc, re.I)
+ chosen_img = None
+ for src in img_candidates:
+ low = src.lower()
+ if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader',
+ 'sprite', '/themes/', '/icons/', 'gdpr', 'banner',
+ 'header', 'footer', 'placeholder', 'avatar-default')):
+ continue
+ if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')):
+ continue
+ # Prefer matches on player name in URL
+ if name_tokens and any(re.search(t, src, re.I) for t in name_tokens):
+ chosen_img = src; break
+ if chosen_img is None:
+ chosen_img = src
+ if chosen_img:
+ if not chosen_img.startswith('http'):
+ chosen_img = urllib.parse.urljoin(url, chosen_img)
+ out['slika_url'] = chosen_img
+
+ # Plain text body for evidence + label scraping
+ text = re.sub(r'', ' ', html_doc, flags=re.S | re.I)
+ text = re.sub(r'', ' ', text, flags=re.S | re.I)
+ text = re.sub(r'<[^>]+>', ' ', text)
+ text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
+ out['raw_text'] = text[:4000]
+ out['extract'] = (out.get('description')
+ or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500]
+ or text[:500])
+
+ # Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …")
+ m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I)
+ if m:
+ try:
+ from datetime import date as _date
+ d = re.split(r'[.\-/]', m.group(1))
+ out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat()
+ except Exception:
+ pass
+ if 'datum_rodenja' not in out:
+ m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I)
+ if m:
+ try:
+ from datetime import date as _date
+ out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat()
+ except Exception:
+ pass
+ m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text)
+ if m: out['mjesto_rodenja'] = m.group(1).strip()
+ m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I)
+ if m: out['klub_naziv'] = m.group(1).strip().rstrip('.')
+
+ return out
+
+
+def _slugify_simple(s: str) -> str:
+ import unicodedata
+ s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower()
+ return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
+
+
+def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]:
+ """Try to find and parse the athlete's federation profile page."""
+ fed = _sport_fed(sport) if sport else None
+ if not fed: return None
+ nat = (fed or {}).get('national') or {}
+ full_name = (ime + ' ' + prezime).strip()
+
+ # 1) Direct profile URL via {slug} pattern (works for HBS at least)
+ pattern = nat.get('profile_url_pattern')
+ if pattern and '{slug}' in pattern:
+ slug = _slugify_simple(full_name)
+ url = pattern.replace('{slug}', slug)
+ body = _http_get(url, timeout=8)
+ if body and prezime.lower() in body.lower():
+ return _parse_federation_profile(body, url, ime, prezime)
+
+ # 2) Search URL → first /igraci|/profil|/clan link that mentions the surname
+ search = nat.get('search_url')
+ if search:
+ body = _http_get(search.replace('{q}', urllib.parse.quote(full_name)), timeout=10)
+ if body:
+ for href_re in (r'href="([^"]*?/igraci/[^"]+)"',
+ r'href="([^"]*?/igrac/[^"]+)"',
+ r'href="([^"]*?/sportasi/[^"]+)"',
+ r'href="([^"]*?/clanovi/[^"]+)"',
+ r'href="([^"]*?/profil/[^"]+)"'):
+ for m in re.finditer(href_re, body, re.I):
+ cand = m.group(1)
+ if not cand.startswith('http'):
+ cand = urllib.parse.urljoin(nat.get('url', search), cand)
+ if _slugify_simple(prezime) in _slugify_simple(cand):
+ b2 = _http_get(cand, timeout=8)
+ if b2:
+ return _parse_federation_profile(b2, cand, ime, prezime)
+ return None
+
+
+def _propose_for_sportas(row: dict) -> dict:
+ naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
+ ime, prezime = (row.get('ime') or ''), (row.get('prezime') or '')
+ sport = row.get('sport')
+ sources, evidence = [], []
+ proposed: dict[str, Any] = {}
+
+ # 1) HNS Semafor — only meaningful when sport is football OR row already
+ # carries an HNS link.
+ hns_doc: Optional[dict] = None
+ if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row):
+ hns_url = _hns_url_from_row(row)
+ if hns_url:
+ hns_doc = _hns_fetch_player(hns_url)
+ if hns_doc:
+ sources.append(hns_doc)
+ evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')
+
+ # 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing
+ # source_url/profile_url if it points at a known federation host.
+ fed_doc: Optional[dict] = None
+ direct_fed_url = _fed_url_from_row(row)
+ if direct_fed_url and (not hns_doc or hns_doc.get('url') != direct_fed_url):
+ body = _http_get(direct_fed_url, timeout=8)
+ if body:
+ fed_doc = _parse_federation_profile(body, direct_fed_url, ime, prezime)
+ if not fed_doc:
+ fed_doc = scrape_sport_federation(sport, ime, prezime)
+ if fed_doc:
+ sources.append(fed_doc)
+ evidence.append(fed_doc.get('raw_text') or fed_doc.get('extract') or '')
+
+ # Helper: pick from hns_doc first then fed_doc
+ def _pick(field):
+ if hns_doc and hns_doc.get(field): return hns_doc[field]
+ if fed_doc and fed_doc.get(field): return fed_doc[field]
+ return None
+
+ if not row.get('profile_url'):
+ v = _pick('url') or (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
+ if v: proposed['profile_url'] = v
+ if not row.get('source_url'):
+ v = (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
+ if v: proposed['source_url'] = v
+ if not row.get('slika_url'):
+ v = _pick('slika_url')
+ if v: proposed['slika_url'] = v
+ if not row.get('hns_igrac_id') and hns_doc and hns_doc.get('hns_igrac_id'):
+ proposed['hns_igrac_id'] = hns_doc['hns_igrac_id']
+ if not row.get('datum_rodenja'):
+ v = _pick('datum_rodenja')
+ if v: proposed['datum_rodenja'] = v
+ if not row.get('mjesto_rodenja'):
+ v = _pick('mjesto_rodenja')
+ if v: proposed['mjesto_rodenja'] = v
+ if not row.get('broj_dresa') and hns_doc and hns_doc.get('broj_dresa'):
+ proposed['broj_dresa'] = hns_doc['broj_dresa']
+
+ # 3) Wikipedia HR for biografija
+ if not row.get('biografija'):
+ wiki = _wiki_summary(naziv)
+ if wiki:
+ sources.append(wiki)
+ evidence.append(wiki.get('extract') or '')
+
+ # Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet
+ if not row.get('biografija'):
+ descr = _deepseek_describe(naziv, f'sportaš ({sport})' if sport else 'sportaš', evidence) if evidence else None
+ if not descr:
+ for s in sources:
+ ext = s.get('extract')
+ if ext and len(ext) >= 80:
+ descr = ext; break
+ if descr:
+ proposed['biografija'] = descr.strip()[:2000]
+
+ return {'proposed': proposed, 'sources': sources}
+
+
+# ─── Endpoints ──────────────────────────────────────────────────────────
+# ─── R4 — POST /v2/enrich/forensic/{finding_id} ─────────────────────────
+def _extract_pep_name(finding: dict) -> Optional[str]:
+ """Pull the primary person name from a forensic_findings row."""
+ title = (finding.get('title') or '').strip()
+ desc = (finding.get('description') or '').strip()
+ payload = finding.get('raw_data') or {}
+ if isinstance(payload, str):
+ try: payload = json.loads(payload)
+ except Exception: payload = {}
+ if isinstance(payload, dict):
+ for k in ('person_name', 'name', 'osoba'):
+ v = payload.get(k)
+ if v: return str(v).strip()
+ # Try entities_involved.entity_name
+ ents = finding.get('entities_involved') or []
+ if isinstance(ents, str):
+ try: ents = json.loads(ents)
+ except Exception: ents = []
+ if isinstance(ents, list):
+ for e in ents:
+ if isinstance(e, dict) and e.get('person_name'):
+ return str(e['person_name']).strip()
+ if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''):
+ # Some entries store person names as entity_name when entity_type='person'
+ if (e.get('entity_type') or '').lower() in ('person','osoba'):
+ return str(e['entity_name']).strip()
+ # Fallback: extract a "Ime Prezime" from the title
+ m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc)
+ if m: return f"{m.group(1)} {m.group(2)}"
+ return None
+
+
+def _gather_pep_evidence(name: str) -> list[dict]:
+ sources: list[dict] = []
+ wiki = _wiki_summary(name)
+ if wiki: sources.append(wiki)
+ # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs)
+ ddg = 'https://html.duckduckgo.com/html/?q=' + urllib.parse.quote(f'"{name}" PGŽ Hrvatska')
+ page = _http_get(ddg, timeout=8)
+ if page:
+ # First result block
+ m = re.search(r']+class="result__a"[^>]+href="([^"]+)"[^>]*>([^<]{6,200})', page)
+ snippet_m = re.search(r']+class="result__snippet"[^>]*>(.*?)', page, re.S)
+ if m:
+ sources.append({
+ 'source': 'duckduckgo',
+ 'url': html.unescape(m.group(1))[:500],
+ 'title': html.unescape(m.group(2)).strip()[:300],
+ 'extract': re.sub(r'<[^>]+>', ' ', snippet_m.group(1)).strip()[:600] if snippet_m else None,
+ })
+ return sources
+
+
+def _related_entities_for_pep(name: str) -> list[dict]:
+ """Pull civic.persons + their entity links so we have the structured graph."""
+ out: list[dict] = []
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute("""SELECT id, name, function, party, county, city, oib, trust_tier
+ FROM civic.persons
+ WHERE upper(name) ILIKE upper(%s)
+ ORDER BY oib NULLS LAST, id LIMIT 10""", ('%'+name+'%',))
+ for p in cur.fetchall():
+ p = dict(p)
+ entry = {
+ 'kind': 'person',
+ 'person_id': p['id'], 'person_name': p['name'],
+ 'function': p.get('function'), 'party': p.get('party'),
+ 'county': p.get('county'), 'city': p.get('city'),
+ 'oib': p.get('oib'), 'trust_tier': p.get('trust_tier'),
+ 'entities': [],
+ }
+ if p.get('oib'):
+ cur.execute("""SELECT pel.entity_id, pel.roles, e.name AS entity_name,
+ e.oib AS entity_oib, e.entity_type, e.city, e.risk_score
+ FROM civic.person_entity_links pel
+ LEFT JOIN civic.entities e ON e.id = pel.entity_id
+ WHERE pel.person_oib=%s LIMIT 30""", (p['oib'],))
+ for r in cur.fetchall():
+ entry['entities'].append(dict(r))
+ out.append(entry)
+ return out
+
+
+@router.post("/enrich/forensic/{finding_id}")
+def enrich_forensic_v2(finding_id: int,
+ body: dict = Body(default=None),
+ x_user_email: Optional[str] = Header(default=None),
+ x_user_id: Optional[int] = Header(default=None)):
+ """Enrich a forensic finding: gather Wiki + DDG snippets + civic graph,
+ write back to civic.forensic_findings.related_entities, and seal the
+ payload hash on Polygon (or queue for sealing).
+ """
+ body = body or {}
+ explicit_name = (body.get('name') or '').strip() or None
+
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute("""SELECT id, finding_type, severity, title, description,
+ entities_involved, raw_data, related_entities, enrichment_metadata
+ FROM civic.forensic_findings WHERE id=%s""", (finding_id,))
+ finding = cur.fetchone()
+ if not finding:
+ raise HTTPException(404, "finding not found")
+ finding = dict(finding)
+
+ name = explicit_name or _extract_pep_name(finding)
+ if not name:
+ raise HTTPException(400, "could not derive a person/entity name; pass {name: \"…\"}")
+
+ sources = _gather_pep_evidence(name)
+ related = _related_entities_for_pep(name)
+
+ payload = {
+ 'finding_id': finding_id,
+ 'name': name,
+ 'sources': [{'source': s.get('source'), 'url': s.get('url'),
+ 'title': s.get('title')} for s in sources],
+ 'related_entities': related,
+ 'enriched_at': datetime.now(timezone.utc).isoformat(),
+ }
+
+ # Persist back to the finding
+ enrichment_meta = finding.get('enrichment_metadata') or {}
+ if not isinstance(enrichment_meta, dict): enrichment_meta = {}
+ history = enrichment_meta.get('history') or []
+ history.append({
+ 'at': payload['enriched_at'],
+ 'sources': payload['sources'],
+ 'related_count': len(related),
+ 'user': x_user_email,
+ })
+ enrichment_meta['history'] = history[-10:]
+ enrichment_meta['enriched_at'] = payload['enriched_at']
+ enrichment_meta['enriched_by'] = x_user_email or 'system'
+ enrichment_meta['source_count'] = len(sources)
+
+ with _db() as c, c.cursor() as cur:
+ cur.execute("""UPDATE civic.forensic_findings
+ SET related_entities = %s::jsonb,
+ enrichment_metadata = %s::jsonb
+ WHERE id=%s
+ RETURNING id""",
+ (json.dumps(related, default=str, ensure_ascii=False),
+ json.dumps(enrichment_meta, default=str, ensure_ascii=False),
+ finding_id))
+ cur.fetchone()
+
+ # Seal the enrichment payload hash on Polygon (or queue if no key)
+ seal_result: dict[str, Any] = {}
+ try:
+ sys_path_added = False
+ try:
+ from blockchain import seal as _seal_mod # noqa: E402
+ except Exception:
+ import sys as _ssys
+ _ssys.path.insert(0, '/opt/pgz-sport')
+ from blockchain import seal as _seal_mod # noqa: E402
+ sys_path_added = True
+ del sys_path_added # silence linters
+ h = _seal_mod.hash_payload(payload)
+ seal_result = _seal_mod.seal_to_polygon(
+ data_hash=h,
+ ref_id=str(finding_id),
+ action='forensic.enriched',
+ ref_type='forensic_finding',
+ payload=payload,
+ user_id=x_user_id,
+ user_email=x_user_email,
+ )
+ except Exception as e:
+ seal_result = {'error': f'{type(e).__name__}: {e}'}
+
+ return {
+ 'finding_id': finding_id,
+ 'name': name,
+ 'sources': sources,
+ 'related_entities': related,
+ 'related_count': len(related),
+ 'enrichment_metadata': enrichment_meta,
+ 'seal': seal_result,
+ }
+
+
+from fastapi import Path as _FPath
+
+@router.post("/enrich/{kind:str}/{eid:int}")
+def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid: int = 0):
+ row = _load_row(kind, eid)
+ if kind == 'klub': res = _propose_for_klub(row)
+ elif kind == 'savez': res = _propose_for_savez(row)
+ else: res = _propose_for_sportas(row)
+
+ if kind == 'klub':
+ keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon',
+ 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti']
+ elif kind == 'savez':
+ keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka']
+ else:
+ keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija',
+ 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg',
+ 'dominantna_noga','oib']
+
+ naziv = _display_name(kind, row)
+ grad = row.get('grad') if kind == 'klub' else None
+ primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url')
+
+ filled = sum(1 for k in keys if row.get(k))
+ coverage = round(filled / len(keys) * 100)
+ missing = [k for k in keys if not row.get(k)]
+
+ proposed = res['proposed']
+ current = {k: row.get(k) for k in proposed.keys()}
+ meta = row.get('metadata') or {}
+ if not isinstance(meta, dict): meta = {}
+
+ return {
+ 'kind': kind, 'id': eid, 'naziv': naziv,
+ 'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
+ 'missing_fields': missing,
+ 'live_snippet': _fetch_title(primary) if primary else None,
+ 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')),
+ 'sport': row.get('sport'),
+ 'sport_federation': (lambda f: {
+ 'national': (f.get('national') or {}).get('name') if f else None,
+ 'national_url': (f.get('national') or {}).get('url') if f else None,
+ 'pgz': (f.get('pgz') or {}).get('name') if f else None,
+ })(_sport_fed(row.get('sport'))),
+ 'sources': res['sources'],
+ 'current': current,
+ 'proposed': proposed,
+ 'last_enriched_at': meta.get('enriched_at'),
+ 'last_enrichment_source': meta.get('enrichment_source'),
+ 'enriched_at': int(time.time()),
+ 'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
+ }
+
+
+_TABLE_MAP = {
+ 'klub': ('pgz_sport.klubovi',
+ {'web','email','telefon','predsjednik','tajnik',
+ 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
+ 'savez': ('pgz_sport.savezi',
+ {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
+ 'sportas': ('pgz_sport.clanovi',
+ {'biografija','profile_url','source_url','slika_url','hns_igrac_id',
+ 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
+ 'tezina_kg','dominantna_noga','oib'}),
+}
+
+
+def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
+ if kind not in _TABLE_MAP:
+ raise HTTPException(400, "kind must be klub|savez|sportas")
+ table, allowed = _TABLE_MAP[kind]
+
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
+ before = cur.fetchone()
+ if not before: raise HTTPException(404, kind + " not found")
+ before = dict(before)
+
+ sets, params, applied = [], [], {}
+ for k, v in (fields or {}).items():
+ if k not in allowed: continue
+ if v is None or str(v).strip() == '': continue
+ if before.get(k):
+ continue # never overwrite existing
+ sets.append(f"{k} = %s")
+ params.append(v); applied[k] = v
+
+ meta_in = before.get('metadata') or {}
+ if not isinstance(meta_in, dict): meta_in = {}
+ now_iso = datetime.now(timezone.utc).isoformat()
+ meta_in['enriched_at'] = now_iso
+ meta_in['enrichment_source'] = [s.get('source') for s in (sources or []) if s.get('source')]
+ history = meta_in.get('enrichment_history') or []
+ history.append({
+ 'at': now_iso,
+ 'fields': list(applied.keys()),
+ 'sources': meta_in['enrichment_source'],
+ 'urls': [s.get('url') for s in (sources or []) if s.get('url')],
+ 'user': user_email,
+ })
+ meta_in['enrichment_history'] = history[-10:]
+ sets.append("metadata = %s::jsonb")
+ params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
+
+ params.append(eid)
+ cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
+ after = dict(cur.fetchone())
+
+ cur.execute(
+ """INSERT INTO pgz_sport.enrichment_log
+ (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
+ VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
+ (kind, eid,
+ ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None,
+ (sources[0].get('url') if sources else None),
+ list(applied.keys()) or None,
+ json.dumps({k: before.get(k) for k in (list(applied.keys()) + ['metadata'])},
+ ensure_ascii=False, default=str),
+ json.dumps({k: after.get(k) for k in (list(applied.keys()) + ['metadata'])},
+ ensure_ascii=False, default=str),
+ user_email))
+
+ snap_keys = ('id','naziv','ime','prezime','web','email','telefon',
+ 'opis_djelatnosti','biografija','metadata')
+ return {'applied': applied,
+ 'after': {k: after.get(k) for k in snap_keys if k in after}}
+
+
+@router.post("/enrich/{kind:str}/{eid:int}/apply")
+def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
+ eid: int = 0,
+ body: dict = Body(default=None),
+ x_user_email: Optional[str] = Header(default=None),
+ x_user_id: Optional[int] = Header(default=None)):
+ body = body or {}
+ fields = body.get('fields')
+ sources = body.get('sources')
+ if not fields:
+ row = _load_row(kind, eid)
+ if kind == 'klub': res = _propose_for_klub(row)
+ elif kind == 'savez': res = _propose_for_savez(row)
+ else: res = _propose_for_sportas(row)
+ fields = res['proposed']
+ sources = res['sources']
+ out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
+ applied = out.get('applied') or {}
+ # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
+ try:
+ from audit_seal_router import audit_log as _audit_log
+ if applied:
+ _audit_log(
+ action='enrich.apply',
+ target_type=kind,
+ target_id=eid,
+ payload={'applied': applied,
+ 'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]},
+ user_id=x_user_id,
+ user_email=x_user_email,
+ )
+ except Exception:
+ pass
+ return {
+ 'status': 'success' if applied else 'no_changes',
+ 'kind': kind,
+ 'id': eid,
+ 'applied_count': len(applied),
+ 'applied_fields': list(applied.keys()),
+ **out,
+ }
+
+
+@router.get("/enrich/log")
+def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
+ where, params = [], []
+ if kind: where.append("kind=%s"); params.append(kind)
+ if target_id: where.append("target_id=%s"); params.append(target_id)
+ sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
+ "FROM pgz_sport.enrichment_log "
+ + ("WHERE " + " AND ".join(where) + " " if where else "")
+ + "ORDER BY id DESC LIMIT %s")
+ params.append(min(int(limit or 50), 200))
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute(sql, params)
+ rows = [dict(r) for r in cur.fetchall()]
+ for r in rows:
+ if r.get('created_at'): r['created_at'] = r['created_at'].isoformat()
+ return {'count': len(rows), 'rows': rows}
+
+
+# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
+@router.get("/search/suggest")
+def search_suggest(q: str = '', type: str = '', limit: int = 10):
+ """
+ Autocomplete suggestions for the Mreža search inputs.
+ type ∈ {person, club, company, ''} — empty means all.
+ Returns: {query, results: [{id, label, type, sub}]}
+ """
+ q = (q or '').strip()
+ if len(q) < 2:
+ return {'query': q, 'results': []}
+ limit = max(1, min(50, int(limit)))
+ out = []
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ if type in ('', 'club'):
+ cur.execute("""
+ SELECT id, naziv AS label, sport, grad
+ FROM pgz_sport.klubovi
+ WHERE naziv ILIKE %s AND aktivan=TRUE
+ ORDER BY length(naziv), naziv LIMIT %s
+ """, ('%'+q+'%', limit))
+ for r in cur.fetchall():
+ out.append({'id':'klub:'+str(r['id']), 'label': r['label'], 'type':'club',
+ 'sub': (r.get('sport') or '')+' · '+(r.get('grad') or '')})
+ cur.execute("""
+ SELECT id, naziv AS label, sport
+ FROM pgz_sport.savezi
+ WHERE naziv ILIKE %s AND aktivan=TRUE
+ ORDER BY length(naziv), naziv LIMIT %s
+ """, ('%'+q+'%', limit))
+ for r in cur.fetchall():
+ out.append({'id':'savez:'+str(r['id']), 'label': r['label'], 'type':'savez',
+ 'sub': r.get('sport') or 'savez'})
+ if type in ('', 'person'):
+ cur.execute("""
+ SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
+ FROM pgz_sport.clanovi c
+ LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
+ WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
+ ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
+ LIMIT %s
+ """, ('%'+q+'%', limit))
+ for r in cur.fetchall():
+ out.append({'id':'sportas:'+str(r['id']),
+ 'label': (r.get('ime') or '')+' '+(r.get('prezime') or ''),
+ 'type':'person',
+ 'sub': (r.get('sport') or 'sportaš')+(r.get('klub_naziv') and ' · '+r['klub_naziv'] or '')})
+ cur.execute("""
+ SELECT id, name AS label, function, oib, county
+ FROM civic.persons
+ WHERE name ILIKE %s
+ ORDER BY oib NULLS LAST, length(name) LIMIT %s
+ """, ('%'+q+'%', limit))
+ for r in cur.fetchall():
+ out.append({'id':'civic_person:'+str(r['id']),
+ 'label': r['label'], 'type':'person',
+ 'sub': (r.get('function') or 'civic')+' · '+(r.get('county') or '')})
+ if type in ('', 'company'):
+ cur.execute("""
+ SELECT id, name AS label, oib, city, entity_type
+ FROM civic.entities
+ WHERE name ILIKE %s
+ ORDER BY length(name) LIMIT %s
+ """, ('%'+q+'%', limit))
+ for r in cur.fetchall():
+ out.append({'id':'civic_entity:'+str(r['id']),
+ 'label': r['label'], 'type':'company',
+ 'sub': (r.get('entity_type') or 'tvrtka')+' · '+(r.get('city') or '')})
+ return {'query': q, 'results': out[:limit*2]}
+
+
+# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
+@router.post("/forensic/findings/{finding_id}/enrich")
+def enrich_forensic(finding_id: int):
+ """
+ Look up the forensic finding, derive the PEP person name from
+ entities_involved or title, hit Wikipedia HR for a summary, and persist
+ the enriched payload into civic.forensic_findings.ai_analysis (or back into
+ raw_data.enrichment).
+ """
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute("""
+ SELECT id, finding_type, severity, title, description, entities_involved,
+ raw_data, ai_analysis
+ FROM civic.forensic_findings WHERE id=%s
+ """, (finding_id,))
+ f = cur.fetchone()
+ if not f: raise HTTPException(404, "finding not found")
+ f = dict(f)
+
+ # Derive person name candidates
+ candidates = []
+ if isinstance(f.get('entities_involved'), (list, dict)):
+ ei = f['entities_involved']
+ if isinstance(ei, dict):
+ for k in ('person','name','osoba','PEP','pep'):
+ if ei.get(k): candidates.append(str(ei[k]))
+ # Also try persons: [...] list
+ for p in (ei.get('persons') or ei.get('osobe') or []):
+ if isinstance(p, dict) and p.get('name'): candidates.append(p['name'])
+ elif isinstance(p, str): candidates.append(p)
+ elif isinstance(ei, list):
+ for it in ei:
+ if isinstance(it, dict):
+ for k in ('name','person','label'):
+ if it.get(k): candidates.append(str(it[k])); break
+ elif isinstance(it, str):
+ candidates.append(it)
+ if not candidates and f.get('title'):
+ # Heuristic: extract first capitalised "Ime Prezime" pair
+ m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', f['title'])
+ if m: candidates.append(m.group(0))
+
+ wiki = None
+ used_query = None
+ for q in candidates[:3]:
+ wiki = _wiki_summary(q)
+ if wiki:
+ used_query = q
+ break
+
+ # Build enrichment payload
+ enrichment = {
+ 'queried': candidates[:5],
+ 'used_query': used_query,
+ 'wiki': wiki,
+ 'enriched_at': datetime.now(timezone.utc).isoformat(),
+ }
+
+ # Persist into raw_data.enrichment
+ raw = f.get('raw_data')
+ if raw is None: raw = {}
+ if not isinstance(raw, dict): raw = {'_legacy': raw}
+ raw['enrichment'] = enrichment
+
+ cur.execute("""
+ UPDATE civic.forensic_findings
+ SET raw_data = %s::jsonb,
+ ai_analysis = COALESCE(ai_analysis, %s)
+ WHERE id = %s
+ """, (json.dumps(raw, default=str, ensure_ascii=False),
+ (wiki or {}).get('extract'),
+ finding_id))
+ c.commit()
+
+ return {
+ 'finding_id': finding_id,
+ 'queried': candidates[:5],
+ 'used_query': used_query,
+ 'wiki': wiki,
+ 'persisted': True,
+ }
+
+
+# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
+@router.post("/forensic/scan")
+def forensic_scan(req: dict = Body(...)):
+ name = (req.get('name') or '').strip()
+ if len(name) < 3:
+ raise HTTPException(400, "name must be at least 3 chars")
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute("""
+ SELECT id, name, function, party, county, city, oib, trust_tier
+ FROM civic.persons
+ WHERE upper(name) ILIKE upper(%s)
+ ORDER BY oib NULLS LAST, id LIMIT 25
+ """, ('%' + name + '%',))
+ persons = [dict(r) for r in cur.fetchall()]
+ for p in persons:
+ p['links'] = []; p['findings'] = []
+ if p.get('oib'):
+ cur.execute("""
+ SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
+ e.entity_type, e.city, e.risk_score
+ FROM civic.person_entity_links pel
+ LEFT JOIN civic.entities e ON e.id = pel.entity_id
+ WHERE pel.person_oib = %s LIMIT 50
+ """, (p['oib'],))
+ p['links'] = [dict(r) for r in cur.fetchall()]
+ cur.execute("""
+ SELECT id, finding_type, severity, title, severity_score, created_at
+ FROM civic.forensic_findings
+ WHERE entities_involved::text ILIKE %s
+ ORDER BY severity_score DESC, created_at DESC LIMIT 30
+ """, ('%' + p['oib'] + '%',))
+ p['findings'] = [dict(r) for r in cur.fetchall()]
+ if not p['findings']:
+ cur.execute("""
+ SELECT id, finding_type, severity, title, severity_score, created_at
+ FROM civic.forensic_findings
+ WHERE title ILIKE %s OR description ILIKE %s
+ ORDER BY severity_score DESC, created_at DESC LIMIT 30
+ """, ('%' + p['name'] + '%', '%' + p['name'] + '%'))
+ p['findings'] = [dict(r) for r in cur.fetchall()]
+ total_links = total_findings = crit_findings = 0
+ for p in persons:
+ total_links += len(p.get('links') or [])
+ for f in p.get('findings') or []:
+ total_findings += 1
+ if f.get('severity') in ('CRITICAL', 'HIGH'): crit_findings += 1
+ score = 0
+ if (p.get('function') or '').strip(): score += 30
+ if (p.get('party') or '').strip(): score += 15
+ score += min(40, len(p.get('links') or []) * 5)
+ score += min(40, len(p.get('findings') or []) * 10)
+ score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL', 'HIGH'))
+ p['risk_score'] = min(100, score)
+ overall = max((p.get('risk_score', 0) for p in persons), default=0)
+ return {'query': name, 'matched_persons': len(persons),
+ 'overall_risk_score': overall, 'total_links': total_links,
+ 'total_findings': total_findings, 'critical_findings': crit_findings,
+ 'persons': persons, 'scanned_at': int(time.time())}
+
+
+
+# ─── SB-3 — Bulk enrichment ─────────────────────────────────────────────
+_BULK_KEY_MAP = {
+ 'klub': ('pgz_sport.klubovi',
+ ('oib','sport','grad','predsjednik','tajnik','web','email','telefon',
+ 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')),
+ 'savez': ('pgz_sport.savezi',
+ ('oib','sport','predsjednik','tajnik','email','telefon','web',
+ 'adresa','godina_osnutka')),
+ 'sportas': ('pgz_sport.clanovi',
+ ('sport','profile_url','slika_url','hns_igrac_id','biografija',
+ 'datum_rodenja','mjesto_rodenja','broj_dresa')),
+}
+
+
+def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str:
+ parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)"
+ for k in keys]
+ return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})"
+
+
+def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]:
+ if kind not in _BULK_KEY_MAP:
+ raise HTTPException(400, "kind must be klub|savez|sportas")
+ table, keys = _BULK_KEY_MAP[kind]
+ cov = _coverage_sql('', keys)
+ extra_where = ''
+ if kind == 'klub':
+ extra_where = "AND aktivan = TRUE"
+ elif kind == 'sportas':
+ extra_where = "AND aktivan = TRUE"
+ sql = (f"SELECT id FROM {table} "
+ f"WHERE 1=1 {extra_where} "
+ f"AND {cov} < %s "
+ f"ORDER BY random() LIMIT %s")
+ with _db() as c, c.cursor() as cur:
+ cur.execute(sql, (coverage_max, limit))
+ return [r[0] for r in cur.fetchall()]
+
+
+@router.post("/enrich/bulk")
+def enrich_bulk(body: dict = Body(default=None),
+ x_user_email: Optional[str] = Header(default=None),
+ x_user_id: Optional[int] = Header(default=None)):
+ """Run preview+apply over N random under-enriched rows of one kind.
+
+ Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70}
+ Returns aggregate stats. Synchronous (use polling, not SSE).
+ """
+ body = body or {}
+ kind = (body.get('kind') or '').strip().lower()
+ if kind not in _BULK_KEY_MAP:
+ raise HTTPException(400, "kind must be klub|savez|sportas")
+ limit = max(1, min(int(body.get('limit') or 50), 200))
+ coverage_max = max(0, min(int(body.get('coverage_max') or 70), 100))
+
+ ids = _bulk_pick(kind, limit, coverage_max)
+ items: list[dict] = []
+ fields_total = 0
+ started = time.time()
+
+ for eid in ids:
+ try:
+ row = _load_row(kind, eid)
+ if kind == 'klub': res = _propose_for_klub(row)
+ elif kind == 'savez': res = _propose_for_savez(row)
+ else: res = _propose_for_sportas(row)
+ proposed = res.get('proposed') or {}
+ srcs = res.get('sources') or []
+ if not proposed:
+ items.append({'id': eid, 'applied': 0, 'fields': []})
+ continue
+ out = _apply_to_db(kind, eid, proposed, srcs, x_user_email)
+ applied = out.get('applied') or {}
+ fields_total += len(applied)
+ items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())})
+ try:
+ from audit_seal_router import audit_log as _audit_log
+ if applied:
+ _audit_log(action='enrich.bulk.apply',
+ target_type=kind, target_id=eid,
+ payload={'applied': applied},
+ user_id=x_user_id, user_email=x_user_email)
+ except Exception:
+ pass
+ except HTTPException as e:
+ items.append({'id': eid, 'error': e.detail})
+ except Exception as e:
+ items.append({'id': eid, 'error': f'{type(e).__name__}: {e}'})
+
+ return {
+ 'status': 'success',
+ 'kind': kind,
+ 'requested': limit,
+ 'processed': len(items),
+ 'fields_total': fields_total,
+ 'elapsed_s': round(time.time() - started, 1),
+ 'items': items,
+ }
+
+
+# ─── SB-4 — Worker status / control ─────────────────────────────────────
+_REDIS_KEYS = {
+ 'heartbeat': 'cc:pgz-enricher:heartbeat',
+ 'pause': 'cc:pgz-enricher:pause',
+ 'run_now': 'cc:pgz-enricher:run_now',
+ 'last_cycle': 'cc:pgz-enricher:last_cycle',
+ 'confidence': 'cc:pgz-enricher:confidence',
+ 'fields_24h': 'cc:pgz-enricher:fields_24h',
+}
+
+
+def _redis_client():
+ try:
+ import redis
+ except Exception:
+ return None
+ host = os.environ.get('REDIS_HOST', 'localhost')
+ port = int(os.environ.get('REDIS_PORT', '6379'))
+ pwd = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None
+ # Try with password first (prod); fall back to anonymous (dev box) on AUTH failure.
+ for p in (pwd, None):
+ try:
+ r = redis.Redis(host=host, port=port, password=p,
+ decode_responses=True, socket_connect_timeout=2)
+ r.ping()
+ return r
+ except Exception:
+ continue
+ return None
+
+
+@router.get("/enrich/worker/status")
+def enrich_worker_status():
+ r = _redis_client()
+ out = {'available': bool(r)}
+ if not r:
+ return out
+ try:
+ hb = r.get(_REDIS_KEYS['heartbeat'])
+ out['heartbeat'] = int(hb) if hb else None
+ out['heartbeat_age_s'] = (int(time.time()) - int(hb)) if hb else None
+ out['paused'] = (r.get(_REDIS_KEYS['pause']) or '0') == '1'
+ out['run_now_pending'] = (r.get(_REDIS_KEYS['run_now']) or '0') == '1'
+ last = r.get(_REDIS_KEYS['last_cycle'])
+ if last:
+ try: out['last_cycle'] = json.loads(last)
+ except: out['last_cycle'] = last
+ conf = r.get(_REDIS_KEYS['confidence'])
+ out['confidence_threshold'] = float(conf) if conf else 0.7
+ f24 = r.get(_REDIS_KEYS['fields_24h'])
+ out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0
+ except Exception as e:
+ out['error'] = f'{type(e).__name__}: {e}'
+ # Recent enrichment_log rows for live activity
+ try:
+ with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+ cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at
+ FROM pgz_sport.enrichment_log
+ ORDER BY id DESC LIMIT 25""")
+ rows = []
+ for rr in cur.fetchall():
+ rr = dict(rr)
+ if rr.get('created_at'): rr['created_at'] = rr['created_at'].isoformat()
+ rows.append(rr)
+ out['recent'] = rows
+ except Exception:
+ out['recent'] = []
+ return out
+
+
+@router.post("/enrich/worker/pause")
+def enrich_worker_pause(body: dict = Body(default=None)):
+ body = body or {}
+ pause = bool(body.get('paused', True))
+ r = _redis_client()
+ if not r: raise HTTPException(503, 'redis unavailable')
+ r.set(_REDIS_KEYS['pause'], '1' if pause else '0')
+ return {'status': 'success', 'paused': pause}
+
+
+@router.post("/enrich/worker/run-now")
+def enrich_worker_run_now():
+ r = _redis_client()
+ if not r: raise HTTPException(503, 'redis unavailable')
+ r.set(_REDIS_KEYS['run_now'], '1')
+ return {'status': 'success', 'queued': True}
+
+
+@router.post("/enrich/worker/confidence")
+def enrich_worker_confidence(body: dict = Body(...)):
+ try:
+ v = float(body.get('value'))
+ except Exception:
+ raise HTTPException(400, 'value must be number 0..1')
+ if not (0.0 <= v <= 1.0):
+ raise HTTPException(400, 'value out of range 0..1')
+ r = _redis_client()
+ if not r: raise HTTPException(503, 'redis unavailable')
+ r.set(_REDIS_KEYS['confidence'], str(v))
+ return {'status': 'success', 'confidence_threshold': v}
diff --git a/_backups/sprint_1777940323/sport2.html b/_backups/sprint_1777940323/sport2.html
new file mode 100644
index 0000000..abcceb7
--- /dev/null
+++ b/_backups/sprint_1777940323/sport2.html
@@ -0,0 +1,2950 @@
+
+
+
+
+
+PGŽ SPORT — Platforma
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Dashboard
+
Pregled stanja
+
+
+ ● API live · sport.rinet.one
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Detalji
+
×
+
+
+
+
+
+
+
diff --git a/routers/enrich_router.py b/routers/enrich_router.py
index ccc0e7c..64152dc 100644
--- a/routers/enrich_router.py
+++ b/routers/enrich_router.py
@@ -381,11 +381,27 @@ def _sport_fed(sport: Optional[str]) -> Optional[dict]:
return feds.get(norm)
-def _research_links(naziv, kind, grad=None, sport: Optional[str] = None):
+def _research_links(naziv, kind, grad=None, sport: Optional[str] = None, row: Optional[dict] = None):
base_q = (naziv or '').strip()
q = (base_q + ' ' + grad) if grad else base_q
qenc = urllib.parse.quote(q)
- out = [
+ out = []
+ # Prefer DIRECT profile/source link if entity already has one (e.g. HNS Semafor)
+ if row:
+ direct = row.get('profile_url') or row.get('source_url') or row.get('scrape_url') or row.get('web') or row.get('web_stranica')
+ if direct and isinstance(direct, str) and direct.startswith(('http://','https://')):
+ try:
+ host = urllib.parse.urlparse(direct).hostname or ''
+ except Exception:
+ host = ''
+ label = 'Vanjski profil'
+ icon = '🔗'
+ if 'hns' in host: label, icon = 'HNS profil', '⚽'
+ elif 'transfermarkt' in host: label, icon = 'Transfermarkt', '⚽'
+ elif 'wikipedia' in host: label, icon = 'Wikipedia', '📚'
+ elif host.endswith('.hr') or host.endswith('.com'): label, icon = 'Službena stranica', '🌐'
+ out.append({'label': label, 'icon': icon, 'url': direct, 'is_direct': True})
+ out += [
{'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
{'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
{'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
@@ -445,11 +461,128 @@ def _is_relevant(source: dict, tokens: list[str]) -> bool:
return any(t in blob for t in tokens)
+
+# ─── Klub domain guesser (HR slug → candidate URLs → HEAD probe) ────────
+import re as _re_klg
+
+def _slugify_klub(naziv: str) -> str:
+ if not naziv: return ""
+ s = naziv.lower()
+ repl = (("č","c"),("ć","c"),("ž","z"),("š","s"),("đ","d"),
+ ('"',''),("'",""),("(",""),(")",""),(",",""),(".",""),
+ ("/",""),("\\",""))
+ for a,b in repl: s = s.replace(a,b)
+ s = _re_klg.sub(r"[^a-z0-9]+", "-", s).strip("-")
+ return s
+
+def _klub_domain_candidates(naziv: str) -> list[str]:
+ """Generate ranked candidate URLs from club name."""
+ if not naziv: return []
+ s = _slugify_klub(naziv)
+ # Strip common prefixes for cleaner domains
+ base = s
+ for pref in ("hnk-","nk-","rk-","kk-","ok-","bk-","gk-","tk-","ak-","hbk-"):
+ if base.startswith(pref):
+ base = base[len(pref):]; break
+ # also try short prefix-ed variants
+ short = base.split("-")[0] if base else ""
+ candidates = []
+ sports_prefixes = ["nk-","hnk-","rk-","kk-","bk-","ok-","ak-","tk-"]
+ # full slug with original prefix
+ for tld in (".hr",".com",".eu",".info"):
+ candidates.append(f"https://{s}{tld}")
+ candidates.append(f"https://www.{s}{tld}")
+ # base-only
+ for tld in (".hr",".com"):
+ candidates.append(f"https://{base}{tld}")
+ candidates.append(f"https://www.{base}{tld}")
+ # try sport prefixes if name doesn't already have one
+ if not any(s.startswith(p) for p in sports_prefixes):
+ for sp in sports_prefixes[:5]:
+ for tld in (".hr",".com"):
+ candidates.append(f"https://{sp}{base}{tld}")
+ # dedup, preserve order
+ seen, out = set(), []
+ for c in candidates:
+ if c not in seen:
+ seen.add(c); out.append(c)
+ return out[:20]
+
+def _probe_klub_url(url: str, naziv_tokens: list, timeout: int = 5) -> Optional[dict]:
+ """HEAD/GET probe; return doc with raw_text if URL is alive AND mentions club tokens."""
+ try:
+ import requests
+ r = requests.get(url, timeout=timeout, allow_redirects=True,
+ headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
+ if r.status_code != 200: return None
+ if len(r.text) < 200: return None
+ text = r.text.lower()
+ # Must mention at least one distinctive token from name
+ toks = [t.lower() for t in (naziv_tokens or []) if len(t) > 2]
+ if toks and not any(t in text for t in toks):
+ return None
+ return {"source": "domain_probe", "url": r.url, "raw_text": r.text[:50000]}
+ except Exception:
+ return None
+
+def _guess_klub_domains(naziv: str, tokens: list) -> Optional[dict]:
+ """Parallel probe candidates (5 workers, 4s timeout each); first hit wins."""
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ candidates = _klub_domain_candidates(naziv)
+ if not candidates: return None
+ with ThreadPoolExecutor(max_workers=8) as ex:
+ futs = {ex.submit(_probe_klub_url, url, tokens, 4): url for url in candidates[:16]}
+ for fut in as_completed(futs, timeout=10):
+ try:
+ doc = fut.result()
+ if doc:
+ # Cancel remaining (best effort)
+ for f in futs:
+ if not f.done(): f.cancel()
+ return doc
+ except Exception:
+ continue
+ return None
+
+def _scrape_klub_subpages(base_url: str, tokens: list) -> str:
+ """Fetch /kontakt /uprava /o-nama /o-klubu and concat texts."""
+ if not base_url: return ""
+ import requests
+ base = base_url.rstrip("/")
+ paths = ["/kontakt","/uprava","/o-nama","/o-klubu","/predsjednik","/klub","/contact","/about"]
+ accum = []
+ for path in paths:
+ try:
+ r = requests.get(base + path, timeout=4, allow_redirects=True,
+ headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
+ if r.status_code == 200 and len(r.text) > 200:
+ accum.append(r.text[:30000])
+ except Exception:
+ pass
+ return "\n\n".join(accum)
+
+
def _propose_for_klub(row: dict) -> dict:
naziv = row.get('naziv') or ''
- primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
+ # Only consider HTTP(S) URLs as valid primary sources — skip placeholder strings like 'godisnjak_2025'
+ raw_primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
+ primary = raw_primary if (raw_primary and isinstance(raw_primary, str) and raw_primary.startswith(('http://','https://'))) else None
sources, evidence = [], []
+ tokens_pre = _name_tokens(naziv)
pdoc = _fetch_primary_site(primary) if primary else None
+ if not pdoc:
+ # No valid web in DB — try to guess domain from club name
+ pdoc = _guess_klub_domains(naziv, tokens_pre)
+ if pdoc:
+ # Also fetch subpages for richer evidence
+ sub = _scrape_klub_subpages(pdoc.get('url',''), tokens_pre)
+ if sub:
+ pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
+ elif pdoc:
+ # Have primary site — also fetch its subpages
+ sub = _scrape_klub_subpages(pdoc.get('url') or primary, tokens_pre)
+ if sub:
+ pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '')
wiki = _wiki_summary(naziv)
if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
@@ -1121,7 +1254,7 @@ def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid:
'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
'missing_fields': missing,
'live_snippet': _fetch_title(primary) if primary else None,
- 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')),
+ 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row),
'sport': row.get('sport'),
'sport_federation': (lambda f: {
'national': (f.get('national') or {}).get('name') if f else None,
diff --git a/static/app.html b/static/app.html
index c94f455..9f7ae72 100644
--- a/static/app.html
+++ b/static/app.html
@@ -265,7 +265,7 @@ table tbody tr:hover{background:var(--bg3)}