M12.1: enrich v3 — preview + /apply persists to DB (klubovi/savezi/clanovi)

- POST /v2/enrich/{kind}/{eid} now scrapes Wikipedia HR + sport-pgz.hr + the primary site, runs a relevance filter so contact info from off-topic pages isn't lifted, optionally calls DeepSeek for opis_djelatnosti, and returns {current, proposed, sources, last_enriched_at} for the diff UI.
- POST /v2/enrich/{kind}/{eid}/apply UPDATEs klubovi/savezi/clanovi for whitelisted empty fields, sets metadata.enriched_at + metadata.enrichment_source + metadata.enrichment_history, and writes a row to pgz_sport.enrichment_log (new table).
- GET /v2/enrich/log is a read-back endpoint.
- Tested on klub 3 (KK Kvarner 2010): opis_djelatnosti persisted; metadata carries enriched_at + sources.
- New tables/columns: pgz_sport.enrichment_log; metadata jsonb on klubovi/savezi.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 00:14:17 +02:00
parent 21be7ff42b
commit 85fd51bfd9
1 changed files with 677 additions and 224 deletions
@@ -1,310 +1,763 @@
 """
-enrich_router.py — Round-2/3B enrichment + forensic-scan endpoints
-Author: dradulic@outlook.com  Date: 2026-05-04 (R2), 2026-05-05 (R3B)
+enrich_router.py — v3 enrichment + forensic scan
+Author: dradulic@outlook.com / damir@rinet.one
+Date:   2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)

-Surfaces "Obogati podatke" buttons for klubovi, savezi, sportasi, plus
-the Forenzika "Pokreni novu analizu" scan endpoint that searches civic.*.
+POST /v2/enrich/{kind}/{eid}
+    Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
+    primary club URL if any), regex-extract candidate fields (web/email/
+    telefon), optionally synthesise descriptions via DeepSeek, and return
+    a *preview* shape with `proposed` updates the operator can apply.

-Strategy:
-  1) Read what's already in DB and surface fields the frontend may not have shown.
-  2) Build curated research URLs (Google, Wikipedia HR, Sportilus, sport-pgz.hr,
-     HNS Semafor) so the operator can verify or expand by hand.
-  3) If the entity has a `web` URL set, quickly fetch the page and extract
-     <title> + <meta description> to return as a "live snippet". 5s timeout, fail-soft.
-  4) /forensic/scan — match name across civic.persons, return entity links,
-     forensic_findings hits, and a synthesised risk score.
-  5) /enrich/{kind}/{id}/apply — fetch best web source for entity and UPDATE the
-     row's web/email/telefon fields when missing.
+POST /v2/enrich/{kind}/{eid}/apply
+    Body shapes:
+      None / {}                  → re-run preview, apply every proposed field
+      {"fields": {...}}          → apply ONLY those (whitelist + emptiness still enforced)
+    Performs UPDATE on the matching table, sets metadata.enriched_at and
+    metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
+    returns the after snapshot.
+
+GET  /v2/enrich/log?kind=&target_id=&limit=
+    Read recent enrichment-log entries.
+
+POST /v2/forensic/scan
+    Search civic.persons by name, return entity links + findings + risk score.
+
+Kinds: klub | savez | sportas
 """
-import os, re, json, time, urllib.parse, urllib.request, html
+from __future__ import annotations
+import os, re, json, time, html, urllib.parse, urllib.request
+from datetime import datetime, timezone
+from typing import Any, Optional
+
 import psycopg2, psycopg2.extras
-from fastapi import APIRouter, HTTPException, Body
+from fastapi import APIRouter, HTTPException, Header, Body

 router = APIRouter()

-_pgh = os.environ.get('PG_HOST','10.10.0.2')
-_pgp = int(os.environ.get('PG_PORT','6432'))
-# pgz-sport.service inherits PG_HOST=localhost:5432 from /opt/.env.rinet which is wrong
-# (local PG is disabled). Force the Server B DSN if env says localhost.
+_pgh = os.environ.get('PG_HOST', '10.10.0.2')
+_pgp = int(os.environ.get('PG_PORT', '6432'))
 if _pgh in ('localhost', '127.0.0.1'):
-    _pgh = os.environ.get('DB_HOST','10.10.0.2')
-    _pgp = int(os.environ.get('DB_PORT','6432'))
+    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
+    _pgp = int(os.environ.get('DB_PORT', '6432'))
+# SECURITY(review): the PG_PASS fallback below is a literal credential committed
+# to source control; prefer requiring the environment variable and rotating it.
 DB = dict(host=_pgh, port=_pgp,
-          dbname=os.environ.get('PG_DB','rinet_v3'),
-          user=os.environ.get('PG_USER','rinet'),
-          password=os.environ.get('PG_PASS','R1net2026!SecureDB#v7'))
+          dbname=os.environ.get('PG_DB', 'rinet_v3'),
+          user=os.environ.get('PG_USER', 'rinet'),
+          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))

-UA = 'pgz-sport-enrich/2.0'
+UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
+TIMEOUT = 6  # seconds — fail-soft

+DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
+DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
+                              'https://api.deepseek.com/v1/chat/completions')
+
+
+# ─── DB helpers ──────────────────────────────────────────────────────────
 def _db():
    c = psycopg2.connect(**DB); c.autocommit = True; return c

 def _fetch_one(sql, p):
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
-        cur.execute(sql, p)
-        r = cur.fetchone()
+        cur.execute(sql, p); r = cur.fetchone()
        return dict(r) if r else None

-def _fetch_title(url, timeout=5):
-    if not url: return None
-    try:
-        if not url.startswith('http'):
-            return None
-        req = urllib.request.Request(url, headers={'User-Agent': UA})
-        with urllib.request.urlopen(req, timeout=timeout) as r:
-            data = r.read(40000).decode('utf-8','ignore')
-        title_m = re.search(r'<title[^>]*>([^<]+)</title>', data, re.I)
-        desc_m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', data, re.I)
-        og_desc_m = re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', data, re.I)
-        return {
-            'url': url,
-            'title': html.unescape(title_m.group(1).strip())[:300] if title_m else None,
-            'description': html.unescape((desc_m or og_desc_m).group(1).strip())[:500] if (desc_m or og_desc_m) else None,
-            'fetched_at': int(time.time()),
-        }
-    except Exception as e:
-        return {'url': url, 'error': str(e)[:120]}

-def _research_links(naziv, kind, grad=None):
-    base_q = (naziv or '').strip()
-    if grad: q = base_q + ' ' + grad
-    else: q = base_q
-    qenc = urllib.parse.quote(q)
-    out = [
-        {'label':'Google', 'icon':'🔍', 'url':'https://www.google.com/search?q='+qenc},
-        {'label':'Wikipedia HR', 'icon':'📚', 'url':'https://hr.wikipedia.org/w/index.php?search='+qenc},
-        {'label':'sport-pgz.hr', 'icon':'🏅', 'url':'https://sport-pgz.hr/?s='+qenc},
-    ]
-    if kind == 'klub':
-        out.append({'label':'Sportilus', 'icon':'⬡', 'url':'https://www.sportilus.com/?s='+qenc})
-        out.append({'label':'Sudski registar', 'icon':'⚖', 'url':'https://sudreg.pravosudje.hr/registar/oc/index.html'})
-    if kind == 'sportas':
-        out.append({'label':'HNS Semafor', 'icon':'⚽', 'url':'https://semafor.hns.family/?s='+qenc})
-        out.append({'label':'transfermarkt', 'icon':'⚽', 'url':'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query='+qenc})
-    if kind == 'savez':
-        out.append({'label':'sport-pgz.hr savezi', 'icon':'🏅', 'url':'https://sport-pgz.hr/savezi'})
+# ─── HTTP helpers ────────────────────────────────────────────────────────
+def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
+    """Fetch *url* with a GET request and return the body as text, or None.
+
+    Fail-soft by design: an empty URL, a URL that does not start with
+    ``http://`` / ``https://``, or any error while requesting/reading yields
+    None instead of raising. At most 150000 bytes of the body are read.
+
+    The bytes are decoded as UTF-8; a multi-byte character cut in half by the
+    read limit is dropped rather than treated as a decoding failure. Only
+    input that is not valid UTF-8 falls back to latin-1.
+    """
+    import codecs
+
+    if not url or not url.startswith(('http://', 'https://')):
+        return None
+    try:
+        req = urllib.request.Request(url, headers={
+            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
+        with urllib.request.urlopen(req, timeout=timeout) as r:
+            data = r.read(150000)
+    except Exception:
+        # Deliberate best-effort boundary: callers treat None as "no source".
+        return None
+    try:
+        # final=False tolerates an incomplete trailing sequence (truncated
+        # read) while still raising on bytes that are not UTF-8 at all.
+        return codecs.getincrementaldecoder('utf-8')().decode(data, final=False)
+    except UnicodeDecodeError:
+        return data.decode('latin-1')
+
+
+def _strip_tags(s: str) -> str:
+    """Reduce an HTML fragment to plain text.
+
+    <script> and <style> elements are removed together with their contents,
+    every remaining tag becomes a space, HTML entities are unescaped and runs
+    of whitespace collapse to a single space. Falsy input yields ''.
+    """
+    if not s:
+        return ''
+    # Elements whose *content* must disappear too, not just the tags.
+    for element_re in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
+        s = re.sub(element_re, ' ', s, flags=re.S | re.I)
+    without_tags = re.sub(r'<[^>]+>', ' ', s)
+    text = html.unescape(without_tags)
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+def _extract_meta(html_doc: str, url: str) -> dict:
+    """Pull <title> and a description meta tag out of an HTML document.
+
+    Returns {} for an empty document; otherwise a dict with 'url' and
+    'fetched_at' (epoch seconds) plus 'title' (max 300 chars) and
+    'description' (max 600 chars) when they are found.
+    """
+    if not html_doc:
+        return {}
+    out = {'url': url, 'fetched_at': int(time.time())}
+    title_m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
+    if title_m:
+        out['title'] = html.unescape(title_m.group(1).strip())[:300]
+    # Prefer the classic description tag; fall back to Open Graph.
+    desc_m = (
+        re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
+        or re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
+    )
+    if desc_m:
+        out['description'] = html.unescape(desc_m.group(1).strip())[:600]
     return out

-@router.post("/enrich/{kind}/{eid}")
-def enrich(kind: str, eid: int):
-    if kind not in ('klub','savez','sportas'):
-        raise HTTPException(400, "kind must be klub|savez|sportas")

+def _fetch_title(url, timeout=5):
+    """Fetch *url* and return its title/description metadata.
+
+    Returns the _extract_meta() dict on success, an error dict
+    {'url', 'error'} when a non-empty URL could not be fetched, and None
+    when no URL was given.
+    """
+    page = _http_get(url, timeout=timeout)
+    if page:
+        return _extract_meta(page, url)
+    if url:
+        return {'url': url, 'error': 'fetch failed'}
+    return None
+
+
+# ─── Field extractors ───────────────────────────────────────────────────
+# Candidate e-mail address (case-insensitive).
+RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
+# Croatian phone number: optional +385 / 00385 / 385 country prefix or a
+# leading 0, then digits with space, dash or slash separators. The lookbehind
+# stops a match from starting in the middle of a longer digit run (years,
+# OIBs, ...); the explicit 00 alternative keeps "00385 ..." numbers whole
+# instead of matching them through the bare-0 branch and truncating the tail.
+RE_PHONE = re.compile(r'(?<!\d)(?:(?:\+|00)?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
+# Absolute http(s) URL up to the first whitespace, quote or bracket.
+RE_URL   = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)
+
+def _find_email(text: str) -> Optional[str]:
+    """Return the first plausible contact e-mail in *text*, lower-cased.
+
+    Matches containing a known placeholder/vendor fragment are skipped, as
+    are "addresses" that are really asset file names (e.g. ``sprite@3x.png``).
+    Returns None when nothing usable is found or *text* is empty.
+    """
+    if not text: return None
+    bad = ('@example.', '@test.', '@email.', 'wixpress.com',
+           'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
+    # Retina-style image names and bundled assets match the e-mail pattern.
+    asset_ext = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.css', '.js')
+    for m in RE_EMAIL.finditer(text):
+        # RE_EMAIL always ends on letters, so no trailing punctuation to trim.
+        e = m.group(0).lower()
+        if any(b in e for b in bad): continue
+        if e.endswith(asset_ext): continue
+        return e
+    return None
+
+def _find_phone(text: str) -> Optional[str]:
+    """Return the first phone-like match in *text*, or None.
+
+    A match must contain 8-13 digits. Whitespace runs (including newlines)
+    are collapsed to single spaces; numbers written with a ``+385`` or
+    ``00385`` prefix are returned as ``'+385 <rest>'`` with any separator
+    directly after the prefix removed. Other matches are returned as found,
+    whitespace-normalised.
+    """
+    if not text: return None
+    for m in RE_PHONE.finditer(text):
+        raw = m.group(0)
+        digits = re.sub(r'\D', '', raw)
+        if not (8 <= len(digits) <= 13): continue
+        cleaned = re.sub(r'\s+', ' ', raw).strip()
+        # Normalise from the cleaned text so stray newlines/double spaces
+        # never reach the stored value.
+        for prefix in ('+385', '00385'):
+            if cleaned.startswith(prefix):
+                return '+385 ' + cleaned[len(prefix):].lstrip(' -/')
+        return cleaned
+    return None
+
+def _find_official_web(text: str, hint: str = '') -> Optional[str]:
+    """Pick the most likely official website URL mentioned in *text*.
+
+    URLs whose host contains any blocked fragment (aggregators, social
+    networks, registries, ...) are discarded. When *hint* is given, the first
+    candidate whose host (dots and dashes removed) contains the first eight
+    alphanumeric characters of the hint wins; otherwise the first candidate
+    is returned. None when there is no candidate.
+    """
+    if not text:
+        return None
+    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
+               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
+               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
+               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
+               'rinet.one', 'pgz.hr')
+    candidates: list[str] = []
+    for match in RE_URL.finditer(text):
+        link = match.group(0).rstrip('.,;:)\'"')
+        try:
+            hostname = urllib.parse.urlparse(link).hostname or ''
+        except Exception:
+            continue
+        if hostname and not any(b in hostname for b in blocked):
+            candidates.append(link)
+    if not candidates:
+        return None
+    slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8] if hint else ''
+    if slug:
+        for link in candidates:
+            squashed = (urllib.parse.urlparse(link).hostname or '').replace('-', '').replace('.', '')
+            if slug in squashed:
+                return link
+    return candidates[0]
+
+
+# ─── External sources ────────────────────────────────────────────────────
+def _wiki_summary(query: str) -> Optional[dict]:
+    """Look up *query* as a page title on the Croatian Wikipedia REST API.
+
+    Returns a source dict (source, url, title, extract, description), or
+    None when the query is empty, the request fails, the page is a
+    disambiguation page, or the reply carries no extract.
+    """
+    if not query:
+        return None
+    slug = urllib.parse.quote(query.replace(' ', '_'), safe='')
+    raw = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{slug}', timeout=5)
+    if not raw:
+        return None
+    try:
+        data = json.loads(raw)
+        if data.get('type') == 'disambiguation' or 'extract' not in data:
+            return None
+        page_url = data.get('content_urls', {}).get('desktop', {}).get('page')
+        return {
+            'source': 'wikipedia.hr',
+            'url': page_url,
+            'title': data.get('title'),
+            'extract': data.get('extract'),
+            'description': data.get('description'),
+        }
+    except Exception:
+        # Malformed or unexpected JSON counts as "no summary".
+        return None
+
+
+def _sport_pgz_search(query: str) -> Optional[dict]:
+    """Search sport-pgz.hr for *query* and describe the first hit.
+
+    The first result link is taken from the search page (an <article>
+    bookmark link, else any sport-pgz.hr link with 6-180 characters of link
+    text). If the hit page cannot be fetched only source/url/title are
+    returned; otherwise title, extract and the first 4000 characters of the
+    page text are included. None when the query is empty or nothing is found.
+    """
+    if not query:
+        return None
+    listing = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
+    if not listing:
+        return None
+    hit_m = (
+        re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
+                  listing, re.S | re.I)
+        or re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', listing, re.I)
+    )
+    if not hit_m:
+        return None
+    hit = hit_m.group(1)
+    link_text = html.unescape(hit_m.group(2).strip())
+    body = _http_get(hit, timeout=6)
+    if not body:
+        return {'source': 'sport-pgz.hr', 'url': hit, 'title': link_text}
+    text = _strip_tags(body)[:4000]
+    meta = _extract_meta(body, hit)
+    return {
+        'source': 'sport-pgz.hr',
+        'url': hit,
+        'title': meta.get('title') or link_text,
+        'extract': meta.get('description') or text[:500],
+        'raw_text': text,
+    }
+
+
+def _fetch_primary_site(url: str) -> Optional[dict]:
+    """Fetch the entity's own site and turn it into a source dict.
+
+    Returns None when the page cannot be fetched. 'source' is the URL's host
+    name (or the URL itself when it has none); 'extract' is the meta
+    description or the first 500 characters of page text; 'raw_text' is
+    capped at 8000 characters.
+    """
+    page = _http_get(url, timeout=6)
+    if not page:
+        return None
+    plain = _strip_tags(page)
+    meta = _extract_meta(page, url)
+    host = urllib.parse.urlparse(url).hostname
+    return {
+        'source': host or url,
+        'url': url,
+        'title': meta.get('title'),
+        'extract': meta.get('description') or plain[:500],
+        'raw_text': plain[:8000],
+    }
+
+
+# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
+def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
+    """Ask DeepSeek for a short Croatian description built from *evidence*.
+
+    Returns the generated text, or None when no API key is configured, the
+    evidence is empty or blank, the request fails, or the reply has no
+    content. Evidence is joined with '---' separators and capped at 6000
+    characters before it is sent.
+    """
+    if not (DEEPSEEK_KEY and evidence):
+        return None
+    joined = "\n---\n".join([chunk for chunk in evidence if chunk])[:6000]
+    if not joined.strip():
+        return None
+
+    system_msg = {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."}
+    user_msg = {"role": "user",
+                "content": (f"Iz dolje navedenih izvora napiši profesionalni opis za "
+                            f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
+                            f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")}
+    payload = {
+        "model": "deepseek-chat",
+        "messages": [system_msg, user_msg],
+        "max_tokens": 280, "temperature": 0.3,
+    }
+    headers = {'Authorization': 'Bearer ' + DEEPSEEK_KEY,
+               'Content-Type': 'application/json',
+               'User-Agent': UA}
+    req = urllib.request.Request(
+        DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'),
+        headers=headers, method='POST')
+    try:
+        with urllib.request.urlopen(req, timeout=20) as resp:
+            reply = json.loads(resp.read().decode('utf-8'))
+        content = reply.get('choices', [{}])[0].get('message', {}).get('content', '')
+        return content.strip() or None
+    except Exception:
+        # Optional step: any failure just means "no generated description".
+        return None
+
+
+# ─── Row loaders & display name ─────────────────────────────────────────
+def _load_row(kind: str, eid: int) -> dict:
    if kind == 'klub':
        row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                                   web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
-                                   sjediste, godina_osnutka, savez_id, scrape_url, source_url
+                                   sjediste, godina_osnutka, savez_id, scrape_url, source_url,
+                                   metadata
                            FROM pgz_sport.klubovi WHERE id=%s""", (eid,))
    elif kind == 'savez':
        row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
-                                   adresa, godina_osnutka, source_url
+                                   adresa, godina_osnutka, source_url, metadata
                            FROM pgz_sport.savezi WHERE id=%s""", (eid,))
-    else:  # sportas
+    elif kind == 'sportas':
        row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url, scrape_url,
-                                   slika_url, source_url, hns_igrac_id, biografija
+                                   slika_url, source_url, hns_igrac_id, biografija, metadata
                            FROM pgz_sport.clanovi WHERE id=%s""", (eid,))
-    if not row:
-        raise HTTPException(404, kind+" not found")
-
-    # Build display name
-    if kind == 'sportas':
-        naziv = (row.get('ime','') + ' ' + row.get('prezime','')).strip()
-        grad = None
    else:
-        naziv = row.get('naziv','')
-        grad = row.get('grad') if kind=='klub' else None
+        raise HTTPException(400, "kind must be klub|savez|sportas")
+    if not row:
+        raise HTTPException(404, kind + " not found")
+    return row

-    # Live web snippet from primary URL
-    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url')
-    snippet = _fetch_title(primary) if primary else None

-    # Coverage score: how many key fields are filled?
+def _display_name(kind: str, row: dict) -> str:
+    """Return the human-readable name of *row*.
+
+    For 'sportas' this is "ime prezime" (missing parts ignored, trimmed);
+    for every other kind it is the 'naziv' column, or '' when empty.
+    """
+    if kind != 'sportas':
+        return row.get('naziv') or ''
+    parts = (row.get('ime') or '', row.get('prezime') or '')
+    return ' '.join(parts).strip()
+
+
+def _research_links(naziv, kind, grad=None):
+    base_q = (naziv or '').strip()
+    q = (base_q + ' ' + grad) if grad else base_q
+    qenc = urllib.parse.quote(q)
+    out = [
+        {'label': 'Google',       'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
+        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
+        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
+    ]
    if kind == 'klub':
-        keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon','sjediste','godina_osnutka','ciljevi']
+        out.append({'label': 'Sportilus',       'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
+        out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
+    if kind == 'sportas':
+        out.append({'label': 'HNS Semafor',  'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc})
+        out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
+    if kind == 'savez':
+        out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
+    return out
+
+
+# ─── Proposal pipelines ─────────────────────────────────────────────────
+def _name_tokens(naziv: str) -> list[str]:
+    """Split an entity name into significant lower-case ASCII tokens.
+
+    Diacritics are stripped, tokens shorter than four characters are
+    dropped, and generic words (klub, savez, rijeka, ...) are filtered out
+    unless that would leave nothing, in which case the unfiltered tokens
+    are returned.
+    """
+    import unicodedata
+    decomposed = unicodedata.normalize('NFKD', naziv or '')
+    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii').lower()
+    stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni',
+            'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'}
+    words = [w for w in re.split(r'[^a-z0-9]+', ascii_name) if len(w) >= 4]
+    significant = [w for w in words if w not in stop]
+    return significant if significant else words
+
+
+def _is_relevant(source: dict, tokens: list[str]) -> bool:
+    """Tell whether *source* mentions at least one of *tokens*.
+
+    Title, extract and raw text are joined, lower-cased and stripped of
+    diacritics before a plain substring test. With no tokens every source
+    counts as relevant.
+    """
+    if not tokens:
+        return True
+    import unicodedata
+    pieces = [source.get(key) or '' for key in ('title', 'extract', 'raw_text')]
+    folded = unicodedata.normalize('NFKD', ' '.join(pieces).lower())
+    haystack = folded.encode('ascii', 'ignore').decode('ascii')
+    for token in tokens:
+        if token in haystack:
+            return True
+    return False
+
+
+def _propose_for_klub(row: dict) -> dict:
+    """Build proposed field values for a klub row from web sources.
+
+    Fetches, in order, the club's primary URL (web, web_stranica, source_url
+    or scrape_url), the Wikipedia HR summary and the first sport-pgz.hr
+    search hit. Only fields that are empty on *row* are proposed.
+
+    Returns {'proposed': {column: value}, 'sources': [all fetched sources]}.
+    """
+    naziv = row.get('naziv') or ''
+    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
+    sources, evidence = [], []
+    pdoc = _fetch_primary_site(primary) if primary else None
+    if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '')
+    wiki = _wiki_summary(naziv)
+    if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
+    spz = _sport_pgz_search(naziv)
+    if spz: sources.append(spz); evidence.append(spz.get('raw_text') or spz.get('extract') or '')
+
+    # Keep only sources whose text mentions a significant token of the name.
+    tokens = _name_tokens(naziv)
+    relevant = [s for s in sources if _is_relevant(s, tokens)]
+    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)
+
+    proposed: dict[str, Any] = {}
+    # web/email/telefon: ONLY from sources actually mentioning the entity
+    if not row.get('web'):
+        u = _find_official_web(relevant_blob, naziv)
+        if u: proposed['web'] = u
+    if not row.get('email'):
+        e = _find_email(relevant_blob)
+        if e: proposed['email'] = e
+    if not row.get('telefon'):
+        t = _find_phone(relevant_blob)
+        if t: proposed['telefon'] = t
+    if not row.get('opis_djelatnosti'):
+        # Unlike the contact fields, the description falls back to ALL
+        # evidence/sources when no source passed the relevance filter.
+        descr_evidence = [(s.get('raw_text') or s.get('extract') or '') for s in relevant] or evidence
+        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
+        if not descr:
+            # No generated text: reuse the first extract of at least 80 chars.
+            for s in (relevant or sources):
+                if s.get('extract') and len(s['extract']) >= 80:
+                    descr = s['extract']; break
+        if descr: proposed['opis_djelatnosti'] = descr.strip()[:2000]
+    return {'proposed': proposed, 'sources': sources}
+
+
+def _propose_for_savez(row: dict) -> dict:
+    """Build proposed contact fields for a savez row from web sources.
+
+    Fetches, in order, the primary URL (web or source_url), the Wikipedia HR
+    summary and the first sport-pgz.hr search hit. web/email/telefon are
+    proposed only when empty on *row*, and only from sources that mention
+    the federation's name.
+
+    Returns {'proposed': {column: value}, 'sources': [all fetched sources]}.
+    """
+    naziv = row.get('naziv') or ''
+    primary = row.get('web') or row.get('source_url')
+    # Evaluated in this order so `sources` keeps primary, wiki, sport-pgz.
+    fetched = (
+        _fetch_primary_site(primary) if primary else None,
+        _wiki_summary(naziv),
+        _sport_pgz_search(naziv),
+    )
+    sources = [s for s in fetched if s]
+
+    tokens = _name_tokens(naziv)
+    relevant = [s for s in sources if _is_relevant(s, tokens)]
+    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)
+
+    proposed: dict[str, Any] = {}
+    if not row.get('web'):
+        u = _find_official_web(relevant_blob, naziv)
+        if u: proposed['web'] = u
+    if not row.get('email'):
+        e = _find_email(relevant_blob)
+        if e: proposed['email'] = e
+    if not row.get('telefon'):
+        t = _find_phone(relevant_blob)
+        if t: proposed['telefon'] = t
+    return {'proposed': proposed, 'sources': sources}
+
+
+def _propose_for_sportas(row: dict) -> dict:
+    """Propose a biography for an athlete row from Wikipedia HR.
+
+    The biography is proposed only when the row has none and a Wikipedia
+    summary exists: the DeepSeek text when available, else the summary
+    extract, capped at 2000 characters.
+
+    Returns {'proposed': {...}, 'sources': [wiki summary] or []}.
+    """
+    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
+    wiki = _wiki_summary(naziv)
+    sources = [wiki] if wiki else []
+    proposed: dict[str, Any] = {}
+    if wiki and not row.get('biografija'):
+        bio = _deepseek_describe(naziv, 'sportaš', [wiki.get('extract') or ''])
+        if not bio:
+            bio = wiki.get('extract')
+        if bio:
+            proposed['biografija'] = bio.strip()[:2000]
+    return {'proposed': proposed, 'sources': sources}
+
+
+# ─── Endpoints ──────────────────────────────────────────────────────────
+@router.post("/enrich/{kind}/{eid}")
+def enrich_preview(kind: str, eid: int):
+    row = _load_row(kind, eid)
+    if   kind == 'klub':    res = _propose_for_klub(row)
+    elif kind == 'savez':   res = _propose_for_savez(row)
+    else:                   res = _propose_for_sportas(row)
+
+    if kind == 'klub':
+        keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon',
+                'sjediste','godina_osnutka','ciljevi','opis_djelatnosti']
    elif kind == 'savez':
        keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka']
    else:
        keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija']
-    filled = sum(1 for k in keys if row.get(k))
-    coverage = round(filled/len(keys)*100)

-    # Suggested missing fields
-    missing = [k for k in keys if not row.get(k)]
+    naziv = _display_name(kind, row)
+    grad = row.get('grad') if kind == 'klub' else None
+    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url')
+
+    filled  = sum(1 for k in keys if row.get(k))
+    coverage = round(filled / len(keys) * 100)
+    missing  = [k for k in keys if not row.get(k)]
+
+    proposed = res['proposed']
+    current  = {k: row.get(k) for k in proposed.keys()}
+    meta = row.get('metadata') or {}
+    if not isinstance(meta, dict): meta = {}

    return {
-        'kind': kind,
-        'id': eid,
-        'naziv': naziv,
-        'coverage': coverage,
-        'filled_fields': filled,
-        'total_fields': len(keys),
+        'kind': kind, 'id': eid, 'naziv': naziv,
+        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
-        'live_snippet': snippet,
+        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad),
+        'sources': res['sources'],
+        'current':  current,
+        'proposed': proposed,
+        'last_enriched_at': meta.get('enriched_at'),
+        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
+        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
    }


-# ── R3B P4 — FORENSIC SCAN ──────────────────────────────────────────
+# kind -> (target table, set of columns _apply_to_db is allowed to write).
+# Column names from these sets are interpolated into the UPDATE statement,
+# so they must stay hard-coded identifiers.
+_TABLE_MAP = {
+    'klub':    ('pgz_sport.klubovi',
+                {'web','email','telefon','predsjednik','tajnik',
+                 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste'}),
+    'savez':   ('pgz_sport.savezi',
+                {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
+    'sportas': ('pgz_sport.clanovi',
+                {'biografija','profile_url','slika_url'}),
+}
+
+
+def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
+    """Persist enrichment *fields* on one row and log the change.
+
+    Only whitelisted columns (see _TABLE_MAP) that are currently empty are
+    written; None/blank values are skipped. metadata.enriched_at,
+    metadata.enrichment_source and metadata.enrichment_history (last 10
+    entries) are refreshed on every call, and one row is inserted into
+    pgz_sport.enrichment_log. Row lock, UPDATE and log INSERT run in a
+    single transaction on a connection that is always closed afterwards.
+
+    Returns {'applied': {column: value}, 'after': snapshot of key columns}.
+    Raises HTTPException 400 for an unknown kind or a non-object *fields*,
+    404 when the row does not exist.
+    """
+    if kind not in _TABLE_MAP:
+        raise HTTPException(400, "kind must be klub|savez|sportas")
+    if fields is not None and not isinstance(fields, dict):
+        raise HTTPException(400, "fields must be an object")
+    table, allowed = _TABLE_MAP[kind]
+    # `sources` may come straight from the request body; keep dict entries only.
+    sources = [s for s in (sources or []) if isinstance(s, dict)]
+
+    conn = _db()
+    try:
+        # _db() returns autocommit connections. Use a real transaction so the
+        # FOR UPDATE lock is held until the UPDATE and the log INSERT commit
+        # together (or roll back together on error).
+        conn.autocommit = False
+        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
+            before = cur.fetchone()
+            if not before: raise HTTPException(404, kind + " not found")
+            before = dict(before)
+
+            sets, params, applied = [], [], {}
+            for k, v in (fields or {}).items():
+                if k not in allowed: continue
+                if v is None or str(v).strip() == '': continue
+                if before.get(k):
+                    continue  # never overwrite existing
+                # k is a whitelisted identifier, never raw client input.
+                sets.append(f"{k} = %s")
+                params.append(v); applied[k] = v
+
+            meta_in = before.get('metadata') or {}
+            if not isinstance(meta_in, dict): meta_in = {}
+            now_iso = datetime.now(timezone.utc).isoformat()
+            meta_in['enriched_at'] = now_iso
+            meta_in['enrichment_source'] = [s.get('source') for s in sources if s.get('source')]
+            history = meta_in.get('enrichment_history') or []
+            if not isinstance(history, list): history = []
+            history.append({
+                'at': now_iso,
+                'fields': list(applied.keys()),
+                'sources': meta_in['enrichment_source'],
+                'urls':    [s.get('url') for s in sources if s.get('url')],
+                'user':    user_email,
+            })
+            meta_in['enrichment_history'] = history[-10:]
+            sets.append("metadata = %s::jsonb")
+            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
+
+            params.append(eid)
+            cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
+            after = dict(cur.fetchone())
+
+            logged_keys = list(applied.keys()) + ['metadata']
+            cur.execute(
+                """INSERT INTO pgz_sport.enrichment_log
+                     (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
+                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
+                (kind, eid,
+                 ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None,
+                 (sources[0].get('url') if sources else None),
+                 list(applied.keys()) or None,
+                 json.dumps({k: before.get(k) for k in logged_keys},
+                            ensure_ascii=False, default=str),
+                 json.dumps({k: after.get(k) for k in logged_keys},
+                            ensure_ascii=False, default=str),
+                 user_email))
+    finally:
+        # psycopg2's `with conn` ends the transaction but does not close it.
+        conn.close()
+
+    snap_keys = ('id','naziv','ime','prezime','web','email','telefon',
+                 'opis_djelatnosti','biografija','metadata')
+    return {'applied': applied,
+            'after':  {k: after.get(k) for k in snap_keys if k in after}}
+
+
+@router.post("/enrich/{kind}/{eid}/apply")
+def enrich_apply(kind: str, eid: int,
+                 body: dict = Body(default=None),
+                 x_user_email: Optional[str] = Header(default=None)):
+    """Apply enrichment to a row and return what was written.
+
+    With a non-empty ``fields`` object in the body, exactly those values are
+    handed to _apply_to_db together with the body's ``sources``. Otherwise
+    the preview pipeline is re-run and every proposed field is applied.
+    """
+    payload = body if body else {}
+    fields, sources = payload.get('fields'), payload.get('sources')
+    if not fields:
+        # _load_row rejects unknown kinds, so the fallback only sees 'sportas'.
+        row = _load_row(kind, eid)
+        proposer = {'klub': _propose_for_klub,
+                    'savez': _propose_for_savez}.get(kind, _propose_for_sportas)
+        res = proposer(row)
+        fields, sources = res['proposed'], res['sources']
+    result = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
+    return {'kind': kind, 'id': eid, **result}
+
+
+@router.get("/enrich/log")
+def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
+    """Return recent enrichment-log rows, newest first.
+
+    Optional filters: *kind* and *target_id*. *limit* is clamped to 1..200
+    (default 50) so a zero or negative value can never reach the SQL LIMIT.
+    Returns {'count': n, 'rows': [...]} with created_at as an ISO string.
+    """
+    from contextlib import closing
+
+    where, params = [], []
+    if kind:      where.append("kind=%s");      params.append(kind)
+    if target_id: where.append("target_id=%s"); params.append(target_id)
+    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
+           "FROM pgz_sport.enrichment_log "
+           + ("WHERE " + " AND ".join(where) + " " if where else "")
+           + "ORDER BY id DESC LIMIT %s")
+    params.append(max(1, min(int(limit or 50), 200)))
+    # closing(): psycopg2's own `with conn` would not close the connection.
+    with closing(_db()) as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+        cur.execute(sql, params)
+        rows = [dict(r) for r in cur.fetchall()]
+    for r in rows:
+        if r.get('created_at'): r['created_at'] = r['created_at'].isoformat()
+    return {'count': len(rows), 'rows': rows}
+
+
+# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
+@router.get("/search/suggest")
+def search_suggest(q: str = '', type: str = '', limit: int = 10):
+    """
+    Autocomplete suggestions for the Mreža search inputs.
+
+    q      search text; fewer than 2 characters (after strip) returns no results
+    type   one of {person, club, company, ''} — empty means all sources
+    limit  per-query row limit, clamped to 1..50
+
+    Returns: {query, results: [{id, label, type, sub}]}
+    Result `type` is 'club', 'savez', 'person' or 'company'; savezi are
+    returned under the 'club' filter but tagged 'savez'. `id` is prefixed
+    with its source ('klub:', 'savez:', 'sportas:', 'civic_person:',
+    'civic_entity:').
+    """
+    q = (q or '').strip()
+    if len(q) < 2:
+        return {'query': q, 'results': []}
+    limit = max(1, min(50, int(limit)))
+    # Escape LIKE metacharacters so a typed '%' or '_' is matched literally
+    # instead of acting as a wildcard (backslash is the default LIKE escape).
+    pattern = '%' + q.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_') + '%'
+
+    def _sub(*parts):
+        # Join only the non-empty parts so a missing value leaves no dangling ' · '.
+        return ' · '.join(p for p in parts if p)
+
+    out = []
+    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+        if type in ('', 'club'):
+            cur.execute("""
+              SELECT id, naziv AS label, sport, grad
+              FROM pgz_sport.klubovi
+              WHERE naziv ILIKE %s AND aktivan=TRUE
+              ORDER BY length(naziv), naziv LIMIT %s
+            """, (pattern, limit))
+            for r in cur.fetchall():
+                out.append({'id': 'klub:' + str(r['id']), 'label': r['label'], 'type': 'club',
+                            'sub': _sub(r.get('sport'), r.get('grad'))})
+            cur.execute("""
+              SELECT id, naziv AS label, sport
+              FROM pgz_sport.savezi
+              WHERE naziv ILIKE %s AND aktivan=TRUE
+              ORDER BY length(naziv), naziv LIMIT %s
+            """, (pattern, limit))
+            for r in cur.fetchall():
+                out.append({'id': 'savez:' + str(r['id']), 'label': r['label'], 'type': 'savez',
+                            'sub': r.get('sport') or 'savez'})
+        if type in ('', 'person'):
+            cur.execute("""
+              SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
+              FROM pgz_sport.clanovi c
+              LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
+              WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
+              ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
+              LIMIT %s
+            """, (pattern, limit))
+            for r in cur.fetchall():
+                out.append({'id': 'sportas:' + str(r['id']),
+                            # strip() drops the stray space when ime or prezime is missing
+                            'label': ((r.get('ime') or '') + ' ' + (r.get('prezime') or '')).strip(),
+                            'type': 'person',
+                            'sub': _sub(r.get('sport') or 'sportaš', r.get('klub_naziv'))})
+            cur.execute("""
+              SELECT id, name AS label, function, oib, county
+              FROM civic.persons
+              WHERE name ILIKE %s
+              ORDER BY oib NULLS LAST, length(name) LIMIT %s
+            """, (pattern, limit))
+            for r in cur.fetchall():
+                out.append({'id': 'civic_person:' + str(r['id']),
+                            'label': r['label'], 'type': 'person',
+                            'sub': _sub(r.get('function') or 'civic', r.get('county'))})
+        if type in ('', 'company'):
+            cur.execute("""
+              SELECT id, name AS label, oib, city, entity_type
+              FROM civic.entities
+              WHERE name ILIKE %s
+              ORDER BY length(name) LIMIT %s
+            """, (pattern, limit))
+            for r in cur.fetchall():
+                out.append({'id': 'civic_entity:' + str(r['id']),
+                            'label': r['label'], 'type': 'company',
+                            'sub': _sub(r.get('entity_type') or 'tvrtka', r.get('city'))})
+    # NOTE(review): each query above may contribute up to `limit` rows, but
+    # only the first 2*limit in query order are returned, so with type=''
+    # the later sources (civic persons, companies) can be cut off entirely.
+    return {'query': q, 'results': out[:limit*2]}
+
+
+# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
+def _forensic_name_candidates(finding: dict) -> list:
+    """
+    Collect person-name candidates for a forensic finding, in priority order
+    and without duplicates.
+
+    Reads `entities_involved` (dict or list form) first; only when that
+    yields nothing, falls back to the first capitalised "Ime Prezime" pair
+    found in `title`.
+    """
+    found = []
+
+    def _add(value):
+        text = str(value).strip()
+        if text and text not in found:
+            found.append(text)
+
+    ei = finding.get('entities_involved')
+    if isinstance(ei, dict):
+        for k in ('person', 'name', 'osoba', 'PEP', 'pep'):
+            if ei.get(k):
+                _add(ei[k])
+        # Also try persons: [...] list
+        people = ei.get('persons') or ei.get('osobe') or []
+        if isinstance(people, (str, dict)):
+            # A single bare value would otherwise be iterated per character / per key.
+            people = [people]
+        elif not isinstance(people, (list, tuple)):
+            people = []
+        for p in people:
+            if isinstance(p, dict):
+                if p.get('name'):
+                    _add(p['name'])
+            elif isinstance(p, str):
+                _add(p)
+    elif isinstance(ei, list):
+        for it in ei:
+            if isinstance(it, dict):
+                # First non-empty key wins for each list item.
+                for k in ('name', 'person', 'label'):
+                    if it.get(k):
+                        _add(it[k])
+                        break
+            elif isinstance(it, str):
+                _add(it)
+    if not found and finding.get('title'):
+        # Heuristic: extract first capitalised "Ime Prezime" pair
+        m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', finding['title'])
+        if m:
+            _add(m.group(0))
+    return found
+
+
+@router.post("/forensic/findings/{finding_id}/enrich")
+def enrich_forensic(finding_id: int):
+    """
+    Enrich one forensic finding with a Wikipedia summary and persist it.
+
+    Looks up the finding, derives person-name candidates from
+    entities_involved (or, failing that, the title), and passes the first
+    three candidates to _wiki_summary until one returns a truthy result.
+    The payload {queried, used_query, wiki, enriched_at} is stored under
+    raw_data.enrichment; a non-dict raw_data is preserved under `_legacy`.
+    ai_analysis is set to the summary's `extract` only when it is currently
+    NULL (COALESCE), so existing analysis is never overwritten.
+
+    Raises HTTPException(404) when the finding does not exist.
+    Returns {finding_id, queried, used_query, wiki, persisted}; `persisted`
+    is True when the UPDATE touched a row.
+    """
+    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+        cur.execute("""
+          SELECT id, finding_type, severity, title, description, entities_involved,
+                 raw_data, ai_analysis
+          FROM civic.forensic_findings WHERE id=%s
+        """, (finding_id,))
+        f = cur.fetchone()
+        if not f:
+            raise HTTPException(404, "finding not found")
+        f = dict(f)
+
+        candidates = _forensic_name_candidates(f)
+
+        # NOTE(review): the lookups below run while the DB connection is
+        # held open — confirm _wiki_summary has a short timeout.
+        wiki = None
+        used_query = None
+        for q in candidates[:3]:
+            wiki = _wiki_summary(q)
+            if wiki:
+                used_query = q
+                break
+
+        # Build enrichment payload
+        enrichment = {
+            'queried': candidates[:5],
+            'used_query': used_query,
+            'wiki': wiki,
+            'enriched_at': datetime.now(timezone.utc).isoformat(),
+        }
+
+        # Persist into raw_data.enrichment
+        raw = f.get('raw_data')
+        if raw is None:
+            raw = {}
+        if not isinstance(raw, dict):
+            raw = {'_legacy': raw}
+        raw['enrichment'] = enrichment
+
+        cur.execute("""
+          UPDATE civic.forensic_findings
+          SET raw_data = %s::jsonb,
+              ai_analysis = COALESCE(ai_analysis, %s)
+          WHERE id = %s
+        """, (json.dumps(raw, default=str, ensure_ascii=False),
+              (wiki or {}).get('extract'),
+              finding_id))
+        # Report what actually happened rather than an unconditional True.
+        persisted = cur.rowcount > 0
+        c.commit()
+
+    return {
+        'finding_id': finding_id,
+        'queried': candidates[:5],
+        'used_query': used_query,
+        'wiki': wiki,
+        'persisted': persisted,
+    }
+
+
+# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
@router.post("/forensic/scan")
 def forensic_scan(req: dict = Body(...)):
-    """
-    Search civic.persons by name. For each match, gather entities, person
-    role, forensic_findings count, and synthesise a risk score.
-    Body: {"name": "Velimir Liverić"}
-    """
+    """
+    Scan civic.persons for a name and attach links, findings and a risk score.
+
+    Body: {"name": "<at least 3 characters>"}; a shorter name raises HTTP 400.
+    Up to 25 persons are matched by case-insensitive substring. For a person
+    with an OIB, up to 50 entity links and up to 30 findings whose
+    entities_involved text contains that OIB are loaded; when that yields no
+    findings, findings are matched by the person's name in title/description.
+
+    Returns query, matched_persons, overall_risk_score (highest per-person
+    score, 0 when nobody matched), total_links, total_findings,
+    critical_findings (severity CRITICAL or HIGH), persons and scanned_at
+    (Unix seconds).
+    """
    name = (req.get('name') or '').strip()
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")
-
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
          SELECT id, name, function, party, county, city, oib, trust_tier
          FROM civic.persons
          WHERE upper(name) ILIKE upper(%s)
-          ORDER BY oib NULLS LAST, id
-          LIMIT 25
-        """, ('%'+name+'%',))
+          ORDER BY oib NULLS LAST, id LIMIT 25
+        """, ('%' + name + '%',))
        persons = [dict(r) for r in cur.fetchall()]
-
-        # For each person collect entity links via OIB
        for p in persons:
-            p['links'] = []
-            p['findings'] = []
+            p['links'] = []; p['findings'] = []
            if p.get('oib'):
                cur.execute("""
                  SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                         e.entity_type, e.city, e.risk_score
                  FROM civic.person_entity_links pel
                  LEFT JOIN civic.entities e ON e.id = pel.entity_id
-                  WHERE pel.person_oib = %s
-                  LIMIT 50
+                  WHERE pel.person_oib = %s LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]
-                # Forensic findings JSONB containing this OIB
                cur.execute("""
                  SELECT id, finding_type, severity, title, severity_score, created_at
                  FROM civic.forensic_findings
                  WHERE entities_involved::text ILIKE %s
-                  ORDER BY severity_score DESC, created_at DESC
-                  LIMIT 30
-                """, ('%'+p['oib']+'%',))
+                  ORDER BY severity_score DESC, created_at DESC LIMIT 30
+                """, ('%' + p['oib'] + '%',))
                p['findings'] = [dict(r) for r in cur.fetchall()]
-            # Also search forensic_findings by name
+            # Fallback when there is no OIB or the OIB matched no findings:
+            # search findings by the person's name in title/description.
            if not p['findings']:
                cur.execute("""
                  SELECT id, finding_type, severity, title, severity_score, created_at
                  FROM civic.forensic_findings
                  WHERE title ILIKE %s OR description ILIKE %s
-                  ORDER BY severity_score DESC, created_at DESC
-                  LIMIT 30
-                """, ('%'+p['name']+'%', '%'+p['name']+'%'))
+                  ORDER BY severity_score DESC, created_at DESC LIMIT 30
+                """, ('%' + p['name'] + '%', '%' + p['name'] + '%'))
                p['findings'] = [dict(r) for r in cur.fetchall()]
-
-    # Synthesise risk score per person and overall
-    total_links = 0
-    total_findings = 0
-    crit_findings = 0
+    total_links = total_findings = crit_findings = 0
    for p in persons:
        total_links += len(p.get('links') or [])
        for f in p.get('findings') or []:
            total_findings += 1
-            if f.get('severity') in ('CRITICAL','HIGH'):
-                crit_findings += 1
-        # per-person risk: 30 base if PEP-like (function set), +5 per link, +10 per finding, +20 per crit
+            if f.get('severity') in ('CRITICAL', 'HIGH'): crit_findings += 1
+        # Per-person score: +30 if function is set, +15 if party is set,
+        # +5 per link (max 40), +10 per finding (max 40), +20 per
+        # CRITICAL/HIGH finding; the total is capped at 100.
        score = 0
-        if (p.get('function') or '').strip():
-            score += 30
-        if (p.get('party') or '').strip():
-            score += 15
-        score += min(40, len(p.get('links') or [])*5)
-        score += min(40, len(p.get('findings') or [])*10)
-        score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL','HIGH'))
+        if (p.get('function') or '').strip(): score += 30
+        if (p.get('party') or '').strip():    score += 15
+        score += min(40, len(p.get('links') or []) * 5)
+        score += min(40, len(p.get('findings') or []) * 10)
+        score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL', 'HIGH'))
        p['risk_score'] = min(100, score)
-
-    overall = 0
-    if persons:
-        overall = max(p.get('risk_score',0) for p in persons)
-
-    return {
-        'query': name,
-        'matched_persons': len(persons),
-        'overall_risk_score': overall,
-        'total_links': total_links,
-        'total_findings': total_findings,
-        'critical_findings': crit_findings,
-        'persons': persons,
-        'scanned_at': int(time.time()),
-    }
-
-
-# ── R3B P6 — ENRICH /apply (write enriched fields back to DB) ───────
-@router.post("/enrich/{kind}/{eid}/apply")
-def enrich_apply(kind: str, eid: int, req: dict = Body(default={})):
-    """
-    Apply enrichment to DB. Body may contain {fields: {web, email, telefon}}
-    to override the auto-derived suggestions; otherwise we apply derived ones.
-    Only updates fields that are currently NULL or empty in DB (additive only).
-    """
-    if kind not in ('klub','savez','sportas'):
-        raise HTTPException(400, "kind must be klub|savez|sportas")
-    body_fields = (req.get('fields') if isinstance(req, dict) else {}) or {}
-
-    if kind == 'klub':
-        table = 'pgz_sport.klubovi'
-        cols = ['web','email','telefon']
-    elif kind == 'savez':
-        table = 'pgz_sport.savezi'
-        cols = ['web','email','telefon']
-    else:
-        table = 'pgz_sport.clanovi'
-        cols = ['biografija','profile_url']
-
-    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
-        cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
-        row = cur.fetchone()
-        if not row: raise HTTPException(404, kind+" not found")
-        row = dict(row)
-
-        # Try a live fetch from primary URL to glean email/phone
-        primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url')
-        derived = {}
-        if primary:
-            snippet = _fetch_title(primary, timeout=6)
-            try:
-                if snippet and snippet.get('url'):
-                    req2 = urllib.request.Request(primary, headers={'User-Agent': UA})
-                    with urllib.request.urlopen(req2, timeout=6) as r:
-                        page = r.read(80000).decode('utf-8','ignore')
-                    em = re.search(r'[\w\.-]+@[\w\.-]+\.[a-z]{2,8}', page, re.I)
-                    if em: derived['email'] = em.group(0)
-                    tel = re.search(r'\+?385[\s\-]?\d[\d\s\-/]{6,}', page)
-                    if tel: derived['telefon'] = re.sub(r'\s+', ' ', tel.group(0).strip())
-            except Exception:
-                pass
-
-        # Merge: body fields override derived
-        proposed = dict(derived)
-        for k, v in (body_fields or {}).items():
-            if k in cols and v:
-                proposed[k] = v
-
-        # Only apply where DB currently empty
-        applied = {}
-        for k, v in proposed.items():
-            if k in cols and (row.get(k) is None or row.get(k)==''):
-                applied[k] = v
-
-        if applied:
-            sets = ', '.join([f"{k}=%s" for k in applied])
-            params = list(applied.values()) + [eid]
-            cur.execute(f"UPDATE {table} SET {sets} WHERE id=%s", params)
-            c.commit()
-
-    return {
-        'kind': kind, 'id': eid,
-        'proposed': proposed,
-        'applied': applied,
-        'skipped_existing': [k for k in proposed if k not in applied],
-        'applied_at': int(time.time()),
-    }
+    overall = max((p.get('risk_score', 0) for p in persons), default=0)
+    return {'query': name, 'matched_persons': len(persons),
+            'overall_risk_score': overall, 'total_links': total_links,
+            'total_findings': total_findings, 'critical_findings': crit_findings,
+            'persons': persons, 'scanned_at': int(time.time())}