# pgz-sport/routers/enrich_router.py

"""
enrich_router.py — v3 enrichment + forensic scan
Author: dradulic@outlook.com / damir@rinet.one
Date:   2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)

POST /v2/enrich/{kind}/{eid}
    Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
    primary club URL if any), regex-extract candidate fields (web/email/
    telefon), optionally synthesise descriptions via DeepSeek, and return
    a *preview* shape with `proposed` updates the operator can apply.

POST /v2/enrich/{kind}/{eid}/apply
    Body shapes:
      None / {}                  → re-run preview, apply every proposed field
      {"fields": {...}}          → apply ONLY those (whitelist + emptiness still enforced)
    Performs UPDATE on the matching table, sets metadata.enriched_at and
    metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
    returns the after snapshot.

GET  /v2/enrich/log?kind=&target_id=&limit=
    Read recent enrichment-log entries.

POST /v2/forensic/scan
    Search civic.persons by name, return entity links + findings + risk score.

Kinds: klub | savez | sportas
"""
from __future__ import annotations
import os, re, json, time, html, urllib.parse, urllib.request
from datetime import datetime, timezone
from typing import Any, Optional

import psycopg2, psycopg2.extras
from fastapi import APIRouter, HTTPException, Header, Body

# Router object that the endpoint decorators further down in this module attach to.
router = APIRouter()

# PostgreSQL connection settings, read from the environment.
# When PG_HOST points at the local machine, DB_HOST / DB_PORT are used instead.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))
if _pgh in ('localhost', '127.0.0.1'):
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))
# NOTE(review): the PG_PASS fallback below is a real-looking credential committed
# to source. It should come from the environment / a secret store only; left
# untouched here because removing the default could break existing deployments.
DB = dict(host=_pgh, port=_pgp,
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))

# User-Agent header sent with every outbound HTTP request made by this module.
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
TIMEOUT = 6  # seconds — fail-soft

# Optional JS-aware fallback (Playwright). Imported at module load inside a
# try/except, so any import failure just disables the fallback; never required.
import sys as _sys
_sys.path.insert(0, '/opt/pgz-sport')
try:
    from enrichment import playwright_scraper as _pw_scraper
    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
    _pw_scraper = None
    _HAS_PW = False

# DeepSeek chat-completions settings; an empty key disables description synthesis.
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                              'https://api.deepseek.com/v1/chat/completions')


# ─── DB helpers ──────────────────────────────────────────────────────────
def _db():
    """Open a new psycopg2 connection from the ``DB`` settings with autocommit enabled."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn

def _fetch_one(sql, p):
    """Run ``sql`` with parameters ``p`` and return the first row as a dict.

    Returns None when the query yields no row.

    The connection is closed explicitly: psycopg2's ``with connection`` block
    only commits or rolls back the transaction, it does not close the
    connection, so relying on it alone leaks one connection per call.
    """
    conn = _db()
    try:
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            rec = cur.fetchone()
    finally:
        conn.close()
    return dict(rec) if rec else None


# ─── HTTP helpers ────────────────────────────────────────────────────────
def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch ``url`` and return at most 150 000 bytes of the body as text.

    Fail-soft: returns None for an empty / non-http(s) URL and for any error
    raised while requesting or reading the response.

    The body is decoded as UTF-8 when possible, otherwise as latin-1.
    """
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(150000)
    except Exception:
        return None
    try:
        # The byte cap can cut a multi-byte UTF-8 character in half. An
        # incremental decoder with final=False tolerates such an incomplete
        # tail (it is simply dropped) while still rejecting genuinely invalid
        # bytes, so a truncated UTF-8 page is no longer mis-decoded as latin-1.
        import codecs
        return codecs.getincrementaldecoder('utf-8')().decode(data, final=False)
    except UnicodeDecodeError:
        return data.decode('latin-1', 'ignore')


def _strip_tags(s: str) -> str:
    if not s: return ''
    s = re.sub(r'<script[^>]*>.*?</script>', ' ', s, flags=re.S | re.I)
    s = re.sub(r'<style[^>]*>.*?</style>',   ' ', s, flags=re.S | re.I)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = html.unescape(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def _extract_meta(html_doc: str, url: str) -> dict:
    if not html_doc: return {}
    out = {'url': url, 'fetched_at': int(time.time())}
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m: out['title'] = html.unescape(m.group(1).strip())[:300]
    m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    if not m:
        m = re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    if m: out['description'] = html.unescape(m.group(1).strip())[:600]
    return out


def _fetch_title(url, timeout=5):
    """Fetch ``url`` and return its title/description metadata.

    On fetch failure returns ``{'url': url, 'error': 'fetch failed'}``, or
    None when ``url`` itself is falsy.
    """
    page = _http_get(url, timeout=timeout)
    if page:
        return _extract_meta(page, url)
    if url:
        return {'url': url, 'error': 'fetch failed'}
    return None


# ─── Field extractors ───────────────────────────────────────────────────
# Loose e-mail matcher: local part, "@", dotted domain and an alphabetic TLD of
# two or more letters (case-insensitive).
RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
# Phone candidates: either "385" (optionally preceded by "+") or a leading "0",
# then a digit, 6-12 digits/separators (whitespace, "-", "/") and a closing digit.
RE_PHONE = re.compile(r'(?:\+?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
# Absolute http(s) URLs, ending at whitespace, a quote, "<", ">", ")" or "]".
RE_URL   = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)

def _find_email(text: str) -> Optional[str]:
    """Return the first plausible e-mail address in ``text``, lower-cased.

    Matches containing a known placeholder / vendor / tracker fragment are
    skipped, as are matches that are really asset file names (for example
    ``logo@3x.png``). Returns None when nothing usable is found.
    """
    if not text: return None
    bad = ('@example.', '@test.', '@email.', 'wixpress.com',
           'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
    # "name@3x.png"-style retina asset names match RE_EMAIL but are not addresses.
    asset_ext = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')
    for m in RE_EMAIL.finditer(text):
        # RE_EMAIL always ends on a letter, so no trailing punctuation to trim.
        e = m.group(0).lower()
        if any(b in e for b in bad): continue
        if e.endswith(asset_ext): continue
        return e
    return None

def _find_phone(text: str) -> Optional[str]:
    """Return the first phone-number candidate in ``text``, or None.

    A candidate must contain 8-13 digits. Internal whitespace (including
    line breaks) is collapsed to single spaces, and numbers written with a
    ``+385`` or ``00385`` prefix are normalised to ``+385 <rest>``.
    """
    if not text: return None
    for m in RE_PHONE.finditer(text):
        raw = m.group(0).strip()
        digits = re.sub(r'\D', '', raw)
        if not (8 <= len(digits) <= 13): continue
        cleaned = re.sub(r'\s+', ' ', raw).strip()
        # Work on the whitespace-normalised form in every branch so the
        # prefixed variants do not keep embedded newlines or a stray leading
        # separator/space after the country code.
        for prefix in ('+385', '00385'):
            if cleaned.startswith(prefix):
                return '+385 ' + cleaned[len(prefix):].lstrip(' -/')
        return cleaned
    return None

def _find_official_web(text: str, hint: str = '') -> Optional[str]:
    """Pick the most likely official website URL mentioned in ``text``.

    URLs whose host contains one of the blocked fragments (aggregators,
    social networks, registries, our own sites) are ignored. When ``hint``
    (normally the entity name) is given, the first candidate whose host
    contains the first 8 alphanumeric characters of the de-accented hint is
    preferred; otherwise the first candidate wins. Returns None when there
    is no candidate.
    """
    if not text: return None
    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
               'rinet.one', 'pgz.hr')
    # (url, host) pairs so each host is parsed only once.
    candidates: list[tuple[str, str]] = []
    for m in RE_URL.finditer(text):
        u = m.group(0).rstrip('.,;:)\'"')
        try:
            host = urllib.parse.urlparse(u).hostname or ''
        except ValueError:
            continue
        if not host or any(b in host for b in blocked): continue
        candidates.append((u, host))
    if not candidates: return None
    if hint:
        import unicodedata
        # De-accent before stripping non-alphanumerics; otherwise "Košarkaški"
        # becomes "koarkaki" and can never match a host name.
        ascii_hint = (unicodedata.normalize('NFKD', hint)
                      .encode('ascii', 'ignore').decode('ascii').lower())
        slug = re.sub(r'[^a-z0-9]', '', ascii_hint)[:8]
        if slug:
            for u, host in candidates:
                if slug in host.replace('-', '').replace('.', ''):
                    return u
    return candidates[0][0]


# ─── External sources ────────────────────────────────────────────────────
def _wiki_summary(query: str) -> Optional[dict]:
    """Look up ``query`` on the Croatian Wikipedia REST summary endpoint.

    Returns a source dict (``source``, ``url``, ``title``, ``extract``,
    ``description``) or None when the query is empty, the fetch fails, the
    page is a disambiguation page, has no extract, or the response cannot be
    parsed.
    """
    if not query:
        return None
    title = urllib.parse.quote(query.replace(' ', '_'), safe='')
    raw = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{title}', timeout=5)
    if not raw:
        return None
    try:
        page = json.loads(raw)
        is_disambiguation = page.get('type') == 'disambiguation'
        if is_disambiguation or 'extract' not in page:
            return None
        desktop_urls = page.get('content_urls', {}).get('desktop', {})
        summary = {
            'source': 'wikipedia.hr',
            'url': desktop_urls.get('page'),
            'title': page.get('title'),
            'extract': page.get('extract'),
            'description': page.get('description'),
        }
        return summary
    except Exception:
        return None


def _sport_pgz_js_fallback(query: str) -> Optional[dict]:
    """Return the optional Playwright scraper's result for ``query``, or None if it is unavailable."""
    if _HAS_PW and _pw_scraper is not None:
        return _pw_scraper.scrape_sport_pgz_klub(query)
    return None


def _sport_pgz_search(query: str) -> Optional[dict]:
    """Search sport-pgz.hr for ``query`` and describe the first hit.

    Returns a source dict with ``source``, ``url`` and ``title``, plus
    ``extract`` and ``raw_text`` when the hit page itself could be fetched.
    If the search page cannot be fetched or yields no parseable link, the
    optional Playwright scraper is tried instead; None when nothing works.
    """
    if not query: return None
    page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
    if not page:
        # Plain HTTP failed → try JS-rendered fallback if available.
        return _sport_pgz_js_fallback(query)
    m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
                  page, re.S | re.I)
    if not m:
        m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
    if not m:
        # Search page rendered but yielded nothing parseable — try JS fallback.
        return _sport_pgz_js_fallback(query)
    # The href is raw attribute text: decode entities (e.g. "&amp;") and resolve
    # relative links against the site root, otherwise the follow-up fetch can
    # never succeed for such links.
    hit = urllib.parse.urljoin('https://sport-pgz.hr/', html.unescape(m.group(1).strip()))
    link_title = html.unescape(m.group(2).strip())
    body = _http_get(hit, timeout=6)
    if not body:
        return {'source': 'sport-pgz.hr', 'url': hit, 'title': link_title}
    text = _strip_tags(body)[:4000]
    meta = _extract_meta(body, hit)
    return {
        'source': 'sport-pgz.hr',
        'url': hit,
        'title': meta.get('title') or link_title,
        'extract': meta.get('description') or text[:500],
        'raw_text': text,
    }


def _fetch_primary_site(url: str) -> Optional[dict]:
    """Fetch the entity's own page and wrap it as a source dict.

    ``source`` is the URL's host name (or the URL itself if it has none),
    ``extract`` is the meta description or the first 500 characters of the
    page text, and ``raw_text`` is capped at 8000 characters. Returns None
    when the page cannot be fetched.
    """
    page = _http_get(url, timeout=6)
    if not page:
        return None
    plain = _strip_tags(page)
    head = _extract_meta(page, url)
    host = urllib.parse.urlparse(url).hostname
    site = {
        'source': host or url,
        'url': url,
        'title': head.get('title'),
        'extract': head.get('description') or plain[:500],
        'raw_text': plain[:8000],
    }
    return site


# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a short Croatian description built from ``evidence``.

    Returns None when no API key is configured, when there is no non-blank
    evidence, or when the request / response handling fails for any reason.
    The joined evidence is capped at 6000 characters.
    """
    if not (DEEPSEEK_KEY and evidence):
        return None
    joined = "\n---\n".join(filter(None, evidence))[:6000]
    if not joined.strip():
        return None

    prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
              f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
              f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")
    system_msg = {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."}
    user_msg = {"role": "user",   "content": prompt}
    payload = {
        "model": "deepseek-chat",
        "messages": [system_msg, user_msg],
        "max_tokens": 280, "temperature": 0.3,
    }
    request_headers = {'Authorization': 'Bearer ' + DEEPSEEK_KEY,
                       'Content-Type': 'application/json',
                       'User-Agent': UA}
    request = urllib.request.Request(
        DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'),
        headers=request_headers, method='POST')
    try:
        with urllib.request.urlopen(request, timeout=20) as resp:
            reply = json.loads(resp.read().decode('utf-8'))
        first_choice = reply.get('choices', [{}])[0]
        content = first_choice.get('message', {}).get('content', '').strip()
    except Exception:
        # Fail-soft: any network / parsing problem just means "no description".
        return None
    return content or None


# ─── Row loaders & display name ─────────────────────────────────────────
def _load_row(kind: str, eid: int) -> dict:
    """Load the row to enrich for ``kind`` (klub | savez | sportas) and id ``eid``.

    Raises HTTPException 400 for an unknown kind and 404 when no row exists.
    """
    if kind == 'klub':
        sql = """SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                                   web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
                                   sjediste, godina_osnutka, savez_id, scrape_url, source_url,
                                   metadata
                            FROM pgz_sport.klubovi WHERE id=%s"""
    elif kind == 'savez':
        sql = """SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
                                   adresa, godina_osnutka, source_url, metadata
                            FROM pgz_sport.savezi WHERE id=%s"""
    elif kind == 'sportas':
        sql = """SELECT id, ime, prezime, sport, klub_id, profile_url,
                                   slika_url, source_url, source, source_id,
                                   hns_igrac_id, biografija,
                                   datum_rodenja, mjesto_rodenja, broj_dresa,
                                   visina_cm, tezina_kg, dominantna_noga, oib,
                                   vanjski_id, metadata
                            FROM pgz_sport.clanovi WHERE id=%s"""
    else:
        raise HTTPException(400, "kind must be klub|savez|sportas")

    record = _fetch_one(sql, (eid,))
    if not record:
        raise HTTPException(404, kind + " not found")
    return record


def _display_name(kind: str, row: dict) -> str:
    if kind == 'sportas':
        return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    return row.get('naziv', '') or ''


def _research_links(naziv, kind, grad=None):
    base_q = (naziv or '').strip()
    q = (base_q + ' ' + grad) if grad else base_q
    qenc = urllib.parse.quote(q)
    out = [
        {'label': 'Google',       'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
    ]
    if kind == 'klub':
        out.append({'label': 'Sportilus',       'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
        out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
    if kind == 'sportas':
        out.append({'label': 'HNS Semafor',  'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc})
        out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
    if kind == 'savez':
        out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
    return out


# ─── Proposal pipelines ─────────────────────────────────────────────────
def _name_tokens(naziv: str) -> list[str]:
    """Significant tokens from entity name (≥4 chars, deaccented)."""
    import unicodedata
    s = unicodedata.normalize('NFKD', naziv or '').encode('ascii', 'ignore').decode('ascii').lower()
    toks = [t for t in re.split(r'[^a-z0-9]+', s) if len(t) >= 4]
    stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni',
            'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'}
    return [t for t in toks if t not in stop] or toks


def _is_relevant(source: dict, tokens: list[str]) -> bool:
    """A source is 'relevant' only if the page actually mentions the entity name."""
    if not tokens: return True
    import unicodedata
    blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '')
    blob = unicodedata.normalize('NFKD', blob.lower()).encode('ascii', 'ignore').decode('ascii')
    return any(t in blob for t in tokens)


def _propose_for_klub(row: dict) -> dict:
    """Collect web sources for a club row and propose values for its empty fields.

    Sources are fetched in this order: the club's own page (first non-empty
    of web / web_stranica / source_url / scrape_url), Croatian Wikipedia and
    the sport-pgz.hr search. ``web``, ``email`` and ``telefon`` are proposed
    only from sources that mention the club name; ``opis_djelatnosti`` comes
    from DeepSeek or, failing that, the first extract of at least 80
    characters. Returns ``{'proposed': {...}, 'sources': [...]}``.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')

    def text_of(src: dict) -> str:
        return src.get('raw_text') or src.get('extract') or ''

    sources, evidence = [], []
    fetched = (
        _fetch_primary_site(primary) if primary else None,
        _wiki_summary(naziv),
        _sport_pgz_search(naziv),
    )
    for doc in fetched:
        if doc:
            sources.append(doc)
            evidence.append(text_of(doc))

    tokens = _name_tokens(naziv)
    relevant = [src for src in sources if _is_relevant(src, tokens)]
    relevant_blob = '\n\n'.join(text_of(src) for src in relevant)

    proposed: dict[str, Any] = {}
    # web/email/telefon: ONLY from sources actually mentioning the entity
    if not row.get('web'):
        site = _find_official_web(relevant_blob, naziv)
        if site:
            proposed['web'] = site
    if not row.get('email'):
        mail = _find_email(relevant_blob)
        if mail:
            proposed['email'] = mail
    if not row.get('telefon'):
        phone = _find_phone(relevant_blob)
        if phone:
            proposed['telefon'] = phone
    if not row.get('opis_djelatnosti'):
        # Prefer evidence from relevant sources; fall back to everything fetched.
        descr_evidence = [text_of(src) for src in relevant] or evidence
        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
        if not descr:
            for src in (relevant or sources):
                if src.get('extract') and len(src['extract']) >= 80:
                    descr = src['extract']
                    break
        if descr:
            proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}


def _propose_for_savez(row: dict) -> dict:
    """Collect web sources for a federation row and propose values for its empty fields.

    Sources are fetched in this order: the federation's own page (``web`` or
    ``source_url``), Croatian Wikipedia and the sport-pgz.hr search.
    ``web``, ``email`` and ``telefon`` are proposed only when the row's
    field is empty, and only from sources that mention the federation name.
    Returns ``{'proposed': {...}, 'sources': [...]}``.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('source_url')
    # No description is synthesised for federations, so unlike the club
    # pipeline there is no separate evidence list to maintain.
    sources = []
    pdoc = _fetch_primary_site(primary) if primary else None
    if pdoc: sources.append(pdoc)
    wiki = _wiki_summary(naziv)
    if wiki: sources.append(wiki)
    spz = _sport_pgz_search(naziv)
    if spz: sources.append(spz)

    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)

    proposed: dict[str, Any] = {}
    if not row.get('web'):
        u = _find_official_web(relevant_blob, naziv)
        if u: proposed['web'] = u
    if not row.get('email'):
        e = _find_email(relevant_blob)
        if e: proposed['email'] = e
    if not row.get('telefon'):
        t = _find_phone(relevant_blob)
        if t: proposed['telefon'] = t
    return {'proposed': proposed, 'sources': sources}


# ─── HNS Semafor parsing ────────────────────────────────────────────────
# Base URL of the HNS Semafor site; player URLs are built as <base>/igraci/<id>/<slug>/.
_HNS_BASE = 'https://semafor.hns.family'

def _slugify(name: str) -> str:
    import unicodedata
    s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower()
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')

def _hns_url_from_row(row: dict) -> Optional[str]:
    """Try to build a semafor.hns.family /igraci/ URL for this row.

    Resolution order:
      1. ``profile_url`` / ``source_url`` already pointing at an /igraci/ page;
      2. the ``hns_igrac_id`` column;
      3. ``vanjski_id`` dict keys ``hns_comet`` / ``hns_pid`` (with optional ``hns_slug``);
      4. ``source_id`` when ``source`` starts with ``hns_``.

    An id that cannot be converted to an integer is skipped and the next
    strategy is tried. Returns None when no strategy produces a URL.
    """
    # 1) Already-set columns
    for k in ('profile_url', 'source_url'):
        u = row.get(k)
        if u and 'semafor.hns.family/igraci/' in u:
            return u

    # Name-based slug shared by every id-based strategy below.
    name_slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())

    def _player_url(player_id, slug) -> Optional[str]:
        # Guard the int() conversion uniformly so a malformed id in any column
        # falls through to the next strategy instead of raising.
        try:
            return f'{_HNS_BASE}/igraci/{int(player_id)}/{slug}/'
        except (TypeError, ValueError, OverflowError):
            return None

    # 2) hns_igrac_id column
    pid = row.get('hns_igrac_id')
    if pid:
        url = _player_url(pid, name_slug)
        if url:
            return url

    # 3) vanjski_id JSONB → hns_comet
    vid = row.get('vanjski_id') or {}
    if isinstance(vid, dict):
        comet = vid.get('hns_comet') or vid.get('hns_pid')
        if comet:
            url = _player_url(comet, vid.get('hns_slug') or name_slug)
            if url:
                return url

    # 4) source='hns_semafor' + source_id
    if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
        url = _player_url(row['source_id'], name_slug)
        if url:
            return url
    return None


def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
    """Extract structured fields from a semafor.hns.family player page."""
    if not html_doc: return None
    try:
        from bs4 import BeautifulSoup
    except Exception:
        return _parse_hns_player_regex(html_doc, url)
    soup = BeautifulSoup(html_doc, 'html.parser')
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}

    # hns_igrac_id from URL
    m = re.search(r'/igraci/(\d+)/', url)
    if m: out['hns_igrac_id'] = int(m.group(1))

    title = soup.find('title')
    if title: out['title'] = title.get_text(strip=True)[:300]

    # Photo
    photo = soup.find('div', class_='photo')
    if photo:
        img = photo.find('img')
        if img and img.get('src'):
            src = img['src']
            if not src.startswith('http'):
                src = urllib.parse.urljoin(url, src)
            out['slika_url'] = src

    # Player number (jersey)
    pn = soup.find('div', class_='playerName')
    if pn:
        h3 = pn.find('h3')
        if h3:
            t = h3.get_text(strip=True)
            if t.isdigit():
                out['broj_dresa'] = int(t)

    # Datum rodjenja
    li = soup.find('li', class_='dob')
    if li:
        h4 = li.find('h4')
        if h4:
            t = h4.get_text(' ', strip=True)
            mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
            if mm:
                from datetime import date as _date
                try:
                    out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat()
                except Exception:
                    pass

    # Mjesto rodjenja
    li = soup.find('li', class_='pob')
    if li:
        h4 = li.find('h4')
        if h4:
            out['mjesto_rodenja'] = h4.get_text(strip=True)

    # Trenutni klub (info only — we don't reassign klub_id from here)
    klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if klub_link:
        h4 = klub_link.find('h4')
        if h4:
            out['trenutni_klub'] = h4.get_text(strip=True)
        m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '')
        if m: out['hns_klub_id'] = int(m.group(1))

    # Description (meta)
    meta_d = soup.find('meta', attrs={'name': 'description'})
    if meta_d and meta_d.get('content'):
        out['description'] = meta_d['content'][:600]

    # Make a clean text blob for relevance / DeepSeek
    text = soup.get_text(' ', strip=True)
    out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000]
    out['extract'] = (out.get('description')
                      or (out['raw_text'][:500] if out.get('raw_text') else None))
    return out


def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
    """BS4-free fallback parser."""
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
    m = re.search(r'/igraci/(\d+)/', url)
    if m: out['hns_igrac_id'] = int(m.group(1))
    m = re.search(r'<div class="photo"><img src="([^"]+)"', html_doc)
    if m:
        src = m.group(1)
        if not src.startswith('http'): src = urllib.parse.urljoin(url, src)
        out['slika_url'] = src
    m = re.search(r'<li class="dob">.*?<h4>(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S)
    if m:
        from datetime import date as _date
        try:
            out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
        except Exception:
            pass
    m = re.search(r'<li class="pob"><i></i><h4>([^<]+)</h4>', html_doc)
    if m: out['mjesto_rodenja'] = m.group(1).strip()
    m = re.search(r'<div class="playerName"><h3>(\d+)</h3>', html_doc)
    if m: out['broj_dresa'] = int(m.group(1))
    m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html_doc)
    if m: out['description'] = m.group(1)[:600]
    return out


def _hns_fetch_player(url: str) -> Optional[dict]:
    """Download and parse an HNS Semafor player page; None when no HTML could be obtained.

    When the plain fetch fails and the Playwright helper is available, the
    page is rendered once and, if that reports more than 2000 characters of
    HTML, the plain fetch is retried with a longer timeout (the rendered
    HTML itself is not returned by the helper).
    """
    body = _http_get(url, timeout=8)
    if not body and _HAS_PW and _pw_scraper is not None:
        rendered = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
        if rendered and rendered.get('html_len', 0) > 2000:
            # The parser needs HTML, so retry plain HTTP with a longer timeout.
            body = _http_get(url, timeout=15)
    if not body:
        return None
    return _parse_hns_player(body, url)


def _propose_for_sportas(row: dict) -> dict:
    """Collect sources for an athlete row and propose values for its empty fields.

    Structured fields come from the athlete's HNS Semafor page when a URL
    can be resolved for the row. ``biografija`` is proposed only when empty:
    DeepSeek synthesis over all gathered evidence, otherwise the first
    source extract of at least 80 characters.
    Returns ``{'proposed': {...}, 'sources': [...]}``.
    """
    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    sources, evidence = [], []
    proposed: dict[str, Any] = {}

    # 1) HNS Semafor page (URL resolved from columns / vanjski_id / source_id)
    hns_url = _hns_url_from_row(row)
    hns_doc: Optional[dict] = _hns_fetch_player(hns_url) if hns_url else None
    if hns_doc:
        sources.append(hns_doc)
        evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')

        # Field-level proposals, only for columns that are empty in the DB.
        page_url = hns_doc.get('url')
        for col in ('profile_url', 'source_url'):
            if not row.get(col) and page_url:
                proposed[col] = page_url
        for col in ('slika_url', 'hns_igrac_id', 'datum_rodenja',
                    'mjesto_rodenja', 'broj_dresa'):
            if not row.get(col) and hns_doc.get(col):
                proposed[col] = hns_doc[col]

    # 2) Biography: Wikipedia HR as extra evidence, then DeepSeek or a long snippet
    if not row.get('biografija'):
        wiki = _wiki_summary(naziv)
        if wiki:
            sources.append(wiki)
            evidence.append(wiki.get('extract') or '')

        descr = _deepseek_describe(naziv, 'sportaš', evidence) if evidence else None
        if not descr:
            for src in sources:
                snippet = src.get('extract')
                if snippet and len(snippet) >= 80:
                    descr = snippet
                    break
        if descr:
            proposed['biografija'] = descr.strip()[:2000]

    return {'proposed': proposed, 'sources': sources}


# ─── Endpoints ──────────────────────────────────────────────────────────
@router.post("/enrich/{kind}/{eid}")
def enrich_preview(kind: str, eid: int):
    """Preview enrichment for one row without writing anything.

    Loads the row, runs the kind-specific proposal pipeline and returns the
    coverage figures, the proposed values next to the current ones, the
    sources used, manual research links and the URL of the apply endpoint.
    Unknown kinds / missing rows are rejected by ``_load_row``.
    """
    row = _load_row(kind, eid)

    # Run the pipeline and pick the columns that count towards coverage.
    if kind == 'klub':
        res = _propose_for_klub(row)
        keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon',
                'sjediste','godina_osnutka','ciljevi','opis_djelatnosti']
    elif kind == 'savez':
        res = _propose_for_savez(row)
        keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka']
    else:
        res = _propose_for_sportas(row)
        keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija',
                'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg',
                'dominantna_noga','oib']

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None

    # First non-empty URL column is used for the live title/description snippet.
    primary = None
    for col in ('web', 'web_stranica', 'source_url', 'scrape_url', 'profile_url'):
        primary = row.get(col)
        if primary:
            break

    missing = [k for k in keys if not row.get(k)]
    filled = len(keys) - len(missing)
    coverage = round(filled / len(keys) * 100)

    proposed = res['proposed']
    current = {}
    for field in proposed:
        current[field] = row.get(field)

    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}

    return {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad),
        'sources': res['sources'],
        'current':  current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
    }


# kind → (fully-qualified table name, set of columns the apply step may write).
# Any field outside the set is ignored by _apply_to_db.
# NOTE(review): 'adresa' is allowed for 'klub', but the klubovi SELECT in
# _load_row does not read such a column — confirm it exists on pgz_sport.klubovi.
_TABLE_MAP = {
    'klub':    ('pgz_sport.klubovi',
                {'web','email','telefon','predsjednik','tajnik',
                 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
    'savez':   ('pgz_sport.savezi',
                {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
    'sportas': ('pgz_sport.clanovi',
                {'biografija','profile_url','source_url','slika_url','hns_igrac_id',
                 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
                 'tezina_kg','dominantna_noga','oib'}),
}


def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Write proposed values into the row and record the enrichment.

    Args:
        kind: ``klub`` | ``savez`` | ``sportas`` (key of ``_TABLE_MAP``).
        eid: primary key of the row to update.
        fields: column → value candidates. A value is applied only when the
            column is whitelisted for the kind, the value is non-empty and
            the row's current value is empty (existing data is never
            overwritten).
        sources: source dicts (``source`` / ``url`` keys) used for provenance.
            Entries that are not dicts are ignored.
        user_email: operator recorded in the history and the log, if known.

    Returns:
        ``{'applied': {...}, 'after': {...}}`` where ``after`` is a snapshot
        of a fixed set of columns of the updated row.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when the row is missing.

    The row metadata (``enriched_at``, ``enrichment_source``, last 10
    ``enrichment_history`` entries) is updated and a row is inserted into
    ``pgz_sport.enrichment_log`` even when no field ends up being applied.
    """
    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, allowed = _TABLE_MAP[kind]

    # `sources` may come straight from the request body: keep only dict entries
    # so a malformed payload cannot crash the provenance bookkeeping.
    src_list = [s for s in sources if isinstance(s, dict)] if isinstance(sources, list) else []
    source_names = [str(s['source']) for s in src_list if s.get('source')]
    source_urls = [s.get('url') for s in src_list if s.get('url')]
    field_items = list(fields.items()) if isinstance(fields, dict) else []

    conn = _db()
    try:
        # Lock + update + log must be one transaction. Older psycopg2 versions
        # do not open a transaction for `with connection` on an autocommit
        # connection, so autocommit is switched off explicitly here.
        conn.autocommit = False
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
            before = cur.fetchone()
            if not before: raise HTTPException(404, kind + " not found")
            before = dict(before)

            sets, params, applied = [], [], {}
            for k, v in field_items:
                if k not in allowed: continue  # whitelist also keeps column names SQL-safe
                if v is None or str(v).strip() == '': continue
                if before.get(k):
                    continue  # never overwrite existing
                sets.append(f"{k} = %s")
                params.append(v); applied[k] = v

            meta_in = before.get('metadata') or {}
            if not isinstance(meta_in, dict): meta_in = {}
            now_iso = datetime.now(timezone.utc).isoformat()
            meta_in['enriched_at'] = now_iso
            meta_in['enrichment_source'] = source_names
            history = meta_in.get('enrichment_history') or []
            if not isinstance(history, list): history = []
            history.append({
                'at': now_iso,
                'fields': list(applied.keys()),
                'sources': source_names,
                'urls':    source_urls,
                'user':    user_email,
            })
            meta_in['enrichment_history'] = history[-10:]  # keep the 10 most recent runs
            sets.append("metadata = %s::jsonb")
            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))

            params.append(eid)
            cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
            after = dict(cur.fetchone())

            snapshot_keys = list(applied.keys()) + ['metadata']
            cur.execute(
                """INSERT INTO pgz_sport.enrichment_log
                     (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
                (kind, eid,
                 ','.join(source_names)[:120] if source_names else None,
                 (src_list[0].get('url') if src_list else None),
                 list(applied.keys()) or None,
                 json.dumps({k: before.get(k) for k in snapshot_keys},
                            ensure_ascii=False, default=str),
                 json.dumps({k: after.get(k) for k in snapshot_keys},
                            ensure_ascii=False, default=str),
                 user_email))
    finally:
        # `with conn` only ends the transaction; close explicitly to avoid a leak.
        conn.close()

    snap_keys = ('id','naziv','ime','prezime','web','email','telefon',
                 'opis_djelatnosti','biografija','metadata')
    return {'applied': applied,
            'after':  {k: after.get(k) for k in snap_keys if k in after}}


@router.post("/enrich/{kind}/{eid}/apply")
def enrich_apply(kind: str, eid: int,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None),
                 x_user_id:    Optional[int] = Header(default=None)):
    """Apply enrichment to one row.

    Body shapes:
      None / {}          → re-run the preview pipeline and apply every proposed field
      {"fields": {...}}  → apply only those fields (``sources`` may accompany them)

    The whitelist / emptiness / no-overwrite rules are enforced by
    ``_apply_to_db``. When at least one field was applied, an audit event is
    written on a best-effort basis: a failure there is logged and never
    fails the request.

    Raises:
        HTTPException: 400 when ``fields`` is not an object or ``sources`` is
            not a list, plus whatever ``_load_row`` / ``_apply_to_db`` raise.
    """
    body = body or {}
    fields  = body.get('fields')
    sources = body.get('sources')
    # Reject malformed payloads up front instead of failing deep in the DB code.
    if fields and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if sources and not isinstance(sources, list):
        raise HTTPException(400, "sources must be a list")
    if not fields:
        row = _load_row(kind, eid)
        if   kind == 'klub':    res = _propose_for_klub(row)
        elif kind == 'savez':   res = _propose_for_savez(row)
        else:                   res = _propose_for_sportas(row)
        fields  = res['proposed']
        sources = res['sources']
    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
    if out.get('applied'):
        try:
            from audit_seal_router import audit_log as _audit_log
            _audit_log(
                action='enrich.apply',
                target_type=kind,
                target_id=eid,
                payload={'applied': out.get('applied'),
                         'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]},
                user_id=x_user_id,
                user_email=x_user_email,
            )
        except Exception:
            # Best-effort: the enrichment itself is already committed.
            import logging
            logging.getLogger(__name__).warning(
                "enrich.apply audit log failed for %s/%s", kind, eid, exc_info=True)
    return {'kind': kind, 'id': eid, **out}


@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """
    Return recent rows from pgz_sport.enrichment_log, newest first.

    kind / target_id  optional equality filters; falsy values (including
                      target_id=0) mean "no filter"
    limit             page size, clamped to 1..200 (0 / missing → 50)

    Returns ``{'count': int, 'rows': [...]}`` with `created_at` rendered as
    an ISO-8601 string.
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id:
        where.append("target_id=%s")
        params.append(target_id)

    where_sql = ("WHERE " + " AND ".join(where) + " ") if where else ""
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + where_sql
           + "ORDER BY id DESC LIMIT %s")
    # Clamp on both sides: a negative LIMIT is rejected by PostgreSQL and
    # would otherwise surface as a 500.
    params.append(max(1, min(int(limit or 50), 200)))

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(sql, params)
        rows = [dict(r) for r in cur.fetchall()]

    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}


# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete suggestions for the Mreža search inputs.

    q      search text; fewer than 2 characters (after trimming) yields no results
    type   filter ∈ {person, club, company, ''} — empty means all
    limit  per-query row cap, clamped to 1..50

    Returns: {query, results: [{id, label, type, sub}]}

    Notes:
      * `q` is matched as a literal substring (case-insensitive); LIKE
        wildcards typed by the user are escaped.
      * The `club` filter also searches savezi; those rows carry
        type='savez' in the result.
      * Results are concatenated in query order (klubovi, savezi, članovi,
        civic persons, civic entities) and then cut to 2×limit.
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))

    # Escape LIKE metacharacters so '%' / '_' in user input match literally
    # (backslash is PostgreSQL's default LIKE escape character).
    needle = q.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    args = ('%' + needle + '%', limit)

    out = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        if type in ('', 'club'):
            cur.execute("""
              SELECT id, naziv AS label, sport, grad
              FROM pgz_sport.klubovi
              WHERE naziv ILIKE %s AND aktivan=TRUE
              ORDER BY length(naziv), naziv LIMIT %s
            """, args)
            for r in cur.fetchall():
                out.append({'id': 'klub:' + str(r['id']),
                            'label': r['label'],
                            'type': 'club',
                            'sub': (r.get('sport') or '') + ' · ' + (r.get('grad') or '')})
            cur.execute("""
              SELECT id, naziv AS label, sport
              FROM pgz_sport.savezi
              WHERE naziv ILIKE %s AND aktivan=TRUE
              ORDER BY length(naziv), naziv LIMIT %s
            """, args)
            for r in cur.fetchall():
                out.append({'id': 'savez:' + str(r['id']),
                            'label': r['label'],
                            'type': 'savez',
                            'sub': r.get('sport') or 'savez'})
        if type in ('', 'person'):
            cur.execute("""
              SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
              FROM pgz_sport.clanovi c
              LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
              WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
              ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
              LIMIT %s
            """, args)
            for r in cur.fetchall():
                klub = r.get('klub_naziv')
                out.append({'id': 'sportas:' + str(r['id']),
                            'label': (r.get('ime') or '') + ' ' + (r.get('prezime') or ''),
                            'type': 'person',
                            'sub': (r.get('sport') or 'sportaš') + (' · ' + klub if klub else '')})
            cur.execute("""
              SELECT id, name AS label, function, oib, county
              FROM civic.persons
              WHERE name ILIKE %s
              ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """, args)
            for r in cur.fetchall():
                out.append({'id': 'civic_person:' + str(r['id']),
                            'label': r['label'],
                            'type': 'person',
                            'sub': (r.get('function') or 'civic') + ' · ' + (r.get('county') or '')})
        if type in ('', 'company'):
            cur.execute("""
              SELECT id, name AS label, oib, city, entity_type
              FROM civic.entities
              WHERE name ILIKE %s
              ORDER BY length(name) LIMIT %s
            """, args)
            for r in cur.fetchall():
                out.append({'id': 'civic_entity:' + str(r['id']),
                            'label': r['label'],
                            'type': 'company',
                            'sub': (r.get('entity_type') or 'tvrtka') + ' · ' + (r.get('city') or '')})
    return {'query': q, 'results': out[:limit*2]}


# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
def _forensic_name_candidates(finding: dict) -> list:
    """Collect stripped, de-duplicated person-name candidates from a finding row."""
    raw = []
    ei = finding.get('entities_involved')
    if isinstance(ei, dict):
        for k in ('person', 'name', 'osoba', 'PEP', 'pep'):
            if ei.get(k):
                raw.append(ei[k])
        # Also try persons: [...] list
        persons = ei.get('persons') or ei.get('osobe') or []
        if isinstance(persons, str):
            # A bare string is one name, not a sequence of characters.
            persons = [persons]
        if isinstance(persons, (list, tuple)):
            for p in persons:
                if isinstance(p, dict) and p.get('name'):
                    raw.append(p['name'])
                elif isinstance(p, str):
                    raw.append(p)
    elif isinstance(ei, list):
        for it in ei:
            if isinstance(it, dict):
                for k in ('name', 'person', 'label'):
                    if it.get(k):
                        raw.append(it[k])
                        break
            elif isinstance(it, str):
                raw.append(it)

    # Normalise to non-empty strings and drop case-insensitive duplicates so
    # the same name is never looked up twice.
    seen = set()
    candidates = []
    for value in raw:
        text = str(value).strip()
        key = text.casefold()
        if text and key not in seen:
            seen.add(key)
            candidates.append(text)

    title = finding.get('title')
    if not candidates and isinstance(title, str) and title:
        # Heuristic: extract first capitalised "Ime Prezime" pair
        m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', title)
        if m:
            candidates.append(m.group(0))
    return candidates


@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Enrich one forensic finding with an encyclopedia summary and persist it.

    Steps:
      1. Load the finding; 404 if it does not exist.
      2. Derive person-name candidates from `entities_involved`, falling
         back to the first capitalised "Ime Prezime" pair in the title.
      3. Try `_wiki_summary` on the first three candidates and keep the
         first hit.
      4. Store the payload under raw_data['enrichment'] (a non-dict
         raw_data is preserved under '_legacy'), and fill `ai_analysis`
         with the summary extract only when it is currently NULL.

    Returns ``{finding_id, queried, used_query, wiki, persisted}`` where
    `persisted` tells whether the UPDATE touched a row.
    """
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
          SELECT id, finding_type, severity, title, description, entities_involved,
                 raw_data, ai_analysis
          FROM civic.forensic_findings WHERE id=%s
        """, (finding_id,))
        row = cur.fetchone()
        if not row:
            raise HTTPException(404, "finding not found")
        f = dict(row)

        candidates = _forensic_name_candidates(f)

        # NOTE(review): these lookups run while the DB session opened above
        # is still in use; if _wiki_summary does network I/O, consider moving
        # it outside the `with` block — confirm against _db()'s semantics.
        wiki = None
        used_query = None
        for q in candidates[:3]:
            wiki = _wiki_summary(q)
            if wiki:
                used_query = q
                break

        # Build enrichment payload
        enrichment = {
            'queried': candidates[:5],
            'used_query': used_query,
            'wiki': wiki,
            'enriched_at': datetime.now(timezone.utc).isoformat(),
        }

        # Persist into raw_data.enrichment
        raw = f.get('raw_data')
        if raw is None:
            raw = {}
        if not isinstance(raw, dict):
            raw = {'_legacy': raw}
        raw['enrichment'] = enrichment

        # assumes a truthy _wiki_summary result is a dict with an optional
        # 'extract' key — TODO confirm against the helper
        cur.execute("""
          UPDATE civic.forensic_findings
          SET raw_data = %s::jsonb,
              ai_analysis = COALESCE(ai_analysis, %s)
          WHERE id = %s
        """, (json.dumps(raw, default=str, ensure_ascii=False),
              (wiki or {}).get('extract'),
              finding_id))
        # The row may have vanished between SELECT and UPDATE.
        persisted = cur.rowcount > 0
        c.commit()

    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': persisted,
    }


# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
def _person_risk_score(person: dict) -> int:
    """Score one scanned person 0..100 from function, party, links and findings."""
    links = person.get('links') or []
    findings = person.get('findings') or []
    score = 0
    if (person.get('function') or '').strip():
        score += 30
    if (person.get('party') or '').strip():
        score += 15
    score += min(40, len(links) * 5)
    score += min(40, len(findings) * 10)
    # Severe findings weigh in on top of the capped per-finding score.
    score += 20 * sum(1 for f in findings if f.get('severity') in ('CRITICAL', 'HIGH'))
    return min(100, score)


@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """
    Search civic.persons by name and return entity links, findings and a
    risk score per matched person.

    Body: {"name": "<at least 3 characters>"}

    For each of up to 25 matches: entity links and findings are looked up
    by OIB when one is present; if that yields no findings, findings are
    searched by the person's name in title/description instead.

    Returns ``{query, matched_persons, overall_risk_score, total_links,
    total_findings, critical_findings, persons, scanned_at}``.

    Raises HTTPException(400) when `name` is not a string or is shorter
    than 3 characters after trimming.
    """
    raw_name = req.get('name')
    if raw_name is not None and not isinstance(raw_name, str):
        raise HTTPException(400, "name must be a string")
    name = (raw_name or '').strip()
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")

    def _contains(text) -> str:
        # Substring pattern with LIKE metacharacters escaped (backslash is
        # PostgreSQL's default LIKE escape character); str() guards against
        # non-text column values such as a numeric OIB.
        escaped = str(text).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        return '%' + escaped + '%'

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
          SELECT id, name, function, party, county, city, oib, trust_tier
          FROM civic.persons
          WHERE upper(name) ILIKE upper(%s)
          ORDER BY oib NULLS LAST, id LIMIT 25
        """, (_contains(name),))
        persons = [dict(r) for r in cur.fetchall()]
        for p in persons:
            p['links'] = []
            p['findings'] = []
            if p.get('oib'):
                cur.execute("""
                  SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                         e.entity_type, e.city, e.risk_score
                  FROM civic.person_entity_links pel
                  LEFT JOIN civic.entities e ON e.id = pel.entity_id
                  WHERE pel.person_oib = %s LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]
                cur.execute("""
                  SELECT id, finding_type, severity, title, severity_score, created_at
                  FROM civic.forensic_findings
                  WHERE entities_involved::text ILIKE %s
                  ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (_contains(p['oib']),))
                p['findings'] = [dict(r) for r in cur.fetchall()]
            if not p['findings']:
                # No OIB, or nothing referenced it: fall back to a name match.
                by_name = _contains(p['name'])
                cur.execute("""
                  SELECT id, finding_type, severity, title, severity_score, created_at
                  FROM civic.forensic_findings
                  WHERE title ILIKE %s OR description ILIKE %s
                  ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (by_name, by_name))
                p['findings'] = [dict(r) for r in cur.fetchall()]

    total_links = total_findings = crit_findings = 0
    for p in persons:
        findings = p.get('findings') or []
        total_links += len(p.get('links') or [])
        total_findings += len(findings)
        crit_findings += sum(1 for f in findings if f.get('severity') in ('CRITICAL', 'HIGH'))
        p['risk_score'] = _person_risk_score(p)

    overall = max((p.get('risk_score', 0) for p in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall, 'total_links': total_links,
            'total_findings': total_findings, 'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}