# pgz-sport/routers/enrich_router.py

"""
enrich_router.py — v3 enrichment + forensic scan
Author: dradulic@outlook.com / damir@rinet.one
Date:   2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)

POST /v2/enrich/{kind}/{eid}
    Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
    primary club URL if any), regex-extract candidate fields (web/email/
    telefon), optionally synthesise descriptions via DeepSeek, and return
    a *preview* shape with `proposed` updates the operator can apply.

POST /v2/enrich/{kind}/{eid}/apply
    Body shapes:
      None / {}                  → re-run preview, apply every proposed field
      {"fields": {...}}          → apply ONLY those (whitelist + emptiness still enforced)
    Performs UPDATE on the matching table, sets metadata.enriched_at and
    metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
    returns the after snapshot.

GET  /v2/enrich/log?kind=&target_id=&limit=
    Read recent enrichment-log entries.

POST /v2/forensic/scan
    Search civic.persons by name, return entity links + findings + risk score.

Kinds: klub | savez | sportas
"""
from __future__ import annotations
import os, re, json, time, html, urllib.parse, urllib.request
from datetime import datetime, timezone
from typing import Any, Optional

import psycopg2, psycopg2.extras
from fastapi import APIRouter, HTTPException, Header, Body

# Router object exported by this module.
router = APIRouter()

# PostgreSQL connection settings, read once at import time from the environment.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))
# A loopback PG_HOST is not used as-is: host and port are re-read from
# DB_HOST / DB_PORT instead (same fallback values as above).
if _pgh in ('localhost', '127.0.0.1'):
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))
# Keyword arguments passed verbatim to psycopg2.connect() by _db().
# NOTE(review): the PG_PASS fallback below is a credential committed to
# source control. It should be rotated and the default removed so that a
# missing PG_PASS fails loudly — confirm with ops before changing.
DB = dict(host=_pgh, port=_pgp,
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))

# User-Agent header sent by _http_get().
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
# Default per-request timeout used by _http_get().
TIMEOUT = 6  # seconds — fail-soft

# ---- Sport-aware source selection (Damir 2026-05-10, task 06) ----
# Generic enrichment was only fitted to football. Other sports need their
# own federation roster + terminology. This map drives the per-sport source
# list used by enrich pipelines.
SPORT_FEDERATION_MAP: "dict[str, list[str]]" = {
    "nogomet":    ["hns_semafor", "hns_cff", "sport_pgz"],
    "košarka":    ["hks_cbf", "kosarkapgz", "sport_pgz"],
    "rukomet":    ["hrs", "sport_pgz"],
    "vaterpolo":  ["hvs", "sport_pgz"],
    "atletika":   ["has", "sport_pgz"],
    "plivanje":   ["hps", "sport_pgz"],
    "hokej":      ["hsh", "sport_pgz"],
    "šah":        ["hss_chess", "sport_pgz"],
    "odbojka":    ["hos", "sport_pgz"],
    "tenis":      ["hts", "sport_pgz"],
    "biciklizam": ["hbs", "sport_pgz"],
}

# Per-sport field schemas - used by callers of fields_for_sport() and
# by the frontend (data-sport-fields="...") to render the right fields.
SPORT_FIELD_SCHEMA: "dict[str, list[str]]" = {
    "nogomet":    ["position", "dominant_foot", "height_cm", "jersey_no", "club"],
    "košarka":    ["position", "height_cm", "weight_kg", "hand", "stats"],
    "rukomet":    ["position", "hand", "stats"],
    "atletika":   ["discipline", "category", "personal_records"],
    "šah":        ["elo_rating", "fide_id", "title"],
    "vaterpolo":  ["position", "hand", "stats"],
    "plivanje":   ["discipline", "personal_records"],
    "hokej":      ["position", "hand", "jersey_no"],
    "odbojka":    ["position", "height_cm", "stats"],
    "tenis":      ["dominant_hand", "atp_wta_rank", "personal_records"],
    "biciklizam": ["discipline", "team", "category"],
}

_DEFAULT_SOURCES = ["sport_pgz", "google", "wikipedia"]
_DEFAULT_FIELDS  = ["club", "position", "stats"]


def _match_sport_key(sport, table):
    """Resolve a free-form sport string to a key of *table*, or None.

    The input is lower-cased and stripped, then matched exactly; failing
    that, the longest key that is a prefix of the input wins. Diacritics are
    not folded, so "kosarka" does NOT match the key "košarka". Empty or
    non-string input resolves to None.
    """
    if not sport or not isinstance(sport, str):
        return None
    s = sport.lower().strip()
    if s in table:
        return s
    prefixes = [key for key in table if s.startswith(key)]
    # max() keeps the first key on a tie, i.e. dict insertion order.
    return max(prefixes, key=len) if prefixes else None


def select_sources_for_sport(sport):
    """Return the prioritized source list for a given sport string.

    Match strategy: exact key, else longest-prefix key, else
    _DEFAULT_SOURCES (see _match_sport_key). A fresh list is returned on
    every call, so callers may mutate it freely.
    """
    key = _match_sport_key(sport, SPORT_FEDERATION_MAP)
    if key is None:
        return list(_DEFAULT_SOURCES)
    return list(SPORT_FEDERATION_MAP[key])


def fields_for_sport(sport):
    """Return the per-sport field list (UI hint + enrichment scope).

    Uses the same matching rules as select_sources_for_sport(), falling
    back to _DEFAULT_FIELDS. A fresh list is returned on every call.
    """
    key = _match_sport_key(sport, SPORT_FIELD_SCHEMA)
    if key is None:
        return list(_DEFAULT_FIELDS)
    return list(SPORT_FIELD_SCHEMA[key])
# ---- end sport-aware section ----


# Optional JS-aware fallback (Playwright). Lazy-loaded, never required.
import sys as _sys
# Put /opt/pgz-sport first on the import path (presumably where the
# `enrichment` package lives — confirm against the deployment layout).
_sys.path.insert(0, '/opt/pgz-sport')
try:
    from enrichment import playwright_scraper as _pw_scraper
    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
    # Any import problem simply disables the fallback; callers check both
    # _HAS_PW and _pw_scraper before using it.
    _pw_scraper = None
    _HAS_PW = False

# DeepSeek settings from the environment. DEEPSEEK_KEY is an empty string
# when DEEPSEEK_API_KEY is unset.
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                              'https://api.deepseek.com/v1/chat/completions')


# ─── DB helpers ──────────────────────────────────────────────────────────
def _db():
    """Open a new psycopg2 connection from the module DB settings, in autocommit mode."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn

def _fetch_one(sql, p):
    """Run *sql* with parameters *p* and return the first row as a dict.

    Returns None when the query yields no row. A fresh autocommit connection
    is opened for the call and always closed afterwards: psycopg2's
    ``with connection`` only ends the transaction, it does not close the
    connection, so the previous form leaked one connection per call.
    """
    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            row = cur.fetchone()
        return dict(row) if row else None
    finally:
        conn.close()


# ─── HTTP helpers ────────────────────────────────────────────────────────
def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch *url* and return at most the first 150 kB of the body as text.

    Fail-soft: returns None for an empty or non-http(s) URL and for any
    network/HTTP error. The body is decoded as UTF-8; if that fails it is
    decoded as latin-1 instead.
    """
    max_bytes = 150000
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(max_bytes)
    except Exception:
        # Deliberate best-effort: callers treat None as "source unavailable".
        return None
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError as exc:
        # The read cap can cut a multi-byte UTF-8 character in half. In that
        # case only the dangling tail is invalid: drop it rather than
        # re-decoding the whole (valid UTF-8) page as latin-1.
        if len(data) >= max_bytes and exc.start >= len(data) - 3:
            return data[:exc.start].decode('utf-8')
        return data.decode('latin-1', 'ignore')


# ─── Redis cache + slow-enrichment telemetry (added 2026-05-10) ──────────
# Default TTL (seconds) used by _cache_set() when the caller passes none.
ENRICH_CACHE_TTL = int(os.environ.get('ENRICH_CACHE_TTL_SECONDS', '86400'))  # 24h default
# File that _enrich_slow_log() appends to.
ENRICH_SLOW_LOG  = '/var/log/enrich_slow.log'
# Duration (seconds) at or above which _enrich_slow_log() also sends a Telegram alert.
ENRICH_SLOW_TG_THRESHOLD = float(os.environ.get('ENRICH_SLOW_TG_THRESHOLD', '20.0'))
# Not read by _enrich_slow_log() itself; presumably the caller's cut-off for
# logging a request at all — confirm at the call site.
ENRICH_SLOW_LOG_THRESHOLD = float(os.environ.get('ENRICH_SLOW_LOG_THRESHOLD', '10.0'))

_enrich_redis_singleton = None
def _enrich_redis():
    """Singleton Redis connection for enrichment cache. Returns None on failure (fail-soft).

    Connection settings come from REDIS_HOST / REDIS_PORT and REDIS_PASS (or
    REDIS_PASSWORD, with surrounding quotes stripped). The configured
    password is tried first, then an anonymous connection. Only a client
    that answered PING is memoised; failures are retried on the next call.

    Renamed from _redis() to avoid collision with an existing
    _redis_client() function (per the original note; not visible here).
    """
    global _enrich_redis_singleton
    if _enrich_redis_singleton is not None:
        return _enrich_redis_singleton
    try:
        import redis
    except Exception:
        return None
    host = os.environ.get('REDIS_HOST', 'localhost')
    port = int(os.environ.get('REDIS_PORT', '6379'))
    pwd  = (os.environ.get('REDIS_PASS') or os.environ.get('REDIS_PASSWORD') or '').strip().strip("'").strip('"') or None
    # Without a configured password there is only one thing to try; attempting
    # the anonymous connection twice just doubles the 2 s timeout on failure.
    attempts = (pwd, None) if pwd is not None else (None,)
    for p in attempts:
        try:
            r = redis.Redis(host=host, port=port, password=p,
                            decode_responses=True, socket_connect_timeout=2,
                            socket_timeout=2)
            r.ping()
        except Exception:
            continue
        _enrich_redis_singleton = r
        return r
    return None

def _cache_get(key: str):
    """Read *key* from the enrichment cache and JSON-decode it.

    Fail-soft: returns None when Redis is unavailable, the key is missing,
    or the stored value cannot be read or parsed.
    """
    client = _enrich_redis()
    if not client:
        return None
    try:
        raw = client.get(key)
        return None if raw is None else json.loads(raw)
    except Exception:
        return None

def _cache_set(key: str, value, ttl: int = ENRICH_CACHE_TTL):
    """Store *value* as JSON under *key* for *ttl* seconds.

    Fail-soft: returns True when the write succeeded, False when Redis is
    unavailable or serialisation/writing failed. Values that are not JSON
    native are stringified via ``default=str``.
    """
    client = _enrich_redis()
    if not client:
        return False
    try:
        payload = json.dumps(value, default=str, ensure_ascii=False)
        client.setex(key, ttl, payload)
    except Exception:
        return False
    return True

def _cache_delete(key: str):
    """Remove *key* from the enrichment cache; silently does nothing on any failure."""
    client = _enrich_redis()
    if client:
        try:
            client.delete(key)
        except Exception:
            pass

def _enrich_slow_log(kind: str, eid: int, duration_s: float, cached: bool, error: str = None):
    """Record one slow enrichment run.

    Appends a single line to ENRICH_SLOW_LOG (a write failure is reported on
    the "enrich" logger, never raised). When the duration reaches
    ENRICH_SLOW_TG_THRESHOLD and a bot token + chat id are present in the
    environment, a Telegram message is sent as well; any failure there is
    swallowed.
    """
    import logging as _lg
    line = (f"{datetime.now(timezone.utc).isoformat()} "
            f"kind={kind} id={eid} duration={duration_s:.2f}s cached={cached}"
            f"{' error=' + error if error else ''}\n")
    try:
        with open(ENRICH_SLOW_LOG, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        _lg.getLogger("enrich").warning(f"slow-log write failed: {e}")

    # Below the alert threshold there is nothing more to do.
    if not (duration_s >= ENRICH_SLOW_TG_THRESHOLD):
        return
    try:
        env = os.environ
        tok = env.get('TG_BOT_TOKEN') or env.get('TELEGRAM_BOT_TOKEN')
        chat = env.get('TG_CHAT_ID') or env.get('TELEGRAM_CHAT_ID')
        if not (tok and chat):
            return
        msg = f"\u26a0\ufe0f enrich slow: {kind}/{eid} took {duration_s:.1f}s (cached={cached})"
        form = urllib.parse.urlencode({'chat_id': chat, 'text': msg}).encode('utf-8')
        urllib.request.urlopen(
            f"https://api.telegram.org/bot{tok}/sendMessage",
            data=form,
            timeout=3
        ).read()
    except Exception:
        pass

def _strip_tags(s: str) -> str:
    if not s: return ''
    s = re.sub(r'<script[^>]*>.*?</script>', ' ', s, flags=re.S | re.I)
    s = re.sub(r'<style[^>]*>.*?</style>',   ' ', s, flags=re.S | re.I)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = html.unescape(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def _extract_meta(html_doc: str, url: str) -> dict:
    if not html_doc: return {}
    out = {'url': url, 'fetched_at': int(time.time())}
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m: out['title'] = html.unescape(m.group(1).strip())[:300]
    m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    if not m:
        m = re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    if m: out['description'] = html.unescape(m.group(1).strip())[:600]
    return out


def _fetch_title(url, timeout=5):
    """Fetch *url* and return its title/description metadata.

    On a failed fetch the result is {'url': url, 'error': 'fetch failed'},
    or None when *url* itself is empty.
    """
    body = _http_get(url, timeout=timeout)
    if body:
        return _extract_meta(body, url)
    if url:
        return {'url': url, 'error': 'fetch failed'}
    return None


# ─── Field extractors ───────────────────────────────────────────────────
RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
RE_PHONE = re.compile(r'(?:\+?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
RE_URL   = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)

def _find_email(text: str) -> Optional[str]:
    if not text: return None
    bad = ('@example.', '@test.', '@email.', 'wixpress.com',
           'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
    seen = set()
    for m in RE_EMAIL.finditer(text):
        e = m.group(0).lower().rstrip('.,;:)')
        if any(b in e for b in bad): continue
        if e in seen: continue
        seen.add(e); return e
    return None

def _find_phone(text: str) -> Optional[str]:
    if not text: return None
    for m in RE_PHONE.finditer(text):
        raw = m.group(0).strip()
        digits = re.sub(r'\D', '', raw)
        if not (8 <= len(digits) <= 13): continue
        cleaned = re.sub(r'\s+', ' ', raw).strip()
        if raw.startswith('+385'):  return '+385 ' + raw[4:].lstrip().lstrip('-/')
        if raw.startswith('00385'): return '+385 ' + raw[5:].lstrip().lstrip('-/')
        return cleaned
    return None

def _find_official_web(text: str, hint: str = '') -> Optional[str]:
    if not text: return None
    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
               'rinet.one', 'pgz.hr')
    candidates: list[str] = []
    for m in RE_URL.finditer(text):
        u = m.group(0).rstrip('.,;:)\'"')
        try:
            host = urllib.parse.urlparse(u).hostname or ''
        except Exception:
            continue
        if not host or any(b in host for b in blocked): continue
        candidates.append(u)
    if not candidates: return None
    if hint:
        slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8]
        for u in candidates:
            host = urllib.parse.urlparse(u).hostname or ''
            if slug and slug in host.replace('-', '').replace('.', ''):
                return u
    return candidates[0]


# ─── External sources ────────────────────────────────────────────────────
def _wiki_variants(query: str) -> list[str]:
    """Generate sensible Wikipedia HR title variants for a query.

    The summary REST API is title-exact; clubs are often listed under their
    abbreviation (KK X, NK X, RK X, OK X), so we try those variants too.
    """
    if not query: return []
    out, seen = [], set()
    raw = query.strip()
    def _push(v):
        if v and v not in seen: seen.add(v); out.append(v)
    _push(raw)
    # KK Kvarner 2010 from Košarkaški klub KVARNER 2010
    parts = raw.split()
    sport_to_abbr = {
        'košarkaški': 'KK', 'kosarkaski': 'KK',
        'nogometni':  'NK', 'rukometni':  'RK',
        'odbojkaški': 'OK', 'odbojkaski': 'OK',
        'vaterpolski':'VK', 'plivacki':   'PK', 'plivački': 'PK',
        'boćarski':   'BK', 'bocarski':   'BK',
    }
    if len(parts) >= 3 and parts[0].lower() in sport_to_abbr and parts[1].lower() == 'klub':
        _push(sport_to_abbr[parts[0].lower()] + ' ' + ' '.join(p.capitalize() if p.isupper() else p for p in parts[2:]))
    return out

def _wiki_summary(query: str) -> Optional[dict]:
    """Look up *query* on Wikipedia HR via the page-summary REST endpoint.

    Every title variant from _wiki_variants() is tried in order; the first
    one that returns a real article (not a disambiguation page, with a
    non-empty extract) is returned as a source document. Returns None when
    no variant matches.
    """
    endpoint = 'https://hr.wikipedia.org/api/rest_v1/page/summary/'
    for variant in _wiki_variants(query):
        slug = urllib.parse.quote(variant.replace(' ', '_'), safe='')
        body = _http_get(endpoint + slug, timeout=5)
        if not body:
            continue
        try:
            d = json.loads(body)
        except Exception:
            continue
        unusable = d.get('type') in ('disambiguation', 'no-extract')
        if unusable or not d.get('extract'):
            continue
        page_url = d.get('content_urls', {}).get('desktop', {}).get('page')
        return {
            'source': 'wikipedia.hr',
            'url': page_url,
            'title': d.get('title'),
            'extract': d.get('extract'),
            'description': d.get('description'),
            'matched_variant': variant,
        }
    return None


def _sport_pgz_search(query: str) -> Optional[dict]:
    """Search sport-pgz.hr for *query* and return the first hit as a source doc.

    The search page is fetched over plain HTTP and the first result link is
    followed. If the search page cannot be fetched or parsed, the optional
    Playwright scraper is used instead when it is available; otherwise the
    result is None. When the hit itself cannot be fetched, only its URL and
    link title are returned.
    """
    if not query:
        return None

    def _js_fallback():
        # JS-rendered fallback, only when the optional scraper was importable.
        if _HAS_PW and _pw_scraper is not None:
            return _pw_scraper.scrape_sport_pgz_klub(query)
        return None

    page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
    if not page:
        return _js_fallback()

    # Preferred: the bookmark link inside the first <article>; otherwise any
    # absolute sport-pgz.hr link with a reasonably long anchor text.
    link_patterns = (
        (r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
         re.S | re.I),
        (r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>',
         re.I),
    )
    m = None
    for pattern, flags in link_patterns:
        m = re.search(pattern, page, flags)
        if m:
            break
    if not m:
        return _js_fallback()

    hit = m.group(1)
    link_title = html.unescape(m.group(2).strip())
    body = _http_get(hit, timeout=6)
    if not body:
        return {'source': 'sport-pgz.hr', 'url': hit, 'title': link_title}
    text = _strip_tags(body)[:4000]
    meta = _extract_meta(body, hit)
    return {
        'source': 'sport-pgz.hr',
        'url': hit,
        'title': meta.get('title') or link_title,
        'extract': meta.get('description') or text[:500],
        'raw_text': text,
    }


# ─── kosarkapgz.com (Košarkaški savez PGŽ) ──────────────────────────────
# Added 2026-05-10. WordPress site with permalinks DISABLED — REST API is
# reachable only via `?rest_route=/...` (the `/wp-json/...` paths 404).
# Use as enrichment source for PGŽ basketball entities (savez/klub/sportaš).
# Site root without a trailing slash; the page and REST URLs below are built
# by appending a path or query string to it.
_KOSARKAPGZ_BASE = 'https://kosarkapgz.com'

def _kosarkapgz_is_basketball_pgz(naziv: str, sport: Optional[str]) -> bool:
    """Heuristic: PGŽ basketball entity worth scraping kosarkapgz.com for.

    The pgz_sport.klubovi/savezi tables are PGŽ-scoped by design
    (klubovi.is_pgz default true, savezi.region default 'PGŽ'), so we don't
    re-check region here — a basketball signal is enough.
    """
    def _fold(s: str) -> str:
        return (s or '').lower().replace('ć', 'c').replace('š', 's') \
                                .replace('ž', 'z').replace('č', 'c').replace('đ', 'd')
    n, sp = _fold(naziv), _fold(sport)
    if 'kosark' in sp: return True   # 'košarka' / 'košarkaški'
    if 'kosark' in n:  return True
    if re.match(r'^\s*kk\b', n): return True   # 'KK X' = Košarkaški klub X
    return False

def _kosarkapgz_savez_meta() -> Optional[dict]:
    """Fetch homepage + key WP pages from kosarkapgz.com. Cached 24h.

    Also runs structured extractors on /?page_id=234 (Upravni odbor) and
    /?page_id=70 (Kontakt), and passes the result to
    _kosarkapgz_persist_kspgz once per cache miss (only when something was
    extracted).

    Returns the source document, or None when the homepage is unreachable.
    An unreachable homepage is remembered for 15 minutes so that repeated
    calls do not each wait for the HTTP timeout.
    """
    cache_key = 'kosarkapgz:savez_meta:v1'
    cached = _cache_get(cache_key)
    # An empty dict is the "site was down" marker. It must not be None:
    # _cache_get cannot tell a stored JSON null from a cache miss, which is
    # why the earlier None marker never took effect.
    if cached is not None:
        return cached or None

    home = _http_get(_KOSARKAPGZ_BASE + '/', timeout=8)
    if not home:
        _cache_set(cache_key, {}, ttl=900)
        return None
    meta = _extract_meta(home, _KOSARKAPGZ_BASE + '/')

    page_ids = {'povijest': 232, 'upravni_odbor': 234, 'kontakt': 70, 'objave': 217}
    page_html: dict[str, str] = {}
    page_text: dict[str, str] = {}
    parts: list[str] = [meta.get('description', '') or '']
    for slug, pid in page_ids.items():
        body = _http_get(f'{_KOSARKAPGZ_BASE}/?page_id={pid}', timeout=8)
        if not body: continue
        page_html[slug] = body
        text = _strip_tags(body)
        if text:
            page_text[slug] = text[:5000]
            parts.append(f'[{slug}] ' + text[:5000])

    # Structured extraction + idempotent persistence.
    osobe  = _kosarkapgz_extract_osobe(page_html.get('upravni_odbor', '')) or []
    adresa = _kosarkapgz_extract_adresa(page_html.get('kontakt', '')) or {}
    persist = _kosarkapgz_persist_kspgz(osobe, adresa) if (osobe or adresa) else {}

    result = {
        'source': 'kosarkapgz.com',
        'url': _KOSARKAPGZ_BASE + '/',
        'title': meta.get('title') or 'Košarkaški savez PGŽ',
        'extract': (meta.get('description') or '')[:600],
        'pages': page_text,
        'structured': {
            'upravni_odbor': osobe,
            'adresa': adresa,
            'persist': persist,
        },
        'raw_text': ('\n\n'.join(p for p in parts if p))[:80000],
    }
    _cache_set(cache_key, result, ttl=86400)
    return result

def _kosarkapgz_search_posts(query: str, limit: int = 5) -> Optional[dict]:
    """WP REST search for posts mentioning `query`. Source-doc shape.

    *limit* is clamped to 1..20 and used both as the REST ``per_page`` value
    and as the maximum number of articles returned. Results are cached for
    24 h and empty/failed lookups for 1 h, keyed on the normalised query.
    Returns None for a query shorter than 3 characters or when nothing was
    found.
    """
    if not query: return None
    q = query.strip()
    if len(q) < 3: return None
    cache_key = 'kosarkapgz:search:' + re.sub(r'[^a-z0-9]+', '_', q.lower())[:60] + ':v1'
    cached = _cache_get(cache_key)
    if cached is not None:
        # An empty list is the cached "nothing found" marker.
        return cached or None

    def _miss():
        _cache_set(cache_key, [], ttl=3600)
        return None

    # One clamped value for both the request and the slice below, so that a
    # zero or negative limit cannot turn into an empty or tail-dropping slice.
    per_page = max(1, min(limit, 20))
    url = (f'{_KOSARKAPGZ_BASE}/?rest_route=/wp/v2/posts'
           f'&search={urllib.parse.quote(q)}&per_page={per_page}'
           f'&_fields=id,date,slug,title,excerpt,link,categories')
    body = _http_get(url, timeout=8)
    if not body or not body.lstrip().startswith('['):
        return _miss()
    try:
        posts = json.loads(body)
    except Exception:
        return _miss()
    if not posts:
        return _miss()

    articles = []
    for p in posts[:per_page]:
        title_html = ((p.get('title') or {}).get('rendered')) or ''
        excerpt_html = ((p.get('excerpt') or {}).get('rendered')) or ''
        # _strip_tags already decodes HTML entities; decoding a second time
        # would turn literal text such as "&amp;lt;" into "<".
        articles.append({
            'title':   _strip_tags(title_html)[:300],
            'excerpt': _strip_tags(excerpt_html)[:400],
            'date':    p.get('date'),
            'link':    p.get('link'),
        })
    blob_lines = [
        f"{(a.get('date') or '')[:10]} — {a.get('title','')}\n"
        f"{a.get('excerpt','')}\n{a.get('link','')}"
        for a in articles
    ]
    result = {
        'source': 'kosarkapgz.com',
        'url': _KOSARKAPGZ_BASE + '/',
        'title': f'kosarkapgz.com — {len(articles)} mention(s) for "{q[:60]}"',
        'extract': blob_lines[0] if blob_lines else '',
        'articles': articles,
        'raw_text': ('\n\n'.join(blob_lines))[:60000],
    }
    _cache_set(cache_key, result, ttl=86400)
    return result


# ─── kosarkapgz.com — structured extractors (page_id=234, page_id=70) ────
# Added 2026-05-10. Cheap, deterministic parsers for two specific WP pages
# whose layout is known:
#   ?page_id=234 → "Upravni odbor" — entry-content is a single <ul> of names,
#                 optionally suffixed with " – funkcija" (en/em dash).
#   ?page_id=70  → "Kontakt" — the OG description packs ulica/grad/email/
#                 tel/web in a single string ("Adresa Verdijeva 11/III51000
#                 RijekaHrvatska E-mail: ... Mob: ... Tel: ... http://...").
# Period of office is not published on either page, so persons.period is
# always None today; the field is kept so future scrapers (kszpg.hr archive)
# can fill it in without a schema change.

_KPGZ_DASH_RE  = re.compile(r'\s*[\u2013\u2014\-]\s*')   # en/em/ascii dash
_KPGZ_LI_RE    = re.compile(r'<li[^>]*>(.*?)</li>', re.I | re.S)
_KPGZ_TAG_RE   = re.compile(r'<[^>]+>')
_KPGZ_OG_DESC  = re.compile(r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)', re.I)
_KPGZ_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
_KPGZ_TEL_RE   = re.compile(r'(?i)Tel\s*[:\.\-]?\s*(\+?[\d\s().\-/]{6,})')
_KPGZ_MOB_RE   = re.compile(r'(?i)Mob\s*[:\.\-]?\s*(\+?[\d\s().\-/]{6,})')
_KPGZ_WEB_RE   = re.compile(r'https?://[\w.\-]+(?:/[\w.\-/?#=&%]*)?', re.I)
# "Verdijeva 11/III" → ulica="Verdijeva", broj="11/III"
_KPGZ_ULICA_RE = re.compile(r'(?i)Adresa\s+([A-ZČĆĐŠŽa-zčćđšž\s]+?)\s+([\dA-Za-z][\w./\-]*)')
# "51000 Rijeka"
_KPGZ_POSTAL_RE = re.compile(r'(\d{5})\s+([A-ZČĆĐŠŽa-zčćđšž][A-ZČĆĐŠŽa-zčćđšž\s\-]+?)(?=\s+Hrvatska|\s+E-mail|\s+Mob|\s+Tel|$)')

def _kosarkapgz_extract_osobe(html_doc: str) -> list[dict]:
    """Return [{'ime':, 'prezime':, 'funkcija':, 'period':}] for ?page_id=234."""
    if not html_doc:
        return []
    # Find the entry-content div by hand (depth-balanced) to avoid grabbing
    # JSON-LD / scripts. WordPress always wraps the body in <div class="entry-content">.
    i = html_doc.find('class="entry-content')
    if i < 0:
        return []
    j = html_doc.find('>', i) + 1
    depth, k = 1, j
    while k < len(html_doc) and depth > 0:
        no = html_doc.find('<div', k)
        nc = html_doc.find('</div>', k)
        if nc < 0:
            break
        if 0 <= no < nc:
            depth += 1; k = no + 4
        else:
            depth -= 1; k = nc + 6
    body = html_doc[j:k-6] if depth == 0 else html_doc[j:]
    out: list[dict] = []
    for raw in _KPGZ_LI_RE.findall(body):
        text = html.unescape(_KPGZ_TAG_RE.sub(' ', raw)).strip()
        text = re.sub(r'\s+', ' ', text)
        if not text:
            continue
        # Split on en/em/ASCII dash → "Name Surname" — "funkcija"
        parts = _KPGZ_DASH_RE.split(text, maxsplit=1)
        person = parts[0].strip()
        funkcija = parts[1].strip().lower() if len(parts) > 1 else None
        # Split person → first name(s) + last name on last whitespace.
        toks = person.split()
        if len(toks) < 2:
            continue
        ime = ' '.join(toks[:-1])
        prezime = toks[-1]
        out.append({
            'ime': ime, 'prezime': prezime,
            'funkcija': funkcija, 'period': None,
        })
    return out

def _kosarkapgz_extract_adresa(html_doc: str) -> dict:
    """Parse the contact block of /?page_id=70 into address fields.

    Possible keys: 'ulica', 'broj', 'postal_code', 'grad', 'email',
    'email_extra' (further addresses), 'telefon', 'mobitel', 'web'. Only
    fields that were actually found are present; an empty document gives {}.

    The og:description meta is the preferred input because it carries the
    whole contact block as one string; without it the tag-stripped page text
    is used.
    """
    if not html_doc:
        return {}
    og = _KPGZ_OG_DESC.search(html_doc)
    if og:
        text = html.unescape(og.group(1))
    else:
        text = _strip_tags(html_doc)
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return {}

    # The og:description glues fields together without whitespace
    # ("11/III51000 RijekaHrvatska E-mail: a@b.com, c@d.comMob: ...").
    # Re-insert separators, in this order, before running the field regexes.
    separators = (
        (r'(?<=\D)(?=\d{5}\s)', ' '),                    # before a 5-digit postal code
        (r'(?<=[a-zćčđšž])(?=[A-ZČĆĐŠŽ])', ' '),         # lower→upper case boundary
        (r'(?i)\s*(Hrvatska|E-?mail|Mob|Tel|IBAN|http)', r' \1'),  # before field labels
        (r'\s+', ' '),                                   # collapse what we added
    )
    for pattern, replacement in separators:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    out: dict = {}
    street = _KPGZ_ULICA_RE.search(text)
    if street:
        out['ulica'] = street.group(1).strip()
        out['broj']  = street.group(2).strip()
    postal = _KPGZ_POSTAL_RE.search(text)
    if postal:
        out['postal_code'] = postal.group(1)
        out['grad']        = postal.group(2).strip()
    # De-duplicate while keeping first-seen order.
    emails = list(dict.fromkeys(_KPGZ_EMAIL_RE.findall(text)))
    if emails:
        out['email'] = emails[0]
        if len(emails) > 1:
            out['email_extra'] = emails[1:]
    for field, number_re in (('telefon', _KPGZ_TEL_RE), ('mobitel', _KPGZ_MOB_RE)):
        found = number_re.search(text)
        if found:
            out[field] = re.sub(r'\s+', ' ', found.group(1)).strip()
    # First http(s) URL that is not a schema.org reference.
    for u in _KPGZ_WEB_RE.findall(text):
        if 'schema.org' in u:
            continue
        out['web'] = u.rstrip('.,')
        break
    return out

# Canonical name kept in civic.entities for the Košarkaški savez PGŽ.
# (civic.entities has rows for many county basketball savezi but NOT for the
# PGŽ one as of 2026-05-10 — we insert it on first persist call.)
_KPGZ_KSPGZ_NAME = 'Košarkaški savez Primorsko-goranske županije'
_KPGZ_KSPGZ_ALIASES = (
    _KPGZ_KSPGZ_NAME.lower(),
    'košarkaški savez pgž',
    'kosarkaski savez pgz',
    'košarkaški savez primorsko-goranske županije',
    'kosarkaski savez primorsko-goranske zupanije',
)

def _kosarkapgz_is_kspgz(naziv: str) -> bool:
    """True for the PGŽ-county savez specifically (not 'Grada Rijeke', not HKS)."""
    if not naziv: return False
    n = naziv.lower().strip()
    if any(a in n for a in _KPGZ_KSPGZ_ALIASES):
        return True
    # Permissive fallback: anything matching košarkaški savez + PGŽ token.
    return ('košarkaški savez' in n or 'kosarkaski savez' in n) and (
        'pgž' in n or 'pgz' in n or 'primorsko' in n
    )

def _kosarkapgz_persist_kspgz(osobe: list[dict], adresa: dict) -> dict:
    """Persist scraped KSPGŽ data: entity row, board members and savez row.

    Everything runs in one transaction on a connection from ``_db()``:

    1. Transaction-local ``provenance.*`` settings are set with ``SET LOCAL``
       (source 'kosarkapgz.com', trust tier '2', today's date).
       NOTE(review): presumably a provenance trigger reads these — confirm
       against the database definition.
    2. The civic.entities row named ``_KPGZ_KSPGZ_NAME`` is inserted, or, if
       it exists, only its empty columns are filled (existing values are
       never overwritten) and the ``sources`` array gains our marker.
    3. One civic.persons row per item of *osobe* is inserted or updated,
       keyed on (lower(name), entity_id, metadata->>'source').
    4. predsjednik / email / telefon / web are updated on the pgz_sport.savezi
       row whose naziv matches the KSPGŽ names. This step runs inside a
       SAVEPOINT: if it fails, the error is recorded and steps 2–3 still
       commit. (Without the savepoint a failed statement would leave the
       transaction aborted and the final COMMIT would silently discard
       everything.)

    Afterwards the 'kosarkapgz:savez_meta:v1' cache key is deleted
    (best effort), regardless of success.

    Args:
        osobe: dicts with 'ime', 'prezime' and optionally 'funkcija', 'period'.
        adresa: dict with optional 'ulica', 'broj', 'postal_code', 'grad',
            'email', 'telefon'.

    Returns:
        dict with 'entity_id', 'entity_action' ('noop' | 'inserted' |
        'updated'), 'persons_inserted', 'persons_updated', 'savez_updates'
        and 'errors'. The function does not raise; a top-level failure rolls
        the transaction back and is reported in 'errors' (the counters then
        reflect attempted, not committed, writes).
    """
    # # KSPGZ-SAVEZI-PATCH-20260510
    res = {
        'entity_id': None, 'entity_action': 'noop',
        'persons_inserted': 0, 'persons_updated': 0,
        'savez_updates': {},   # pgz_sport.savezi field changes
        'errors': [],
    }
    c = None  # bound up-front so except/finally can test it even if _db() fails
    try:
        c = _db()
        c.autocommit = False  # need a transaction for SET LOCAL
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # ── provenance attribution for this whole transaction ─────────
            cur.execute("SET LOCAL provenance.source = 'kosarkapgz.com'")
            cur.execute("SET LOCAL provenance.source_url = 'https://kosarkapgz.com/'")
            cur.execute("SET LOCAL provenance.edited_by = 'kosarkapgz_scraper@enrich_router'")
            cur.execute("SET LOCAL provenance.trust_tier = '2'")  # 1=visit, 2=verified site, 3=oss
            from datetime import date as _date
            cur.execute("SET LOCAL provenance.source_date = %s", (_date.today().isoformat(),))

            # Street + house number form the address only when both are known.
            addr_full = None
            if adresa.get('ulica') and adresa.get('broj'):
                addr_full = f"{adresa['ulica']} {adresa['broj']}"

            # ── entity upsert ─────────────────────────────────────────────
            cur.execute(
                "SELECT id, address, city, county, postal_code, phone, email, website "
                "FROM civic.entities WHERE lower(name) = lower(%s) LIMIT 1",
                [_KPGZ_KSPGZ_NAME],
            )
            row = cur.fetchone()
            if row:
                eid = row['id']
                # Patch any missing address-ish field. Never overwrite existing
                # non-null values — they may have been hand-curated.
                fill_if_empty = (
                    ('address', addr_full),
                    ('city', adresa.get('grad')),
                    ('county', 'Primorsko-goranska'),
                    ('phone', adresa.get('telefon')),
                    ('email', adresa.get('email')),
                    # postal code is only trusted when it came with a city
                    ('postal_code', adresa.get('postal_code') if adresa.get('grad') else None),
                    ('website', 'https://kosarkapgz.com/'),
                )
                set_clauses, params = [], []
                for col, val in fill_if_empty:
                    if val and not row.get(col):
                        set_clauses.append(f'{col} = %s')
                        params.append(val)
                # Always extend sources[] with our marker.
                set_clauses.append(
                    "sources = (SELECT array_agg(DISTINCT x) FROM unnest(coalesce(sources, ARRAY[]::text[]) || ARRAY['kosarkapgz.com page_id=70']) AS x)"
                )
                set_clauses.append('updated_at = now()')
                params.append(eid)
                cur.execute(
                    f"UPDATE civic.entities SET {', '.join(set_clauses)} WHERE id = %s",
                    params,
                )
                res['entity_id'] = eid
                res['entity_action'] = 'updated'
            else:
                cur.execute(
                    "INSERT INTO civic.entities "
                    "  (name, short_name, entity_type, address, city, county, "
                    "   postal_code, phone, email, website, metadata, sources) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
                    "RETURNING id",
                    [_KPGZ_KSPGZ_NAME, 'KSPGŽ', 'sportski_savez',
                     addr_full, adresa.get('grad'), 'Primorsko-goranska',
                     adresa.get('postal_code'), adresa.get('telefon'),
                     adresa.get('email'), 'https://kosarkapgz.com/',
                     json.dumps({'sport': 'košarka', 'level': 'regional',
                                 'aliases': ['KSPGŽ', 'Košarkaški savez PGŽ']}),
                     ['kosarkapgz.com page_id=70']],
                )
                res['entity_id'] = cur.fetchone()['id']
                res['entity_action'] = 'inserted'

            # ── persons upsert ────────────────────────────────────────────
            eid = res['entity_id']
            SOURCE_KEY = 'kosarkapgz.com page_id=234'
            for p in osobe or []:
                # .get + "or ''" so a missing/None name part cannot raise or
                # leak the literal string "None" into the stored name.
                fullname = f"{p.get('ime') or ''} {p.get('prezime') or ''}".strip()
                if not fullname:
                    continue
                cur.execute(
                    "SELECT id, function, metadata FROM civic.persons "
                    "WHERE lower(name) = lower(%s) AND entity_id = %s "
                    "  AND coalesce(metadata->>'source','') = %s "
                    "LIMIT 1",
                    [fullname, eid, SOURCE_KEY],
                )
                existing = cur.fetchone()
                meta = {
                    'source': SOURCE_KEY,
                    'source_url': 'https://kosarkapgz.com/?page_id=234',
                    'ime': p.get('ime'),
                    'prezime': p.get('prezime'),
                    'period': p.get('period'),
                    'scraped_at': int(time.time()),
                }
                if existing:
                    # Keep the stored function when the scrape has none.
                    cur.execute(
                        "UPDATE civic.persons SET function = %s, metadata = %s, "
                        "  updated_at = now() WHERE id = %s",
                        [p.get('funkcija') or existing.get('function'),
                         json.dumps(meta), existing['id']],
                    )
                    res['persons_updated'] += 1
                else:
                    cur.execute(
                        "INSERT INTO civic.persons "
                        "  (name, function, level, entity_id, county, city, metadata, trust_tier) "
                        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
                        [fullname, p.get('funkcija'), 'savez',
                         eid, 'Primorsko-goranska', adresa.get('grad'),
                         json.dumps(meta), 3],
                    )
                    res['persons_inserted'] += 1

            # ── pgz_sport.savezi update ───────────────────────────────────
            # Update predsjednik / email / telefon / web on the KSPGŽ row,
            # looked up by naziv. The SAVEPOINT isolates this best-effort
            # step: on failure we roll back to it, so the entity/person
            # writes above survive and the final COMMIT really commits them.
            cur.execute("SAVEPOINT kspgz_savez_update")
            try:
                # Predsjednik = first member whose funkcija starts with 'predsj'
                predsj = None
                for p in (osobe or []):
                    if (p.get('funkcija') or '').lower().startswith('predsj'):
                        predsj = f"{p.get('ime') or ''} {p.get('prezime') or ''}".strip()
                        break

                cur.execute("SELECT id, predsjednik, tajnik, email, telefon, web "
                            "FROM pgz_sport.savezi WHERE lower(naziv) IN "
                            "  (lower(%s), lower(%s)) LIMIT 1",
                            ['Košarkaški savez PGŽ', 'Košarkaški savez Primorsko-goranske županije'])
                srow = cur.fetchone()
                if srow:
                    sid = srow['id']
                    sets, params, applied = [], [], {}

                    def _set_if_changed(col, new_val):
                        """Queue col = new_val unless empty or already equal."""
                        if not new_val:
                            return
                        cur_val = srow.get(col)
                        if cur_val and str(cur_val).strip() == str(new_val).strip():
                            return
                        sets.append(f"{col} = %s")
                        params.append(new_val)
                        applied[col] = {'before': cur_val, 'after': new_val}

                    _set_if_changed('predsjednik', predsj)
                    # tajnik not reliably published; skip unless future scrape adds it
                    _set_if_changed('email', adresa.get('email'))
                    _set_if_changed('telefon', adresa.get('telefon'))
                    # Web: prefer kosarkapgz.com over generic sport-pgz.hr aggregator
                    if not srow.get('web') or 'sport-pgz.hr' in (srow.get('web') or ''):
                        _set_if_changed('web', 'https://kosarkapgz.com/')

                    if sets:
                        params.append(sid)
                        cur.execute(
                            f"UPDATE pgz_sport.savezi SET {', '.join(sets)} WHERE id = %s",
                            params,
                        )
                        res['savez_updates'] = {'savez_id': sid, 'fields': applied}
                    else:
                        res['savez_updates'] = {'savez_id': sid, 'fields': {}, 'note': 'no diff'}
                else:
                    res['savez_updates'] = {'note': 'KSPGŽ row not found in pgz_sport.savezi'}
                cur.execute("RELEASE SAVEPOINT kspgz_savez_update")
            except Exception as e:
                res['errors'].append(f"savez_update: {e}")
                res['savez_updates'] = {}  # nothing from this step was kept
                # Clears the aborted state; if even this fails the outer
                # handler rolls the whole transaction back.
                cur.execute("ROLLBACK TO SAVEPOINT kspgz_savez_update")

            c.commit()
    except Exception as e:
        if c is not None:
            try:
                c.rollback()
            except Exception:
                pass
        res['errors'].append(str(e))
    finally:
        if c is not None:
            try:
                c.close()
            except Exception:
                pass
    # Bust the kosarkapgz cache so next call re-fetches and re-persists
    try:
        _cache_delete('kosarkapgz:savez_meta:v1')
    except Exception:
        pass
    return res


def _fetch_primary_site(url: str) -> Optional[dict]:
    """Fetch *url* and describe it as a source document.

    Returns None when ``_http_get`` yields nothing. Otherwise returns a dict
    with 'source' (hostname, falling back to the URL), 'url', 'title',
    'extract' (meta description, else the first 500 chars of the tag-stripped
    text) and 'raw_text' (first 8000 chars of the tag-stripped text).
    """
    page = _http_get(url, timeout=6)
    if not page:
        return None
    plain = _strip_tags(page)
    page_meta = _extract_meta(page, url)
    host = urllib.parse.urlparse(url).hostname
    summary = page_meta.get('description')
    if not summary:
        summary = plain[:500]
    return {
        'source': host or url,
        'url': url,
        'title': page_meta.get('title'),
        'extract': summary,
        'raw_text': plain[:8000],
    }


# ─── Anti-halucinacija safeguard (added 2026-05-10 per Damir directive) ────
# When LLM-synthesised descriptions mention famous Croatian sports/political
# names, those mentions MUST be backed by source text — otherwise we strip
# the offending sentence and log to civic.fact_check_log. Triggered after
# Damir caught a "Toni Kukoč briljirao" fabrication for KK Kvarner 2010
# (Kukoč played for Jugoplastika Split, not Kvarner).
FAMOUS_PEOPLE_REQUIRE_PROOF = (
    'Kukoč', 'Petrović', 'Modrić', 'Šuker', 'Boban', 'Kovač',
    'Dalić', 'Mandžukić', 'Rakitić', 'Lovren', 'Vida', 'Perišić',
    'Tuđman', 'Račan', 'Mesić', 'Josipović', 'Milanović',
    'Bilić', 'Štimac', 'Prosinečki', 'Suker',     # Suker = Šuker without diacritic
    'Kostelić', 'Janica', 'Blanka Vlašić', 'Sara Kolak',
    'Sandra Perković', 'Filip Hrgović',
)

def _fact_check_response(text: str, evidence: list[str], kind: Optional[str], naziv: Optional[str]) -> str:
    """Strip sentences mentioning famous people not present in source evidence.
    Logs each strip to civic.fact_check_log. Croatian-diacritic-aware (substring match).
    Fail-soft on logging — if DB write fails the strip still happens.
    """
    if not text:
        return text
    blob = ' '.join(e for e in (evidence or []) if e)
    out = text
    for famous in FAMOUS_PEOPLE_REQUIRE_PROOF:
        if famous not in out:
            continue
        if famous in blob:
            # Mentioned in sources → keep, mark validated in log so we know which famous names DO have proof
            try:
                _c = _db(); _cur = _c.cursor()
                _cur.execute(
                    "INSERT INTO civic.fact_check_log "
                    "(entity_kind, halucinated_text, famous_person, source_validated, sources_searched, metadata) "
                    "VALUES (%s, %s, %s, TRUE, %s, %s::jsonb)",
                    (kind, '(in sources)', famous, len(evidence or []),
                     json.dumps({'naziv': naziv, 'mode': 'kept_validated'}, ensure_ascii=False))
                )
                _cur.close(); _c.close()
            except Exception:
                pass
            continue
        # Famous mention without source backing — strip the sentence(s)
        pattern = re.compile(r'[^.!?]*' + re.escape(famous) + r'[^.!?]*[.!?]?', re.UNICODE)
        stripped_match = pattern.search(out)
        stripped_text = stripped_match.group(0) if stripped_match else ''
        out = pattern.sub('', out)
        try:
            _c = _db(); _cur = _c.cursor()
            _cur.execute(
                "INSERT INTO civic.fact_check_log "
                "(entity_kind, halucinated_text, famous_person, source_validated, sources_searched, metadata) "
                "VALUES (%s, %s, %s, FALSE, %s, %s::jsonb)",
                (kind, (stripped_text or text)[:1000], famous, len(evidence or []),
                 json.dumps({'naziv': naziv, 'mode': 'stripped_no_proof'}, ensure_ascii=False))
            )
            _cur.close(); _c.close()
        except Exception:
            pass
    # Collapse double spaces / orphaned commas left after sentence removal
    out = re.sub(r'\s+', ' ', out)
    out = re.sub(r'\s+([.,;:])', r'\1', out)
    return out.strip()


# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a short Croatian description built from *evidence*.

    Returns None when no API key is configured, when there is no usable
    evidence, when the HTTP call or response parsing fails, or when the
    (fact-checked) answer is empty. The evidence is joined with '---'
    separators and capped at 6000 characters. The model's answer is passed
    through ``_fact_check_response`` before being returned.
    """
    if not DEEPSEEK_KEY or not evidence:
        return None
    joined = "\n---\n".join(e for e in evidence if e)[:6000]
    if not joined.strip():
        return None

    # STRICT FACT RULE — anti-hallucination system prompt (Damir 2026-05-10).
    # Any famous-name mention without source backing will also be stripped
    # post-response by _fact_check_response().
    system_prompt = (
        "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom. "
        "STRICT FACT RULE: NE SPOMINJI specifične povijesne osobe, datume ili dostignuća "
        "osim ako su DOSLOVNO prisutni u priloženim IZVORIMA. Nikada ne nagađaj o slavnim "
        "sportašima (Kukoč, Petrović, Modrić, Šuker, Boban, Kostelić, Vlašić, ...) "
        "osim ako su izrijekom navedeni u IZVORIMA. Ako nisi siguran, IZOSTAVI. "
        "Ako izvori ne sadrže povijesnu informaciju, drži se sadašnjeg djelovanja kluba."
    )
    prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
              f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
              f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": prompt},
    ]
    payload = {
        "model": "deepseek-chat",
        "messages": messages,
        "max_tokens": 280, "temperature": 0.3,
    }
    request_headers = {
        'Authorization': 'Bearer ' + DEEPSEEK_KEY,
        'Content-Type': 'application/json',
        'User-Agent': UA,
    }
    req = urllib.request.Request(
        DEEPSEEK_URL,
        data=json.dumps(payload).encode('utf-8'),
        headers=request_headers,
        method='POST',
    )
    try:
        with urllib.request.urlopen(req, timeout=20) as resp:
            reply = json.loads(resp.read().decode('utf-8'))
        first_choice = reply.get('choices', [{}])[0]
        answer = first_choice.get('message', {}).get('content', '').strip()
        if not answer:
            return None
        # Post-LLM fact validator — strip famous-name mentions lacking source backing
        answer = _fact_check_response(answer, evidence, kind, naziv)
        return answer or None
    except Exception:
        # Fail-soft: any network/parse/validation problem means "no description".
        return None


# ─── Row loaders & display name ─────────────────────────────────────────
def _load_row(kind: str, eid: int) -> dict:
    """Load the row to enrich for *kind* ('klub' | 'savez' | 'sportas').

    For 'sportas' the member row is joined with its club, and an empty
    ``sport`` is filled from the club's sport.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when no row has id *eid*.
    """
    if kind not in ('klub', 'savez', 'sportas'):
        raise HTTPException(400, "kind must be klub|savez|sportas")

    if kind == 'klub':
        row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                                   web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
                                   sjediste, godina_osnutka, savez_id, scrape_url, source_url,
                                   metadata
                            FROM pgz_sport.klubovi WHERE id=%s""", (eid,))
    elif kind == 'savez':
        row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
                                   adresa, godina_osnutka, source_url, metadata
                            FROM pgz_sport.savezi WHERE id=%s""", (eid,))
    else:
        # kind == 'sportas'
        row = _fetch_one("""SELECT c.id, c.ime, c.prezime, c.sport, c.klub_id, c.profile_url,
                                   c.slika_url, c.source_url, c.source, c.source_id,
                                   c.hns_igrac_id, c.biografija,
                                   c.datum_rodenja, c.mjesto_rodenja, c.broj_dresa,
                                   c.visina_cm, c.tezina_kg, c.dominantna_noga, c.oib,
                                   c.vanjski_id, c.metadata,
                                   k.sport AS klub_sport, k.naziv AS klub_naziv
                            FROM pgz_sport.clanovi c
                            LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                            WHERE c.id=%s""", (eid,))
        # Member without its own sport inherits the club's sport.
        if row and not row.get('sport') and row.get('klub_sport'):
            row['sport'] = row['klub_sport']

    if row:
        return row
    raise HTTPException(404, kind + " not found")


def _display_name(kind: str, row: dict) -> str:
    if kind == 'sportas':
        return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    return row.get('naziv', '') or ''


# ─── Sport federations map (loaded once, refresh on file mtime) ─────────
_SPORT_FED_PATH = '/opt/pgz-sport/data/sport_federations.json'
_SPORT_FED_CACHE: dict[str, Any] = {'mtime': 0, 'data': {}, 'aliases': {}, 'media': []}


def _load_sport_feds() -> tuple[dict, dict, list]:
    """Return (feds, aliases, local_media) — refreshed when JSON changes."""
    try:
        st = os.stat(_SPORT_FED_PATH)
    except FileNotFoundError:
        return ({}, {}, [])
    if st.st_mtime != _SPORT_FED_CACHE['mtime']:
        try:
            with open(_SPORT_FED_PATH, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except Exception:
            return (_SPORT_FED_CACHE['data'],
                    _SPORT_FED_CACHE['aliases'],
                    _SPORT_FED_CACHE['media'])
        aliases = raw.pop('_aliases', {}) if isinstance(raw, dict) else {}
        media   = raw.pop('_local_media_pgz', []) if isinstance(raw, dict) else []
        raw.pop('_meta', None)
        _SPORT_FED_CACHE.update(mtime=st.st_mtime, data=raw, aliases=aliases, media=media)
    return (_SPORT_FED_CACHE['data'],
            _SPORT_FED_CACHE['aliases'],
            _SPORT_FED_CACHE['media'])


def _normalize_sport(sport: Optional[str]) -> Optional[str]:
    """Map a free-form sport name to its key in the federations map.

    The name is trimmed and lower-cased, then followed through the alias
    table until it no longer has an alias. Alias loops of any length
    (a → a, or a → b → a) and non-string alias targets stop the walk instead
    of spinning forever.

    Returns:
        The resolved key if it exists in the federations map, else None
        (also for empty/None input).
    """
    if not sport:
        return None
    key = sport.strip().lower()
    feds, aliases, _ = _load_sport_feds()
    seen = {key}
    while key in aliases:
        nxt = aliases[key]
        # Stop on a malformed target or on any name already visited (cycle).
        if not isinstance(nxt, str) or nxt in seen:
            break
        seen.add(nxt)
        key = nxt
    return key if key in feds else None


def _sport_fed(sport: Optional[str]) -> Optional[dict]:
    """Look up the federations entry for *sport*.

    The name is first normalised with ``_normalize_sport``; None is returned
    when it does not resolve to a known sport.
    """
    key = _normalize_sport(sport)
    if key:
        federations = _load_sport_feds()[0]
        return federations.get(key)
    return None


def _research_links(naziv, kind, grad=None, sport: Optional[str] = None, row: Optional[dict] = None):
    base_q = (naziv or '').strip()
    q = (base_q + ' ' + grad) if grad else base_q
    qenc = urllib.parse.quote(q)
    out = []
    # Prefer DIRECT profile/source link if entity already has one (e.g. HNS Semafor)
    if row:
        direct = row.get('profile_url') or row.get('source_url') or row.get('scrape_url') or row.get('web') or row.get('web_stranica')
        if direct and isinstance(direct, str) and direct.startswith(('http://','https://')):
            try:
                host = urllib.parse.urlparse(direct).hostname or ''
            except Exception:
                host = ''
            label = 'Vanjski profil'
            icon = '🔗'
            if 'hns' in host: label, icon = 'HNS profil', '⚽'
            elif 'transfermarkt' in host: label, icon = 'Transfermarkt', '⚽'
            elif 'wikipedia' in host: label, icon = 'Wikipedia', '📚'
            elif host.endswith('.hr') or host.endswith('.com'): label, icon = 'Službena stranica', '🌐'
            out.append({'label': label, 'icon': icon, 'url': direct, 'is_direct': True})
    out += [
        {'label': 'Google',       'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
    ]
    if kind == 'klub':
        out.append({'label': 'Sportilus',       'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
        out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})

    # Sport-specific federation links (replace static HNS/transfermarkt for sportas)
    fed = _sport_fed(sport) if sport else None
    if kind == 'sportas':
        if fed and isinstance(fed.get('national'), dict):
            nat = fed['national']
            search = (nat.get('search_url') or nat.get('url') or '').replace('{q}', qenc)
            if search:
                out.append({'label': nat.get('name', 'Nacionalni savez'),
                            'icon': '🏆', 'url': search})
        if fed and isinstance(fed.get('pgz'), dict):
            pgz = fed['pgz']
            url = pgz.get('search_url') or pgz.get('url') or ''
            if url:
                out.append({'label': pgz.get('name', 'PGŽ savez'),
                            'icon': '🏟', 'url': url.replace('{q}', qenc)})
        if not fed:
            # No mapping for this sport → keep transfermarkt as legacy fallback
            # Prefer direct /igraci/{id}/{slug} when hns_igrac_id exists
            hns_id = (clan or {}).get('hns_igrac_id') if 'clan' in dir() else None
            if not hns_id:
                # Try get from current clan dict
                try: hns_id = clan.get('hns_igrac_id') if isinstance(clan, dict) else None
                except: pass
            if hns_id:
                # Slugify ime+prezime: "Franko Andrijašević" → "franko-andrijasevic"
                _ime = (clan.get('ime','') if isinstance(clan, dict) else '') or ''
                _prez = (clan.get('prezime','') if isinstance(clan, dict) else '') or ''
                _full = (_ime + ' ' + _prez).strip().lower()
                _slug = _full
                for old_c, new_c in [('č','c'),('ć','c'),('ž','z'),('š','s'),('đ','d'),(' ','-')]:
                    _slug = _slug.replace(old_c, new_c)
                _slug = re.sub(r'[^a-z0-9-]', '', _slug)
                out.append({'label': 'HNS Semafor (profil)', 'icon': '⚽', 'url': f'https://semafor.hns.family/igraci/{hns_id}/{_slug}/'})
            else:
                out.append({'label': 'HNS Semafor (pretraga)', 'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc})
            out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
        # Local PGŽ media for any sportas
        _, _, media = _load_sport_feds()
        for m in media:
            url = (m.get('search_url') or '').replace('{q}', qenc)
            if url:
                out.append({'label': m.get('name', 'Lokalni medij'),
                            'icon': '📰', 'url': url})
    if kind == 'savez':
        out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
    return out


# ─── Proposal pipelines ─────────────────────────────────────────────────
def _name_tokens(naziv: str) -> list[str]:
    """Significant tokens from entity name (≥4 chars, deaccented)."""
    import unicodedata
    s = unicodedata.normalize('NFKD', naziv or '').encode('ascii', 'ignore').decode('ascii').lower()
    toks = [t for t in re.split(r'[^a-z0-9]+', s) if len(t) >= 4]
    stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni',
            'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'}
    return [t for t in toks if t not in stop] or toks


def _is_relevant(source: dict, tokens: list[str]) -> bool:
    """A source is 'relevant' only if the page actually mentions the entity name."""
    if not tokens: return True
    import unicodedata
    blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '')
    blob = unicodedata.normalize('NFKD', blob.lower()).encode('ascii', 'ignore').decode('ascii')
    return any(t in blob for t in tokens)


# ─── Klub domain guesser (HR slug → candidate URLs → GET probe) ─────────
import re as _re_klg

def _slugify_klub(naziv: str) -> str:
    if not naziv: return ""
    s = naziv.lower()
    repl = (("č","c"),("ć","c"),("ž","z"),("š","s"),("đ","d"),
            ('"',''),("'",""),("(",""),(")",""),(",",""),(".",""),
            ("/",""),("\\",""))
    for a,b in repl: s = s.replace(a,b)
    s = _re_klg.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s

def _klub_domain_candidates(naziv: str) -> list[str]:
    """Generate ranked candidate URLs from club name."""
    if not naziv: return []
    s = _slugify_klub(naziv)
    # Strip common prefixes for cleaner domains
    base = s
    for pref in ("hnk-","nk-","rk-","kk-","ok-","bk-","gk-","tk-","ak-","hbk-"):
        if base.startswith(pref):
            base = base[len(pref):]; break
    # also try short prefix-ed variants
    short = base.split("-")[0] if base else ""
    candidates = []
    sports_prefixes = ["nk-","hnk-","rk-","kk-","bk-","ok-","ak-","tk-"]
    # full slug with original prefix
    for tld in (".hr",".com",".eu",".info"):
        candidates.append(f"https://{s}{tld}")
        candidates.append(f"https://www.{s}{tld}")
    # base-only
    for tld in (".hr",".com"):
        candidates.append(f"https://{base}{tld}")
        candidates.append(f"https://www.{base}{tld}")
    # try sport prefixes if name doesn't already have one
    if not any(s.startswith(p) for p in sports_prefixes):
        for sp in sports_prefixes[:5]:
            for tld in (".hr",".com"):
                candidates.append(f"https://{sp}{base}{tld}")
    # dedup, preserve order
    seen, out = set(), []
    for c in candidates:
        if c not in seen:
            seen.add(c); out.append(c)
    return out[:20]

def _probe_klub_url(url: str, naziv_tokens: list, timeout: int = 5) -> Optional[dict]:
    """HEAD/GET probe; return doc with raw_text if URL is alive AND mentions club tokens."""
    try:
        import requests
        r = requests.get(url, timeout=timeout, allow_redirects=True,
                         headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
        if r.status_code != 200: return None
        if len(r.text) < 200: return None
        text = r.text.lower()
        # Must mention at least one distinctive token from name
        toks = [t.lower() for t in (naziv_tokens or []) if len(t) > 2]
        if toks and not any(t in text for t in toks):
            return None
        return {"source": "domain_probe", "url": r.url, "raw_text": r.text[:50000]}
    except Exception:
        return None

def _guess_klub_domains(naziv: str, tokens: list) -> Optional[dict]:
    """Probe guessed club domains in parallel; the first hit wins.

    Up to 16 candidates from ``_klub_domain_candidates`` are probed with
    ``_probe_klub_url`` on 8 worker threads (4 s timeout per probe, 10 s
    overall budget). Probes that have not started yet are cancelled as soon
    as a hit is found or the budget runs out; probes already running are
    still waited for when the pool shuts down.

    Returns:
        The first non-empty probe document, or None when there are no
        candidates, nothing matched, or the overall budget was exceeded.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Before Python 3.11 this is NOT the builtin TimeoutError, so it has to
    # be caught by its own name for the graceful-None path to work.
    from concurrent.futures import TimeoutError as FuturesTimeoutError
    candidates = _klub_domain_candidates(naziv)
    if not candidates:
        return None
    with ThreadPoolExecutor(max_workers=8) as ex:
        futs = [ex.submit(_probe_klub_url, url, tokens, 4) for url in candidates[:16]]
        try:
            for fut in as_completed(futs, timeout=10):
                try:
                    doc = fut.result()
                except Exception:
                    continue
                if doc:
                    return doc
        except FuturesTimeoutError:
            # Overall budget exceeded — graceful None instead of 500
            return None
        finally:
            # Best effort: drop probes still queued (no-op for done/running).
            for f in futs:
                f.cancel()
    return None

def _scrape_klub_subpages(base_url: str, tokens: list) -> str:
    """Fetch /kontakt /uprava /o-nama /o-klubu and concat texts.

    Parallelized via ThreadPoolExecutor (8 workers, 3s per-URL timeout, 8s total budget).
    Was sequential 8x4s = up to 32s.
    """
    if not base_url: return ""
    import requests
    from concurrent.futures import ThreadPoolExecutor, as_completed
    base = base_url.rstrip("/")
    paths = ["/kontakt","/uprava","/o-nama","/o-klubu","/predsjednik","/klub","/contact","/about"]
    headers = {"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"}

    def _fetch(path):
        try:
            r = requests.get(base + path, timeout=3, allow_redirects=True, headers=headers)
            if r.status_code == 200 and len(r.text) > 200:
                return r.text[:30000]
        except Exception:
            pass
        return None

    accum = []
    with ThreadPoolExecutor(max_workers=8) as ex:
        futs = {ex.submit(_fetch, p): p for p in paths}
        try:
            for fut in as_completed(futs, timeout=8):
                try:
                    txt = fut.result()
                    if txt: accum.append(txt)
                except Exception:
                    continue
        except Exception:
            for f in futs:
                if not f.done(): f.cancel()
    return "\n\n".join(accum)


def _propose_for_klub(row: dict) -> dict:
    """Build enrichment proposals for one club row.

    Gathers source documents (the club's own site or a guessed domain plus
    its subpages, the Wikipedia summary, the sport-pgz search result and,
    for PGŽ basketball clubs, kosarkapgz.com posts) and proposes values only
    for fields that are currently empty in *row*.

    Args:
        row: Club record as a dict; reads ``naziv``, ``web``,
            ``web_stranica``, ``source_url``, ``scrape_url``, ``sport``,
            ``email``, ``telefon`` and ``opis_djelatnosti``.

    Returns:
        ``{'proposed': {field: value}, 'sources': [doc, ...]}``. ``sources``
        contains every gathered document, not only the relevant ones.
    """
    naziv = row.get('naziv') or ''
    # Only consider HTTP(S) URLs as valid primary sources — skip placeholder strings like 'godisnjak_2025'
    raw_primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
    primary = raw_primary if (raw_primary and isinstance(raw_primary, str) and raw_primary.startswith(('http://','https://'))) else None
    # `sources` keeps whole documents (returned to the caller); `evidence`
    # keeps only their text and is the fallback input for the description.
    sources, evidence = [], []
    tokens_pre = _name_tokens(naziv)
    pdoc = _fetch_primary_site(primary) if primary else None
    if not pdoc:
        # No valid web in DB — try to guess domain from club name
        pdoc = _guess_klub_domains(naziv, tokens_pre)
        if pdoc:
            # Also fetch subpages for richer evidence
            sub = _scrape_klub_subpages(pdoc.get('url',''), tokens_pre)
            if sub:
                # Cap the merged text at 120000 characters.
                pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
    elif pdoc:
        # Have primary site — also fetch its subpages
        sub = _scrape_klub_subpages(pdoc.get('url') or primary, tokens_pre)
        if sub:
            pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
    if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '')
    wiki = _wiki_summary(naziv)
    if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
    spz = _sport_pgz_search(naziv)
    if spz: sources.append(spz); evidence.append(spz.get('raw_text') or spz.get('extract') or '')
    # kosarkapgz.com — only if this looks like a PGŽ basketball club
    if _kosarkapgz_is_basketball_pgz(naziv, row.get('sport')):
        kpgz = _kosarkapgz_search_posts(naziv, limit=5)
        if kpgz: sources.append(kpgz); evidence.append(kpgz.get('raw_text') or '')

    # NOTE(review): same call as `tokens_pre` above; presumably returns the
    # same value — confirm `_name_tokens` is pure before merging the two.
    tokens = _name_tokens(naziv)
    # Contact data is extracted only from documents that pass _is_relevant.
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)

    proposed: dict[str, Any] = {}
    # web/email/telefon: ONLY from sources actually mentioning the entity
    if not row.get('web'):
        u = _find_official_web(relevant_blob, naziv)
        if u: proposed['web'] = u
    if not row.get('email'):
        e = _find_email(relevant_blob)
        if e: proposed['email'] = e
    if not row.get('telefon'):
        t = _find_phone(relevant_blob)
        if t: proposed['telefon'] = t
    if not row.get('opis_djelatnosti'):
        # Prefer text of relevant sources; with none relevant, fall back to
        # all collected evidence.
        descr_evidence = [(s.get('raw_text') or s.get('extract') or '') for s in relevant] or evidence
        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
        if not descr:
            # No synthesised description: use the first source extract that
            # is at least 80 characters long.
            for s in (relevant or sources):
                if s.get('extract') and len(s['extract']) >= 80:
                    descr = s['extract']; break
        if descr: proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}


def _propose_for_savez(row: dict) -> dict:
    """Build enrichment proposals for one federation (savez) row.

    Gathers source documents (the federation's own site, the Wikipedia
    summary, the sport-pgz search result and, for basketball federations,
    the kosarkapgz.com federation meta) and proposes ``web``, ``email`` and
    ``telefon`` for fields that are currently empty in *row*.

    Args:
        row: Federation record as a dict; reads ``naziv``, ``web``,
            ``source_url``, ``sport``, ``email`` and ``telefon``.

    Returns:
        ``{'proposed': {field: value}, 'sources': [doc, ...]}``. ``sources``
        contains every gathered document, not only the relevant ones.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('source_url')

    # Collection order is preserved because `sources` is returned as-is.
    # (The former parallel `evidence` list was never read and is dropped.)
    sources = []
    pdoc = _fetch_primary_site(primary) if primary else None
    if pdoc:
        sources.append(pdoc)
    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
    spz = _sport_pgz_search(naziv)
    if spz:
        sources.append(spz)
    # kosarkapgz.com — fetch federation meta for any basketball savez
    if _kosarkapgz_is_basketball_pgz(naziv, row.get('sport')):
        kpgz = _kosarkapgz_savez_meta()
        if kpgz:
            sources.append(kpgz)

    # Contact data is extracted only from documents that pass _is_relevant.
    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)

    finders = (
        ('web', lambda blob: _find_official_web(blob, naziv)),
        ('email', _find_email),
        ('telefon', _find_phone),
    )
    proposed: dict[str, Any] = {}
    for field, finder in finders:
        if row.get(field):
            continue  # never propose over an existing value
        value = finder(relevant_blob)
        if value:
            proposed[field] = value
    return {'proposed': proposed, 'sources': sources}


# ─── HNS Semafor parsing ────────────────────────────────────────────────
# Root of the HNS "Semafor" site; player profile URLs in this module are
# built as f'{_HNS_BASE}/igraci/<id>/<slug>/'.
_HNS_BASE = 'https://semafor.hns.family'

def _slugify(name: str) -> str:
    import unicodedata
    s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower()
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')

def _hns_url_from_row(row: dict) -> Optional[str]:
    """Try to build a semafor.hns.family /igraci/ URL for this row."""
    # 1) Already-set columns
    for k in ('profile_url', 'source_url'):
        u = row.get(k)
        if u and 'semafor.hns.family/igraci/' in (u or ''):
            return u
    # 2) hns_igrac_id column
    pid = row.get('hns_igrac_id')
    if pid:
        slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
        return f'{_HNS_BASE}/igraci/{int(pid)}/{slug}/'
    # 3) vanjski_id JSONB → hns_comet
    vid = row.get('vanjski_id') or {}
    if isinstance(vid, dict):
        comet = vid.get('hns_comet') or vid.get('hns_pid')
        slug  = vid.get('hns_slug') or _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
        if comet:
            try:
                return f'{_HNS_BASE}/igraci/{int(comet)}/{slug}/'
            except Exception:
                pass
    # 4) source='hns_semafor' + source_id
    if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
        try:
            slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
            return f'{_HNS_BASE}/igraci/{int(row["source_id"])}/{slug}/'
        except Exception:
            pass
    return None


def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
    """Extract structured fields from a semafor.hns.family player page."""
    if not html_doc: return None
    try:
        from bs4 import BeautifulSoup
    except Exception:
        return _parse_hns_player_regex(html_doc, url)
    soup = BeautifulSoup(html_doc, 'html.parser')
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}

    # hns_igrac_id from URL
    m = re.search(r'/igraci/(\d+)/', url)
    if m: out['hns_igrac_id'] = int(m.group(1))

    title = soup.find('title')
    if title: out['title'] = title.get_text(strip=True)[:300]

    # Photo
    photo = soup.find('div', class_='photo')
    if photo:
        img = photo.find('img')
        if img and img.get('src'):
            src = img['src']
            if not src.startswith('http'):
                src = urllib.parse.urljoin(url, src)
            out['slika_url'] = src

    # Player number (jersey)
    pn = soup.find('div', class_='playerName')
    if pn:
        h3 = pn.find('h3')
        if h3:
            t = h3.get_text(strip=True)
            if t.isdigit():
                out['broj_dresa'] = int(t)

    # Datum rodjenja
    li = soup.find('li', class_='dob')
    if li:
        h4 = li.find('h4')
        if h4:
            t = h4.get_text(' ', strip=True)
            mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
            if mm:
                from datetime import date as _date
                try:
                    out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat()
                except Exception:
                    pass

    # Mjesto rodjenja
    li = soup.find('li', class_='pob')
    if li:
        h4 = li.find('h4')
        if h4:
            out['mjesto_rodenja'] = h4.get_text(strip=True)

    # Trenutni klub (info only — we don't reassign klub_id from here)
    klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if klub_link:
        h4 = klub_link.find('h4')
        if h4:
            out['trenutni_klub'] = h4.get_text(strip=True)
        m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '')
        if m: out['hns_klub_id'] = int(m.group(1))

    # Description (meta)
    meta_d = soup.find('meta', attrs={'name': 'description'})
    if meta_d and meta_d.get('content'):
        out['description'] = meta_d['content'][:600]

    # Make a clean text blob for relevance / DeepSeek
    text = soup.get_text(' ', strip=True)
    out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000]
    out['extract'] = (out.get('description')
                      or (out['raw_text'][:500] if out.get('raw_text') else None))
    return out


def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
    """BS4-free fallback parser."""
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
    m = re.search(r'/igraci/(\d+)/', url)
    if m: out['hns_igrac_id'] = int(m.group(1))
    m = re.search(r'<div class="photo"><img src="([^"]+)"', html_doc)
    if m:
        src = m.group(1)
        if not src.startswith('http'): src = urllib.parse.urljoin(url, src)
        out['slika_url'] = src
    m = re.search(r'<li class="dob">.*?<h4>(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S)
    if m:
        from datetime import date as _date
        try:
            out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
        except Exception:
            pass
    m = re.search(r'<li class="pob"><i></i><h4>([^<]+)</h4>', html_doc)
    if m: out['mjesto_rodenja'] = m.group(1).strip()
    m = re.search(r'<div class="playerName"><h3>(\d+)</h3>', html_doc)
    if m: out['broj_dresa'] = int(m.group(1))
    m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html_doc)
    if m: out['description'] = m.group(1)[:600]
    return out


def _hns_fetch_player(url: str) -> Optional[dict]:
    """Fetch an HNS player page and parse it; ``None`` when nothing was fetched.

    A plain HTTP GET (8 s) is tried first. If it returns nothing and the
    Playwright scraper is available, the page is rendered once and, when the
    rendered page reports more than 2000 characters of HTML, the plain GET
    is retried with a 15 s timeout (the rendered HTML itself is not kept by
    ``fetch_rendered``, so the retry supplies the markup to parse).
    """
    page = _http_get(url, timeout=8)
    if not page and _HAS_PW and _pw_scraper is not None:
        rendered = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
        if rendered and rendered.get('html_len', 0) > 2000:
            page = _http_get(url, timeout=15)
    if not page:
        return None
    return _parse_hns_player(page, url)


# ─── Generic sport-federation scraper ───────────────────────────────────
def _fed_url_from_row(row: dict) -> Optional[str]:
    """Return the row's ``source_url``/``profile_url`` if it is on a federation host.

    The set of known hosts is derived from the ``url``, ``search_url`` and
    ``profile_url_pattern`` entries of every federation's ``national`` and
    ``pgz`` sections. ``source_url`` is checked before ``profile_url``.
    Returns ``None`` when neither column points at a known host.
    """
    feds, _, _ = _load_sport_feds()

    def _template_host(template):
        # Placeholders are replaced with dummy values so the template parses
        # as an ordinary URL; unparsable templates contribute no host.
        try:
            concrete = template.replace('{q}', 'x').replace('{slug}', 'x').replace('{hns_pid}', '1')
            return urllib.parse.urlparse(concrete).hostname
        except Exception:
            return None

    known_hosts = set()
    for entry in feds.values():
        if not isinstance(entry, dict):
            continue
        for scope in ('national', 'pgz'):
            section = entry.get(scope) or {}
            for key in ('url', 'search_url', 'profile_url_pattern'):
                template = section.get(key)
                host = _template_host(template) if template else None
                if host:
                    known_hosts.add(host)

    for column in ('source_url', 'profile_url'):
        candidate = row.get(column)
        if not candidate:
            continue
        try:
            candidate_host = urllib.parse.urlparse(candidate).hostname or ''
        except Exception:
            continue
        if candidate_host in known_hosts:
            return candidate
    return None


def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]:
    """Best-effort parser for a generic sport-federation profile page.

    Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub,
             extract, raw_text}. Tolerant of varied page structures.
    """
    if not html_doc: return None
    host = urllib.parse.urlparse(url).hostname or ''
    out: dict[str, Any] = {
        'source': host,
        'url': url,
    }
    # Title
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m: out['title'] = html.unescape(m.group(1).strip())[:300]
    # Meta description
    m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    if m: out['description'] = html.unescape(m.group(1).strip())[:600]

    name_tokens = []
    for t in (ime, prezime):
        if t and len(t) >= 3:
            name_tokens.append(re.escape(t))

    # Pick the first content image whose filename contains the player's name,
    # or fall back to the first non-asset image.
    img_candidates = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', html_doc, re.I)
    chosen_img = None
    for src in img_candidates:
        low = src.lower()
        if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader',
                                  'sprite', '/themes/', '/icons/', 'gdpr', 'banner',
                                  'header', 'footer', 'placeholder', 'avatar-default')):
            continue
        if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')):
            continue
        # Prefer matches on player name in URL
        if name_tokens and any(re.search(t, src, re.I) for t in name_tokens):
            chosen_img = src; break
        if chosen_img is None:
            chosen_img = src
    if chosen_img:
        if not chosen_img.startswith('http'):
            chosen_img = urllib.parse.urljoin(url, chosen_img)
        out['slika_url'] = chosen_img

    # Plain text body for evidence + label scraping
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', html_doc, flags=re.S | re.I)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
    out['raw_text'] = text[:4000]
    out['extract']  = (out.get('description')
                       or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500]
                       or text[:500])

    # Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …")
    m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I)
    if m:
        try:
            from datetime import date as _date
            d = re.split(r'[.\-/]', m.group(1))
            out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat()
        except Exception:
            pass
    if 'datum_rodenja' not in out:
        m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I)
        if m:
            try:
                from datetime import date as _date
                out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat()
            except Exception:
                pass
    m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text)
    if m: out['mjesto_rodenja'] = m.group(1).strip()
    m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I)
    if m: out['klub_naziv'] = m.group(1).strip().rstrip('.')

    return out


def _slugify_simple(s: str) -> str:
    import unicodedata
    s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower()
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')


def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]:
    """Locate and parse the athlete's profile on the national federation site.

    Two strategies are tried in order, using the ``national`` section of the
    federation entry for *sport*:

    1. ``profile_url_pattern`` containing ``{slug}``: the slugified full name
       is substituted and the page is accepted if it mentions the surname.
    2. ``search_url``: the search result is scanned for profile-like links
       whose slug contains the slugified surname; the first such link that
       can be fetched is parsed.

    Returns:
        The parsed profile dict, or ``None`` when *sport* is empty, has no
        federation entry, or no profile page was found.
    """
    fed = _sport_fed(sport) if sport else None
    if not fed:
        return None
    national = (fed or {}).get('national') or {}
    full_name = (ime + ' ' + prezime).strip()

    # 1) Direct profile URL via {slug} pattern (works for HBS at least)
    profile_pattern = national.get('profile_url_pattern')
    if profile_pattern and '{slug}' in profile_pattern:
        profile_url = profile_pattern.replace('{slug}', _slugify_simple(full_name))
        page = _http_get(profile_url, timeout=8)
        if page and prezime.lower() in page.lower():
            return _parse_federation_profile(page, profile_url, ime, prezime)

    # 2) Search URL → first profile-like link that mentions the surname
    search_template = national.get('search_url')
    if not search_template:
        return None
    results = _http_get(search_template.replace('{q}', urllib.parse.quote(full_name)), timeout=10)
    if not results:
        return None

    link_patterns = (r'href="([^"]*?/igraci/[^"]+)"',
                     r'href="([^"]*?/igrac/[^"]+)"',
                     r'href="([^"]*?/sportasi/[^"]+)"',
                     r'href="([^"]*?/clanovi/[^"]+)"',
                     r'href="([^"]*?/profil/[^"]+)"')
    surname_slug = _slugify_simple(prezime)
    for link_pattern in link_patterns:
        for hit in re.finditer(link_pattern, results, re.I):
            link = hit.group(1)
            if not link.startswith('http'):
                link = urllib.parse.urljoin(national.get('url', search_template), link)
            if surname_slug not in _slugify_simple(link):
                continue
            profile_page = _http_get(link, timeout=8)
            if profile_page:
                return _parse_federation_profile(profile_page, link, ime, prezime)
    return None


def _propose_for_sportas(row: dict) -> dict:
    """Build enrichment proposals for one athlete row.

    Gathers, in this order, the HNS Semafor player page, a sport-federation
    profile, the Wikipedia summary (only when ``biografija`` is empty) and
    kosarkapgz.com posts (basketball only), then proposes values only for
    fields that are currently empty in *row*.

    Args:
        row: Athlete record as a dict; reads ``ime``, ``prezime``, ``sport``
            and the target fields checked below.

    Returns:
        ``{'proposed': {field: value}, 'sources': [doc, ...]}``.
    """
    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    ime, prezime = (row.get('ime') or ''), (row.get('prezime') or '')
    sport = row.get('sport')
    # `sources` holds whole documents (returned); `evidence` holds their
    # text and feeds the biography synthesis at the end.
    sources, evidence = [], []
    proposed: dict[str, Any] = {}

    # 1) HNS Semafor — only meaningful when sport is football OR row already
    # carries an HNS link.
    hns_doc: Optional[dict] = None
    # NOTE(review): _hns_url_from_row(row) is evaluated in the condition and
    # again below; a fetch only happens when it returns a URL.
    if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row):
        hns_url = _hns_url_from_row(row)
        if hns_url:
            hns_doc = _hns_fetch_player(hns_url)
            if hns_doc:
                sources.append(hns_doc)
                evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')

    # 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing
    # source_url/profile_url if it points at a known federation host.
    fed_doc: Optional[dict] = None
    direct_fed_url = _fed_url_from_row(row)
    # Skip the direct fetch when it is the very page already parsed as hns_doc.
    if direct_fed_url and (not hns_doc or hns_doc.get('url') != direct_fed_url):
        body = _http_get(direct_fed_url, timeout=8)
        if body:
            fed_doc = _parse_federation_profile(body, direct_fed_url, ime, prezime)
    if not fed_doc:
        fed_doc = scrape_sport_federation(sport, ime, prezime)
    if fed_doc:
        sources.append(fed_doc)
        evidence.append(fed_doc.get('raw_text') or fed_doc.get('extract') or '')

    # Helper: pick from hns_doc first then fed_doc
    def _pick(field):
        if hns_doc and hns_doc.get(field): return hns_doc[field]
        if fed_doc and fed_doc.get(field): return fed_doc[field]
        return None

    # Every proposal below is guarded by "field currently empty in row".
    if not row.get('profile_url'):
        v = _pick('url') or (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
        if v: proposed['profile_url'] = v
    if not row.get('source_url'):
        v = (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
        if v: proposed['source_url'] = v
    if not row.get('slika_url'):
        v = _pick('slika_url')
        if v: proposed['slika_url'] = v
    # hns_igrac_id and broj_dresa come from the HNS document only.
    if not row.get('hns_igrac_id') and hns_doc and hns_doc.get('hns_igrac_id'):
        proposed['hns_igrac_id'] = hns_doc['hns_igrac_id']
    if not row.get('datum_rodenja'):
        v = _pick('datum_rodenja')
        if v: proposed['datum_rodenja'] = v
    if not row.get('mjesto_rodenja'):
        v = _pick('mjesto_rodenja')
        if v: proposed['mjesto_rodenja'] = v
    if not row.get('broj_dresa') and hns_doc and hns_doc.get('broj_dresa'):
        proposed['broj_dresa'] = hns_doc['broj_dresa']

    # 3) Wikipedia HR for biografija
    if not row.get('biografija'):
        wiki = _wiki_summary(naziv)
        if wiki:
            sources.append(wiki)
            evidence.append(wiki.get('extract') or '')

    # 4) kosarkapgz.com — news mentions for PGŽ basketball players
    if _kosarkapgz_is_basketball_pgz('', sport):
        q = (ime + ' ' + prezime).strip()
        kpgz = _kosarkapgz_search_posts(q, limit=5) if q else None
        if kpgz:
            sources.append(kpgz)
            evidence.append(kpgz.get('raw_text') or '')

    # Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet
    if not row.get('biografija'):
        descr = _deepseek_describe(naziv, f'sportaš ({sport})' if sport else 'sportaš', evidence) if evidence else None
        if not descr:
            # First source extract that is at least 80 characters long.
            for s in sources:
                ext = s.get('extract')
                if ext and len(ext) >= 80:
                    descr = ext; break
        if descr:
            proposed['biografija'] = descr.strip()[:2000]

    return {'proposed': proposed, 'sources': sources}


# ─── Endpoints ──────────────────────────────────────────────────────────
# ─── R4 — POST /v2/enrich/forensic/{finding_id} ─────────────────────────
def _extract_pep_name(finding: dict) -> Optional[str]:
    """Pull the primary person name from a forensic_findings row."""
    title = (finding.get('title') or '').strip()
    desc  = (finding.get('description') or '').strip()
    payload = finding.get('raw_data') or {}
    if isinstance(payload, str):
        try: payload = json.loads(payload)
        except Exception: payload = {}
    if isinstance(payload, dict):
        for k in ('person_name', 'name', 'osoba'):
            v = payload.get(k)
            if v: return str(v).strip()
    # Try entities_involved.entity_name
    ents = finding.get('entities_involved') or []
    if isinstance(ents, str):
        try: ents = json.loads(ents)
        except Exception: ents = []
    if isinstance(ents, list):
        for e in ents:
            if isinstance(e, dict) and e.get('person_name'):
                return str(e['person_name']).strip()
            if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''):
                # Some entries store person names as entity_name when entity_type='person'
                if (e.get('entity_type') or '').lower() in ('person','osoba'):
                    return str(e['entity_name']).strip()
    # Fallback: extract a "Ime Prezime" from the title
    m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc)
    if m: return f"{m.group(1)} {m.group(2)}"
    return None


def _gather_pep_evidence(name: str) -> list[dict]:
    """Collect public web evidence about *name*.

    Returns a list with, when available, the Wikipedia summary document
    followed by the first DuckDuckGo (html-lite) result for
    ``"<name>" PGŽ Hrvatska`` as ``{source, url, title, extract}``.
    """
    found: list[dict] = []
    wiki_doc = _wiki_summary(name)
    if wiki_doc:
        found.append(wiki_doc)

    # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs)
    search_url = 'https://html.duckduckgo.com/html/?q=' + urllib.parse.quote(f'"{name}" PGŽ Hrvatska')
    results_page = _http_get(search_url, timeout=8)
    if not results_page:
        return found

    # Only the first result link and the first snippet on the page are used.
    link_match = re.search(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>([^<]{6,200})</a>', results_page)
    snippet_match = re.search(r'<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', results_page, re.S)
    if link_match:
        if snippet_match:
            snippet_text = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()[:600]
        else:
            snippet_text = None
        found.append({
            'source': 'duckduckgo',
            'url': html.unescape(link_match.group(1))[:500],
            'title': html.unescape(link_match.group(2)).strip()[:300],
            'extract': snippet_text,
        })
    return found


def _related_entities_for_pep(name: str) -> list[dict]:
    """Look up civic.persons matching *name* together with their entity links.

    Up to 10 persons whose name contains *name* (case-insensitive) are
    returned, persons with an OIB first. For each person that has an OIB, up
    to 30 rows from civic.person_entity_links (joined with civic.entities)
    are attached under ``entities``; persons without an OIB get ``[]``.
    """
    people: list[dict] = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""SELECT id, name, function, party, county, city, oib, trust_tier
                       FROM civic.persons
                       WHERE upper(name) ILIKE upper(%s)
                       ORDER BY oib NULLS LAST, id LIMIT 10""", ('%'+name+'%',))
        # All person rows are fetched up front, so the cursor can be reused
        # for the per-person link queries below.
        for person_row in cur.fetchall():
            person = dict(person_row)
            linked: list[dict] = []
            if person.get('oib'):
                cur.execute("""SELECT pel.entity_id, pel.roles, e.name AS entity_name,
                                      e.oib AS entity_oib, e.entity_type, e.city, e.risk_score
                               FROM civic.person_entity_links pel
                               LEFT JOIN civic.entities e ON e.id = pel.entity_id
                               WHERE pel.person_oib=%s LIMIT 30""", (person['oib'],))
                linked = [dict(link_row) for link_row in cur.fetchall()]
            people.append({
                'kind': 'person',
                'person_id': person['id'], 'person_name': person['name'],
                'function': person.get('function'), 'party': person.get('party'),
                'county': person.get('county'), 'city': person.get('city'),
                'oib': person.get('oib'), 'trust_tier': person.get('trust_tier'),
                'entities': linked,
            })
    return people


@router.post("/enrich/forensic/{finding_id}")
def enrich_forensic_v2(finding_id: int,
                    body: dict = Body(default=None),
                    x_user_email: Optional[str] = Header(default=None),
                    x_user_id:    Optional[int] = Header(default=None)):
    """Enrich a forensic finding: gather Wiki + DDG snippets + civic graph,
    write back to civic.forensic_findings.related_entities, and seal the
    payload hash on Polygon (or queue for sealing).

    Args:
        finding_id: Primary key of the civic.forensic_findings row.
        body: Optional JSON body; ``{"name": "..."}`` overrides the name
            that would otherwise be derived from the finding.
        x_user_email: Caller e-mail header; recorded in the enrichment
            history and passed to the sealing call.
        x_user_id: Caller id header; passed to the sealing call.

    Returns:
        Dict with ``finding_id``, ``name``, ``sources``, ``related_entities``,
        ``related_count``, ``enrichment_metadata`` and ``seal``.

    Raises:
        HTTPException: 404 when the finding does not exist; 400 when no name
            could be derived and none was supplied.
    """
    body = body or {}
    explicit_name = (body.get('name') or '').strip() or None

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""SELECT id, finding_type, severity, title, description,
                              entities_involved, raw_data, related_entities, enrichment_metadata
                       FROM civic.forensic_findings WHERE id=%s""", (finding_id,))
        finding = cur.fetchone()
    if not finding:
        raise HTTPException(404, "finding not found")
    finding = dict(finding)

    # An explicitly supplied name always wins over the derived one.
    name = explicit_name or _extract_pep_name(finding)
    if not name:
        raise HTTPException(400, "could not derive a person/entity name; pass {name: \"…\"}")

    sources  = _gather_pep_evidence(name)
    related  = _related_entities_for_pep(name)

    # This payload is what gets hashed and sealed; sources are reduced to
    # source/url/title (no extract text).
    payload = {
        'finding_id': finding_id,
        'name': name,
        'sources': [{'source': s.get('source'), 'url': s.get('url'),
                     'title': s.get('title')} for s in sources],
        'related_entities': related,
        'enriched_at': datetime.now(timezone.utc).isoformat(),
    }

    # Persist back to the finding
    enrichment_meta = finding.get('enrichment_metadata') or {}
    if not isinstance(enrichment_meta, dict): enrichment_meta = {}
    # NOTE(review): assumes an existing 'history' value is a list — a
    # non-list truthy value would fail on append(); confirm the column shape.
    history = enrichment_meta.get('history') or []
    history.append({
        'at': payload['enriched_at'],
        'sources': payload['sources'],
        'related_count': len(related),
        'user': x_user_email,
    })
    # Keep only the 10 most recent history entries.
    enrichment_meta['history']      = history[-10:]
    enrichment_meta['enriched_at']  = payload['enriched_at']
    enrichment_meta['enriched_by']  = x_user_email or 'system'
    enrichment_meta['source_count'] = len(sources)

    # related_entities is overwritten (not merged) with the fresh lookup.
    with _db() as c, c.cursor() as cur:
        cur.execute("""UPDATE civic.forensic_findings
                          SET related_entities    = %s::jsonb,
                              enrichment_metadata = %s::jsonb
                        WHERE id=%s
                        RETURNING id""",
                    (json.dumps(related, default=str, ensure_ascii=False),
                     json.dumps(enrichment_meta, default=str, ensure_ascii=False),
                     finding_id))
        cur.fetchone()

    # Seal the enrichment payload hash on Polygon (or queue if no key)
    # Sealing is best effort: any failure is reported in the response under
    # 'seal' and does not undo the UPDATE above.
    seal_result: dict[str, Any] = {}
    try:
        sys_path_added = False
        try:
            from blockchain import seal as _seal_mod  # noqa: E402
        except Exception:
            # Fall back to the deployment directory when the package is not
            # importable from the current sys.path.
            import sys as _ssys
            _ssys.path.insert(0, '/opt/pgz-sport')
            from blockchain import seal as _seal_mod  # noqa: E402
            sys_path_added = True
        del sys_path_added  # silence linters
        h = _seal_mod.hash_payload(payload)
        seal_result = _seal_mod.seal_to_polygon(
            data_hash=h,
            ref_id=str(finding_id),
            action='forensic.enriched',
            ref_type='forensic_finding',
            payload=payload,
            user_id=x_user_id,
            user_email=x_user_email,
        )
    except Exception as e:
        seal_result = {'error': f'{type(e).__name__}: {e}'}

    # Unlike the sealed payload, the response carries the full source docs.
    return {
        'finding_id': finding_id,
        'name': name,
        'sources': sources,
        'related_entities': related,
        'related_count': len(related),
        'enrichment_metadata': enrichment_meta,
        'seal': seal_result,
    }


from fastapi import Path as _FPath

@router.post("/enrich/{kind:str}/{eid:int}")
# NOTE(review): newer FastAPI releases deprecate `regex=` in favour of
# `pattern=` — confirm against the pinned FastAPI version before changing.
def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                   eid: int = 0,
                   nocache: int = 0):
    """Enrichment preview with Redis cache (24h TTL) + slow-call telemetry.

    Pass ?nocache=1 to bypass cache.

    Args:
        kind: One of ``klub``, ``savez``, ``sportas`` (enforced by the path
            regex).
        eid: Id of the row to inspect.
        nocache: Truthy to skip the cache lookup; the fresh result is still
            written to the cache.

    Returns:
        The preview dict: coverage statistics, research links, the gathered
        ``sources``, and ``current`` vs ``proposed`` values for every
        proposed field. Nothing is written to the database here.
    """
    _t0 = time.time()
    _cache_key = f"enrich:v1:{kind}:{eid}"
    if not nocache:
        _cached = _cache_get(_cache_key)
        if _cached is not None:
            _cached['_cache'] = 'hit'
            _cached['_cache_ttl_s'] = ENRICH_CACHE_TTL
            # Telemetry is best effort and must never fail the request.
            try: _enrich_slow_log(kind, eid, time.time() - _t0, cached=True)
            except Exception: pass
            return _cached

    row = _load_row(kind, eid)
    if   kind == 'klub':    res = _propose_for_klub(row)
    elif kind == 'savez':   res = _propose_for_savez(row)
    else:                   res = _propose_for_sportas(row)

    # Fields that count towards the coverage percentage for each kind.
    if kind == 'klub':
        keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon',
                'sjediste','godina_osnutka','ciljevi','opis_djelatnosti']
    elif kind == 'savez':
        keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka']
    else:
        keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija',
                'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg',
                'dominantna_noga','oib']

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    # First non-empty URL-ish column; used only for the live title snippet.
    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url')

    # Coverage = share of `keys` with a truthy value, as a whole percent.
    filled  = sum(1 for k in keys if row.get(k))
    coverage = round(filled / len(keys) * 100)
    missing  = [k for k in keys if not row.get(k)]

    proposed = res['proposed']
    # Current values of exactly the fields that have a proposal.
    current  = {k: row.get(k) for k in proposed.keys()}
    meta = row.get('metadata') or {}
    if not isinstance(meta, dict): meta = {}

    _result = {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row),
        'sport': row.get('sport'),
        # Immediately-invoked lambda: binds the federation entry once and
        # yields None values when there is no entry for this sport.
        'sport_federation': (lambda f: {
            'national': (f.get('national') or {}).get('name') if f else None,
            'national_url': (f.get('national') or {}).get('url') if f else None,
            'pgz': (f.get('pgz') or {}).get('name') if f else None,
        })(_sport_fed(row.get('sport'))),
        'sources': res['sources'],
        'current':  current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
        '_cache': 'miss',
        '_cache_ttl_s': ENRICH_CACHE_TTL,
    }
    _duration = time.time() - _t0
    # Cache write and slow-call logging are both best effort.
    try:
        _cache_set(_cache_key, _result, ttl=ENRICH_CACHE_TTL)
    except Exception:
        pass
    try:
        # Uncached calls are logged only when they reach the slow threshold.
        if _duration >= ENRICH_SLOW_LOG_THRESHOLD:
            _enrich_slow_log(kind, eid, _duration, cached=False)
    except Exception:
        pass
    return _result


# kind → (fully-qualified table name, set of columns enrichment may write).
# _apply_to_db silently skips any submitted field that is not in the set.
_TABLE_MAP = {
    'klub':    ('pgz_sport.klubovi',
                {'web','email','telefon','predsjednik','tajnik',
                 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
    'savez':   ('pgz_sport.savezi',
                {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
    'sportas': ('pgz_sport.clanovi',
                {'biografija','profile_url','source_url','slika_url','hns_igrac_id',
                 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
                 'tezina_kg','dominantna_noga','oib'}),
}


def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Apply proposed enrichment fields to one row and record the attempt.

    A proposed field is written only when its name is whitelisted for `kind`
    in ``_TABLE_MAP``, its value is non-empty, and the row's current value for
    that column is falsy (existing data is never overwritten). The same UPDATE
    also rewrites ``metadata`` with ``enriched_at``, ``enrichment_source`` and
    the ten most recent ``enrichment_history`` entries. One row is inserted
    into ``pgz_sport.enrichment_log`` for every attempt.

    On a unique-constraint violation the UPDATE is rolled back, nothing is
    reported as applied, an ``enrichment_block`` marker is merged into the
    row's metadata on a best-effort basis, and the log row's ``error`` column
    carries the constraint name.

    Args:
        kind: one of the keys of ``_TABLE_MAP`` (klub | savez | sportas).
        eid: primary key of the target row.
        fields: mapping of column name -> proposed value.
        sources: list of dicts that may carry ``source`` and ``url`` keys;
            entries that are not dicts are ignored.
        user_email: recorded in the metadata history and the log row.

    Returns:
        ``{'applied': {...}, 'after': {...}}`` where ``after`` is a snapshot of
        a fixed set of columns from the row after the operation.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when the row is missing.
        Any other error is re-raised after the transaction is rolled back.
    """
    import logging

    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, allowed = _TABLE_MAP[kind]
    log = logging.getLogger("enrich")

    # `sources` may come straight from a request body: keep only dict entries
    # so a stray string/number cannot crash the `.get()` calls below.
    src_list = ([s for s in sources if isinstance(s, dict)]
                if isinstance(sources, (list, tuple)) else [])
    src_names = [s.get('source') for s in src_list if s.get('source')]
    src_urls = [s.get('url') for s in src_list if s.get('url')]

    # Use a manual transaction so SELECT ... FOR UPDATE actually holds the row
    # lock until we COMMIT. _db() returns autocommit=True connections, so we
    # flip it off locally for this function only.
    c = _db()
    c.autocommit = False
    error_msg = None  # populated on UniqueViolation, recorded in enrichment_log
    try:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
            before = cur.fetchone()
            if not before:
                raise HTTPException(404, kind + " not found")
            before = dict(before)

            sets, params, applied = [], [], {}
            for k, v in (fields or {}).items():
                if k not in allowed:
                    continue
                if v is None or str(v).strip() == '':
                    continue
                if before.get(k):
                    continue  # never overwrite existing
                sets.append(f"{k} = %s")  # k is whitelisted above, safe to interpolate
                params.append(v)
                applied[k] = v

            # Work on copies: mutating before['metadata'] in place would make
            # the "before" snapshot in the log identical to the "after" one.
            meta_raw = before.get('metadata')
            meta_in = dict(meta_raw) if isinstance(meta_raw, dict) else {}
            now_iso = datetime.now(timezone.utc).isoformat()
            meta_in['enriched_at'] = now_iso
            meta_in['enrichment_source'] = src_names
            prev_history = meta_in.get('enrichment_history')
            history = list(prev_history) if isinstance(prev_history, list) else []
            history.append({
                'at': now_iso,
                'fields': list(applied.keys()),
                'sources': src_names,
                'urls':    src_urls,
                'user':    user_email,
            })
            meta_in['enrichment_history'] = history[-10:]
            sets.append("metadata = %s::jsonb")
            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))

            params.append(eid)
            try:
                cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
                after = dict(cur.fetchone())
            except psycopg2.errors.UniqueViolation as _uve:
                # The UPDATE rolled back — pretend nothing was applied so the
                # log row, the API response, and the row state all agree.
                c.rollback()
                failed_fields = list(applied.keys())
                applied = {}                       # truth-in-reporting
                constraint = getattr(getattr(_uve, 'diag', None), 'constraint_name', None)
                error_msg = f"unique_violation: {constraint or 'unknown'}"
                log.warning(
                    "UniqueViolation table=%s id=%s blocked_fields=%s constraint=%s",
                    table, eid, failed_fields, constraint)
                # Fetch current row state for the audit log.
                cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
                row = cur.fetchone()
                after = dict(row) if row else {}
                # Park a do-not-retry marker on the row so workers stop hammering it.
                # The SAVEPOINT keeps a failed marker write from aborting the
                # transaction, which would otherwise make the log INSERT fail.
                try:
                    block_meta = {
                        'reason':     'unique_violation',
                        'constraint': constraint,
                        'fields':     failed_fields,
                        'at':         now_iso,
                    }
                    cur.execute("SAVEPOINT enrich_block_marker")
                    cur.execute(
                        f"UPDATE {table} "
                        f"SET metadata = COALESCE(metadata,'{{}}'::jsonb) || %s::jsonb "
                        f"WHERE id=%s",
                        (json.dumps({'enrichment_block': block_meta}, default=str), eid))
                except Exception as _be:
                    # Marker is best-effort; never let it block the apply path.
                    log.warning("enrichment_block write failed table=%s id=%s: %s", table, eid, _be)
                    try:
                        cur.execute("ROLLBACK TO SAVEPOINT enrich_block_marker")
                    except Exception:
                        # Nothing more to recover here; if the transaction is
                        # unusable the log INSERT below raises and is handled.
                        pass

            # Always log — on success error_msg is NULL, on UniqueViolation it
            # carries the constraint name and applied is empty.
            log_keys = list(applied.keys()) + ['metadata']
            cur.execute(
                """INSERT INTO pgz_sport.enrichment_log
                     (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email, error)
                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s,%s)""",
                (kind, eid,
                 ','.join(str(n) for n in src_names)[:120] if src_names else None,
                 (src_list[0].get('url') if src_list else None),
                 list(applied.keys()) or None,
                 json.dumps({k: before.get(k) for k in log_keys},
                            ensure_ascii=False, default=str),
                 json.dumps({k: after.get(k) for k in log_keys},
                            ensure_ascii=False, default=str),
                 user_email,
                 error_msg))
        c.commit()
    except Exception:
        # Covers HTTPException too: undo everything, then let the caller see it.
        try:
            c.rollback()
        except Exception:
            pass
        raise
    finally:
        try:
            c.close()
        except Exception:
            pass

    snap_keys = ('id','naziv','ime','prezime','web','email','telefon',
                 'opis_djelatnosti','biografija','metadata')
    return {'applied': applied,
            'after':  {k: after.get(k) for k in snap_keys if k in after}}


@router.post("/enrich/{kind:str}/{eid:int}/apply")
def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                 eid: int = 0,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None),
                 x_user_id:    Optional[int] = Header(default=None)):
    """Apply enrichment to one row.

    With no ``fields`` in the body, the proposal is recomputed for the row and
    every proposed field is handed to ``_apply_to_db``. With ``fields`` given,
    only those are handed over (``_apply_to_db`` still enforces the whitelist
    and never overwrites existing values).

    Raises:
        HTTPException: 400 when ``fields`` is present but not an object, or
        when ``sources`` accompanies ``fields`` and is not a list.
    """
    body = body or {}
    fields  = body.get('fields')
    sources = body.get('sources')
    # Reject malformed payloads up front instead of failing later with a 500.
    if fields and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if not fields:
        row = _load_row(kind, eid)
        if   kind == 'klub':    res = _propose_for_klub(row)
        elif kind == 'savez':   res = _propose_for_savez(row)
        else:                   res = _propose_for_sportas(row)
        fields  = res['proposed']
        sources = res['sources']
    elif isinstance(sources, list):
        # Caller-supplied sources: entries that are not objects carry no
        # 'source'/'url' and would break the dict access downstream.
        sources = [s for s in sources if isinstance(s, dict)]
    elif sources:
        raise HTTPException(400, "sources must be a list of objects")
    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    applied = out.get('applied') or {}
    # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
    try:
        from audit_seal_router import audit_log as _audit_log
        if applied:
            _audit_log(
                action='enrich.apply',
                target_type=kind,
                target_id=eid,
                payload={'applied': applied,
                         'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]},
                user_id=x_user_id,
                user_email=x_user_email,
            )
    except Exception:
        # Audit trail is best-effort; the apply itself already succeeded.
        pass
    # Invalidate cache so next preview reflects the new DB state
    try:
        _cache_delete(f"enrich:v1:{kind}:{eid}")
    except Exception:
        pass
    return {
        'status':         'success' if applied else 'no_changes',
        'kind':           kind,
        'id':             eid,
        'applied_count':  len(applied),
        'applied_fields': list(applied.keys()),
        **out,
    }


@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """Return the most recent enrichment-log rows, newest first.

    Args:
        kind: optional filter on the ``kind`` column.
        target_id: optional filter on the ``target_id`` column (a falsy value
            means "no filter").
        limit: number of rows, clamped to 1..200; a falsy value means 50.

    Returns:
        ``{'count': n, 'rows': [...]}`` with ``created_at`` as ISO-8601 text.
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    # Lower bound matters: PostgreSQL rejects a negative LIMIT with an error.
    params.append(max(1, min(int(limit or 50), 200)))
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(sql, params)
        rows = [dict(r) for r in cur.fetchall()]
    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}


# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete lookup used by the Mreža search inputs.

    `type` selects which sources are queried: 'club' (klubovi + savezi),
    'person' (clanovi + civic.persons), 'company' (civic.entities); an empty
    string queries all of them. Queries shorter than two characters return no
    results. Each source contributes at most `limit` rows (clamped to 1..50)
    and the combined list is cut to `limit * 2` entries.

    Returns: {query, results: [{id, label, type, sub}]}
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))
    pattern = '%' + q + '%'
    everything = (type == '')
    found = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        if everything or type == 'club':
            cur.execute("""
              SELECT id, naziv AS label, sport, grad
              FROM pgz_sport.klubovi
              WHERE naziv ILIKE %s AND aktivan=TRUE
              ORDER BY length(naziv), naziv LIMIT %s
            """, (pattern, limit))
            found.extend(
                {'id': 'klub:' + str(rec['id']),
                 'label': rec['label'],
                 'type': 'club',
                 'sub': (rec.get('sport') or '') + ' · ' + (rec.get('grad') or '')}
                for rec in cur.fetchall())
            cur.execute("""
              SELECT id, naziv AS label, sport
              FROM pgz_sport.savezi
              WHERE naziv ILIKE %s AND aktivan=TRUE
              ORDER BY length(naziv), naziv LIMIT %s
            """, (pattern, limit))
            found.extend(
                {'id': 'savez:' + str(rec['id']),
                 'label': rec['label'],
                 'type': 'savez',
                 'sub': rec.get('sport') or 'savez'}
                for rec in cur.fetchall())
        if everything or type == 'person':
            cur.execute("""
              SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
              FROM pgz_sport.clanovi c
              LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
              WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
              ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
              LIMIT %s
            """, (pattern, limit))
            for rec in cur.fetchall():
                klub = rec.get('klub_naziv')
                # Club name is appended only when the athlete has one.
                klub_part = (' · ' + klub) if klub else ''
                found.append({
                    'id': 'sportas:' + str(rec['id']),
                    'label': (rec.get('ime') or '') + ' ' + (rec.get('prezime') or ''),
                    'type': 'person',
                    'sub': (rec.get('sport') or 'sportaš') + klub_part,
                })
            cur.execute("""
              SELECT id, name AS label, function, oib, county
              FROM civic.persons
              WHERE name ILIKE %s
              ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """, (pattern, limit))
            found.extend(
                {'id': 'civic_person:' + str(rec['id']),
                 'label': rec['label'],
                 'type': 'person',
                 'sub': (rec.get('function') or 'civic') + ' · ' + (rec.get('county') or '')}
                for rec in cur.fetchall())
        if everything or type == 'company':
            cur.execute("""
              SELECT id, name AS label, oib, city, entity_type
              FROM civic.entities
              WHERE name ILIKE %s
              ORDER BY length(name) LIMIT %s
            """, (pattern, limit))
            found.extend(
                {'id': 'civic_entity:' + str(rec['id']),
                 'label': rec['label'],
                 'type': 'company',
                 'sub': (rec.get('entity_type') or 'tvrtka') + ' · ' + (rec.get('city') or '')}
                for rec in cur.fetchall())
    return {'query': q, 'results': found[:limit * 2]}


# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
def _forensic_person_candidates(finding: dict) -> list:
    """Collect person-name candidates from a finding's entities_involved, else its title."""
    names = []
    involved = finding.get('entities_involved')
    if isinstance(involved, dict):
        # Single-value keys first, then the persons/osobe collection.
        names.extend(str(involved[key])
                     for key in ('person','name','osoba','PEP','pep')
                     if involved.get(key))
        for person in (involved.get('persons') or involved.get('osobe') or []):
            if isinstance(person, dict) and person.get('name'):
                names.append(person['name'])
            elif isinstance(person, str):
                names.append(person)
    elif isinstance(involved, list):
        for item in involved:
            if isinstance(item, dict):
                # Only the first populated key of each item is used.
                first = next((str(item[key]) for key in ('name','person','label')
                              if item.get(key)), None)
                if first is not None:
                    names.append(first)
            elif isinstance(item, str):
                names.append(item)
    if not names and finding.get('title'):
        # Heuristic: extract first capitalised "Ime Prezime" pair
        hit = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})',
                        finding['title'])
        if hit:
            names.append(hit.group(0))
    return names


@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Enrich one forensic finding with a Wikipedia summary.

    Loads the finding, derives person-name candidates from entities_involved
    (falling back to the title), asks _wiki_summary for the first three
    candidates until one yields a result, then stores the payload under
    raw_data.enrichment and fills ai_analysis with the extract when
    ai_analysis is still NULL. Responds 404 when the finding does not exist.
    """
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
          SELECT id, finding_type, severity, title, description, entities_involved,
                 raw_data, ai_analysis
          FROM civic.forensic_findings WHERE id=%s
        """, (finding_id,))
        record = cur.fetchone()
        if not record:
            raise HTTPException(404, "finding not found")
        f = dict(record)

        candidates = _forensic_person_candidates(f)

        wiki = None
        used_query = None
        for name in candidates[:3]:
            wiki = _wiki_summary(name)
            if wiki:
                used_query = name
                break

        # Existing raw_data is kept; anything that is not an object is
        # preserved under '_legacy' before the enrichment key is attached.
        raw = f.get('raw_data')
        if raw is None:
            raw = {}
        elif not isinstance(raw, dict):
            raw = {'_legacy': raw}
        raw['enrichment'] = {
            'queried': candidates[:5],
            'used_query': used_query,
            'wiki': wiki,
            'enriched_at': datetime.now(timezone.utc).isoformat(),
        }

        extract = (wiki or {}).get('extract')
        cur.execute("""
          UPDATE civic.forensic_findings
          SET raw_data = %s::jsonb,
              ai_analysis = COALESCE(ai_analysis, %s)
          WHERE id = %s
        """, (json.dumps(raw, default=str, ensure_ascii=False),
              extract,
              finding_id))
        c.commit()

    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }


# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """Search civic.persons by name and attach entity links, findings and a risk score.

    For each matched person (max 25): entity links and findings are looked up
    by OIB when one is present; if no findings were attached, findings are
    searched by the person's name in title/description. The per-person score
    adds 30 for a function, 15 for a party, 5 per link (max 40), 10 per
    finding (max 40) and 20 per CRITICAL/HIGH finding, capped at 100.

    Raises HTTPException 400 when the name is shorter than three characters.
    """
    name = (req.get('name') or '').strip()
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
          SELECT id, name, function, party, county, city, oib, trust_tier
          FROM civic.persons
          WHERE upper(name) ILIKE upper(%s)
          ORDER BY oib NULLS LAST, id LIMIT 25
        """, ('%' + name + '%',))
        persons = [dict(rec) for rec in cur.fetchall()]
        for person in persons:
            person['links'] = []
            person['findings'] = []
            if person.get('oib'):
                cur.execute("""
                  SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                         e.entity_type, e.city, e.risk_score
                  FROM civic.person_entity_links pel
                  LEFT JOIN civic.entities e ON e.id = pel.entity_id
                  WHERE pel.person_oib = %s LIMIT 50
                """, (person['oib'],))
                person['links'] = [dict(rec) for rec in cur.fetchall()]
                cur.execute("""
                  SELECT id, finding_type, severity, title, severity_score, created_at
                  FROM civic.forensic_findings
                  WHERE entities_involved::text ILIKE %s
                  ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, ('%' + person['oib'] + '%',))
                person['findings'] = [dict(rec) for rec in cur.fetchall()]
            if not person['findings']:
                # Fallback: match the person's name in the finding text.
                name_like = '%' + person['name'] + '%'
                cur.execute("""
                  SELECT id, finding_type, severity, title, severity_score, created_at
                  FROM civic.forensic_findings
                  WHERE title ILIKE %s OR description ILIKE %s
                  ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (name_like, name_like))
                person['findings'] = [dict(rec) for rec in cur.fetchall()]

    severe_levels = ('CRITICAL', 'HIGH')
    total_links = 0
    total_findings = 0
    crit_findings = 0
    for person in persons:
        links = person.get('links') or []
        findings = person.get('findings') or []
        severe = sum(1 for item in findings if item.get('severity') in severe_levels)

        total_links += len(links)
        total_findings += len(findings)
        crit_findings += severe

        points = 0
        if (person.get('function') or '').strip():
            points += 30
        if (person.get('party') or '').strip():
            points += 15
        points += min(40, len(links) * 5)
        points += min(40, len(findings) * 10)
        points += 20 * severe
        person['risk_score'] = min(100, points)

    overall = max((person.get('risk_score', 0) for person in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall, 'total_links': total_links,
            'total_findings': total_findings, 'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}


# ─── SB-3 — Bulk enrichment ─────────────────────────────────────────────
_BULK_KEY_MAP = {
    'klub':    ('pgz_sport.klubovi',
                ('oib','sport','grad','predsjednik','tajnik','web','email','telefon',
                 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')),
    'savez':   ('pgz_sport.savezi',
                ('oib','sport','predsjednik','tajnik','email','telefon','web',
                 'adresa','godina_osnutka')),
    'sportas': ('pgz_sport.clanovi',
                ('sport','profile_url','slika_url','hns_igrac_id','biografija',
                 'datum_rodenja','mjesto_rodenja','broj_dresa')),
}


def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str:
    parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)"
             for k in keys]
    return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})"


def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]:
    """Return up to `limit` random row ids of `kind` whose coverage is below `coverage_max`.

    Coverage is computed in SQL over the columns listed in _BULK_KEY_MAP.
    For 'klub' and 'sportas' only rows with aktivan = TRUE are considered.
    Raises HTTPException 400 for an unknown kind.
    """
    entry = _BULK_KEY_MAP.get(kind)
    if entry is None:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, keys = entry
    cov = _coverage_sql('', keys)
    extra_where = "AND aktivan = TRUE" if kind in ('klub', 'sportas') else ''
    sql = (f"SELECT id FROM {table} WHERE 1=1 {extra_where} "
           f"AND {cov} < %s ORDER BY random() LIMIT %s")
    with _db() as c, c.cursor() as cur:
        cur.execute(sql, (coverage_max, limit))
        picked = cur.fetchall()
        return [rec[0] for rec in picked]


@router.post("/enrich/bulk")
def enrich_bulk(body: dict = Body(default=None),
                x_user_email: Optional[str] = Header(default=None),
                x_user_id:    Optional[int] = Header(default=None)):
    """Run preview+apply over N random under-enriched rows of one kind.

    Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70}
    `limit` is clamped to 1..200 and `coverage_max` to 0..100; a falsy value
    selects the default (50 / 70). Per-row failures are collected in `items`
    as {'id', 'error'} and do not stop the run.
    Returns aggregate stats. Synchronous (use polling, not SSE).

    Raises HTTPException 400 for an unknown kind or non-integer limits.
    """
    body = body or {}
    kind = str(body.get('kind') or '').strip().lower()
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    try:
        limit         = max(1, min(int(body.get('limit') or 50), 200))
        coverage_max  = max(0, min(int(body.get('coverage_max') or 70), 100))
    except (TypeError, ValueError):
        # Bad client input is a 400, not an unhandled 500.
        raise HTTPException(400, "limit and coverage_max must be integers")

    ids = _bulk_pick(kind, limit, coverage_max)
    items: list[dict] = []
    fields_total = 0
    started = time.time()

    for eid in ids:
        try:
            row = _load_row(kind, eid)
            if   kind == 'klub':    res = _propose_for_klub(row)
            elif kind == 'savez':   res = _propose_for_savez(row)
            else:                   res = _propose_for_sportas(row)
            proposed = res.get('proposed') or {}
            srcs     = res.get('sources')  or []
            if not proposed:
                items.append({'id': eid, 'applied': 0, 'fields': []})
                continue
            out = _apply_to_db(kind, eid, proposed, srcs, x_user_email)
            # The row changed: drop the cached preview, using the same key
            # enrich_apply invalidates, so the next preview is not stale.
            try:
                _cache_delete(f"enrich:v1:{kind}:{eid}")
            except Exception:
                pass
            applied = out.get('applied') or {}
            fields_total += len(applied)
            items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())})
            try:
                from audit_seal_router import audit_log as _audit_log
                if applied:
                    _audit_log(action='enrich.bulk.apply',
                               target_type=kind, target_id=eid,
                               payload={'applied': applied},
                               user_id=x_user_id, user_email=x_user_email)
            except Exception:
                # Audit trail is best-effort; the row is already updated.
                pass
        except HTTPException as e:
            items.append({'id': eid, 'error': e.detail})
        except Exception as e:
            items.append({'id': eid, 'error': f'{type(e).__name__}: {e}'})

    return {
        'status':       'success',
        'kind':         kind,
        'requested':    limit,
        'processed':    len(items),
        'fields_total': fields_total,
        'elapsed_s':    round(time.time() - started, 1),
        'items':        items,
    }


# ─── SB-4 — Worker status / control ─────────────────────────────────────
_REDIS_KEYS = {
    'heartbeat':   'cc:pgz-enricher:heartbeat',
    'pause':       'cc:pgz-enricher:pause',
    'run_now':     'cc:pgz-enricher:run_now',
    'last_cycle':  'cc:pgz-enricher:last_cycle',
    'confidence':  'cc:pgz-enricher:confidence',
    'fields_24h':  'cc:pgz-enricher:fields_24h',
}


def _redis_client():
    try:
        import redis
    except Exception:
        return None
    host = os.environ.get('REDIS_HOST', 'localhost')
    port = int(os.environ.get('REDIS_PORT', '6379'))
    pwd  = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None
    # Try with password first (prod); fall back to anonymous (dev box) on AUTH failure.
    for p in (pwd, None):
        try:
            r = redis.Redis(host=host, port=port, password=p,
                            decode_responses=True, socket_connect_timeout=2)
            r.ping()
            return r
        except Exception:
            continue
    return None


@router.get("/enrich/worker/status")
def enrich_worker_status():
    """Report the enrichment worker's state from Redis plus recent log rows.

    Returns ``{'available': False}`` when no Redis client can be obtained.
    Otherwise the response carries heartbeat, pause/run-now flags, last cycle,
    confidence threshold (0.7 when unset) and the 24h field counter; a failure
    while reading Redis is reported under ``error``. ``recent`` holds the 25
    newest enrichment_log rows, or an empty list when that query fails.
    """
    r = _redis_client()
    out = {'available': bool(r)}
    if not r:
        return out
    try:
        hb = r.get(_REDIS_KEYS['heartbeat'])
        out['heartbeat']        = int(hb) if hb else None
        out['heartbeat_age_s']  = (int(time.time()) - int(hb)) if hb else None
        out['paused']           = (r.get(_REDIS_KEYS['pause']) or '0') == '1'
        out['run_now_pending']  = (r.get(_REDIS_KEYS['run_now']) or '0') == '1'
        last = r.get(_REDIS_KEYS['last_cycle'])
        if last:
            # Value is expected to be JSON; fall back to the raw string.
            # `except Exception` (not a bare except) so KeyboardInterrupt /
            # SystemExit are no longer swallowed here.
            try:
                out['last_cycle'] = json.loads(last)
            except Exception:
                out['last_cycle'] = last
        conf = r.get(_REDIS_KEYS['confidence'])
        out['confidence_threshold'] = float(conf) if conf else 0.7
        f24 = r.get(_REDIS_KEYS['fields_24h'])
        out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0
    except Exception as e:
        out['error'] = f'{type(e).__name__}: {e}'
    # Recent enrichment_log rows for live activity
    try:
        with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at
                           FROM pgz_sport.enrichment_log
                           ORDER BY id DESC LIMIT 25""")
            rows = []
            for rr in cur.fetchall():
                rr = dict(rr)
                if rr.get('created_at'):
                    rr['created_at'] = rr['created_at'].isoformat()
                rows.append(rr)
            out['recent'] = rows
    except Exception:
        # Status stays useful without the activity feed.
        out['recent'] = []
    return out


@router.post("/enrich/worker/pause")
def enrich_worker_pause(body: dict = Body(default=None)):
    """Set or clear the worker's pause flag in Redis.

    Body: {paused: bool}; a missing key (or missing body) means pause.
    Responds 503 when Redis is unavailable.
    """
    payload = body or {}
    pause = bool(payload.get('paused', True))
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    flag = '1' if pause else '0'
    client.set(_REDIS_KEYS['pause'], flag)
    return {'status': 'success', 'paused': pause}


@router.post("/enrich/worker/run-now")
def enrich_worker_run_now():
    """Raise the run-now flag in Redis; responds 503 when Redis is unavailable."""
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    client.set(_REDIS_KEYS['run_now'], '1')
    return {'status': 'success', 'queued': True}


@router.post("/enrich/worker/confidence")
def enrich_worker_confidence(body: dict = Body(...)):
    """Store the worker's confidence threshold in Redis.

    Body: {value: number in 0..1}. Responds 400 when the value is not a
    number or lies outside the range, 503 when Redis is unavailable.
    """
    raw_value = body.get('value')
    try:
        v = float(raw_value)
    except Exception:
        raise HTTPException(400, 'value must be number 0..1')
    in_range = 0.0 <= v <= 1.0  # also False for NaN
    if not in_range:
        raise HTTPException(400, 'value out of range 0..1')
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')
    client.set(_REDIS_KEYS['confidence'], str(v))
    return {'status': 'success', 'confidence_threshold': v}