diff --git a/_backups/sprint_1777940323/app.html b/_backups/sprint_1777940323/app.html new file mode 100644 index 0000000..c94f455 --- /dev/null +++ b/_backups/sprint_1777940323/app.html @@ -0,0 +1,1918 @@ + + + + + +PGŽ SPORT — Operativna aplikacija + + + + + + + + + + + +
+ + +
+
+
+
Dashboard
+
Pregled stanja
+
+
+
+
+
DR
+
+
Damir Radulićpgz admin
+
Primorsko-goranska županija
+
+
+
+
+ +
+
Učitavanje...
+
+
+
+ + +
+ + + + + + + diff --git a/_backups/sprint_1777940323/enrich_router.py b/_backups/sprint_1777940323/enrich_router.py new file mode 100644 index 0000000..ccc0e7c --- /dev/null +++ b/_backups/sprint_1777940323/enrich_router.py @@ -0,0 +1,1690 @@ +""" +enrich_router.py — v3 enrichment + forensic scan +Author: dradulic@outlook.com / damir@rinet.one +Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3) + +POST /v2/enrich/{kind}/{eid} + Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search, + primary club URL if any), regex-extract candidate fields (web/email/ + telefon), optionally synthesise descriptions via DeepSeek, and return + a *preview* shape with `proposed` updates the operator can apply. + +POST /v2/enrich/{kind}/{eid}/apply + Body shapes: + None / {} → re-run preview, apply every proposed field + {"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced) + Performs UPDATE on the matching table, sets metadata.enriched_at and + metadata.enrichment_source, writes a row to pgz_sport.enrichment_log, + returns the after snapshot. + +GET /v2/enrich/log?kind=&target_id=&limit= + Read recent enrichment-log entries. + +POST /v2/forensic/scan + Search civic.persons by name, return entity links + findings + risk score. + +Kinds: klub | savez | sportas +""" +from __future__ import annotations +import os, re, json, time, html, urllib.parse, urllib.request +from datetime import datetime, timezone +from typing import Any, Optional + +import psycopg2, psycopg2.extras +from fastapi import APIRouter, HTTPException, Header, Body + +router = APIRouter() + +_pgh = os.environ.get('PG_HOST', '10.10.0.2') +_pgp = int(os.environ.get('PG_PORT', '6432')) +if _pgh in ('localhost', '127.0.0.1'): + _pgh = os.environ.get('DB_HOST', '10.10.0.2') + _pgp = int(os.environ.get('DB_PORT', '6432')) +DB = dict(host=_pgh, port=_pgp, + dbname=os.environ.get('PG_DB', 'rinet_v3'), + user=os.environ.get('PG_USER', 'rinet'), + password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7')) + +UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)' +TIMEOUT = 6 # seconds — fail-soft + +# Optional JS-aware fallback (Playwright). Lazy-loaded, never required. +import sys as _sys +_sys.path.insert(0, '/opt/pgz-sport') +try: + from enrichment import playwright_scraper as _pw_scraper + _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT +except Exception: + _pw_scraper = None + _HAS_PW = False + +DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip() +DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL', + 'https://api.deepseek.com/v1/chat/completions') + + +# ─── DB helpers ────────────────────────────────────────────────────────── +def _db(): + c = psycopg2.connect(**DB); c.autocommit = True; return c + +def _fetch_one(sql, p): + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(sql, p); r = cur.fetchone() + return dict(r) if r else None + + +# ─── HTTP helpers ──────────────────────────────────────────────────────── +def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]: + if not url: return None + if not url.startswith('http'): return None + try: + req = urllib.request.Request(url, headers={ + 'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'}) + with urllib.request.urlopen(req, timeout=timeout) as r: + data = r.read(150000) + try: return data.decode('utf-8') + except: return data.decode('latin-1', 'ignore') + except Exception: + return None + + +def _strip_tags(s: str) -> str: + if not s: return '' + s = re.sub(r']*>.*?', ' ', s, flags=re.S | re.I) + s = re.sub(r']*>.*?', ' ', s, flags=re.S | re.I) + s = re.sub(r'<[^>]+>', ' ', s) + s = html.unescape(s) + s = re.sub(r'\s+', ' ', s).strip() + return s + + +def _extract_meta(html_doc: str, url: str) -> dict: + if not html_doc: return {} + out = {'url': url, 'fetched_at': int(time.time())} + m = re.search(r']*>([^<]+)', html_doc, re.I) + if m: out['title'] = html.unescape(m.group(1).strip())[:300] + m = re.search(r')\]]+', re.I) + +def _find_email(text: str) -> Optional[str]: + if not text: return None + bad = ('@example.', '@test.', '@email.', 'wixpress.com', + 'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@') + seen = set() + for m in RE_EMAIL.finditer(text): + e = m.group(0).lower().rstrip('.,;:)') + if any(b in e for b in bad): continue + if e in seen: continue + seen.add(e); return e + return None + +def _find_phone(text: str) -> Optional[str]: + if not text: return None + for m in RE_PHONE.finditer(text): + raw = m.group(0).strip() + digits = re.sub(r'\D', '', raw) + if not (8 <= len(digits) <= 13): continue + cleaned = re.sub(r'\s+', ' ', raw).strip() + if raw.startswith('+385'): return '+385 ' + raw[4:].lstrip().lstrip('-/') + if raw.startswith('00385'): return '+385 ' + raw[5:].lstrip().lstrip('-/') + return cleaned + return None + +def _find_official_web(text: str, hint: str = '') -> Optional[str]: + if not text: return None + blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com', + 'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia', + 'sportilus.com', 'transfermarkt.com', 'wikidata.org', + 'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com', + 'rinet.one', 'pgz.hr') + candidates: list[str] = [] + for m in RE_URL.finditer(text): + u = m.group(0).rstrip('.,;:)\'"') + try: + host = urllib.parse.urlparse(u).hostname or '' + except Exception: + continue + if not host or any(b in host for b in blocked): continue + candidates.append(u) + if not candidates: return None + if hint: + slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8] + for u in candidates: + host = urllib.parse.urlparse(u).hostname or '' + if slug and slug in host.replace('-', '').replace('.', ''): + return u + return candidates[0] + + +# ─── External sources ──────────────────────────────────────────────────── +def _wiki_variants(query: str) -> list[str]: + """Generate sensible Wikipedia HR title variants for a query. + + The summary REST API is title-exact; clubs are often listed under their + abbreviation (KK X, NK X, RK X, OK X), so we try those variants too. + """ + if not query: return [] + out, seen = [], set() + raw = query.strip() + def _push(v): + if v and v not in seen: seen.add(v); out.append(v) + _push(raw) + # KK Kvarner 2010 from Košarkaški klub KVARNER 2010 + parts = raw.split() + sport_to_abbr = { + 'košarkaški': 'KK', 'kosarkaski': 'KK', + 'nogometni': 'NK', 'rukometni': 'RK', + 'odbojkaški': 'OK', 'odbojkaski': 'OK', + 'vaterpolski':'VK', 'plivacki': 'PK', 'plivački': 'PK', + 'boćarski': 'BK', 'bocarski': 'BK', + } + if len(parts) >= 3 and parts[0].lower() in sport_to_abbr and parts[1].lower() == 'klub': + _push(sport_to_abbr[parts[0].lower()] + ' ' + ' '.join(p.capitalize() if p.isupper() else p for p in parts[2:])) + return out + +def _wiki_summary(query: str) -> Optional[dict]: + for variant in _wiki_variants(query): + title = urllib.parse.quote(variant.replace(' ', '_'), safe='') + body = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{title}', timeout=5) + if not body: continue + try: + d = json.loads(body) + except Exception: + continue + if d.get('type') in ('disambiguation', 'no-extract'): continue + if not d.get('extract'): continue + return { + 'source': 'wikipedia.hr', + 'url': d.get('content_urls', {}).get('desktop', {}).get('page'), + 'title': d.get('title'), + 'extract': d.get('extract'), + 'description': d.get('description'), + 'matched_variant': variant, + } + return None + + +def _sport_pgz_search(query: str) -> Optional[dict]: + if not query: return None + page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6) + if not page: + # Plain HTTP failed → try JS-rendered fallback if available. + if _HAS_PW and _pw_scraper is not None: + return _pw_scraper.scrape_sport_pgz_klub(query) + return None + m = re.search(r']*>.*?]*rel=["\']bookmark["\'][^>]*>([^<]+)', + page, re.S | re.I) + if not m: + m = re.search(r']*>([^<]{6,180})', page, re.I) + if not m: + # Search page rendered but yielded nothing parseable — try JS fallback. + if _HAS_PW and _pw_scraper is not None: + return _pw_scraper.scrape_sport_pgz_klub(query) + return None + hit = m.group(1) + body = _http_get(hit, timeout=6) + if not body: + return {'source': 'sport-pgz.hr', 'url': hit, 'title': html.unescape(m.group(2).strip())} + text = _strip_tags(body)[:4000] + meta = _extract_meta(body, hit) + return { + 'source': 'sport-pgz.hr', + 'url': hit, + 'title': meta.get('title') or html.unescape(m.group(2).strip()), + 'extract': meta.get('description') or text[:500], + 'raw_text': text, + } + + +def _fetch_primary_site(url: str) -> Optional[dict]: + body = _http_get(url, timeout=6) + if not body: return None + text = _strip_tags(body) + meta = _extract_meta(body, url) + return { + 'source': urllib.parse.urlparse(url).hostname or url, + 'url': url, + 'title': meta.get('title'), + 'extract': meta.get('description') or text[:500], + 'raw_text': text[:8000], + } + + +# ─── DeepSeek (optional, fail-soft) ───────────────────────────────────── +def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]: + if not DEEPSEEK_KEY or not evidence: return None + joined = "\n---\n".join(e for e in evidence if e)[:6000] + if not joined.strip(): return None + prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za " + f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. " + f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}") + payload = { + "model": "deepseek-chat", + "messages": [ + {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."}, + {"role": "user", "content": prompt}, + ], + "max_tokens": 280, "temperature": 0.3, + } + req = urllib.request.Request( + DEEPSEEK_URL, data=json.dumps(payload).encode('utf-8'), + headers={'Authorization': 'Bearer ' + DEEPSEEK_KEY, + 'Content-Type': 'application/json', + 'User-Agent': UA}, method='POST') + try: + with urllib.request.urlopen(req, timeout=20) as r: + d = json.loads(r.read().decode('utf-8')) + text = d.get('choices', [{}])[0].get('message', {}).get('content', '').strip() + return text or None + except Exception: + return None + + +# ─── Row loaders & display name ───────────────────────────────────────── +def _load_row(kind: str, eid: int) -> dict: + if kind == 'klub': + row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik, + web, web_stranica, email, telefon, ciljevi, opis_djelatnosti, + sjediste, godina_osnutka, savez_id, scrape_url, source_url, + metadata + FROM pgz_sport.klubovi WHERE id=%s""", (eid,)) + elif kind == 'savez': + row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web, + adresa, godina_osnutka, source_url, metadata + FROM pgz_sport.savezi WHERE id=%s""", (eid,)) + elif kind == 'sportas': + row = _fetch_one("""SELECT c.id, c.ime, c.prezime, c.sport, c.klub_id, c.profile_url, + c.slika_url, c.source_url, c.source, c.source_id, + c.hns_igrac_id, c.biografija, + c.datum_rodenja, c.mjesto_rodenja, c.broj_dresa, + c.visina_cm, c.tezina_kg, c.dominantna_noga, c.oib, + c.vanjski_id, c.metadata, + k.sport AS klub_sport, k.naziv AS klub_naziv + FROM pgz_sport.clanovi c + LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id + WHERE c.id=%s""", (eid,)) + # Fall back to klub.sport when c.sport is empty + if row and not row.get('sport') and row.get('klub_sport'): + row['sport'] = row['klub_sport'] + else: + raise HTTPException(400, "kind must be klub|savez|sportas") + if not row: + raise HTTPException(404, kind + " not found") + return row + + +def _display_name(kind: str, row: dict) -> str: + if kind == 'sportas': + return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip() + return row.get('naziv', '') or '' + + +# ─── Sport federations map (loaded once, refresh on file mtime) ───────── +_SPORT_FED_PATH = '/opt/pgz-sport/data/sport_federations.json' +_SPORT_FED_CACHE: dict[str, Any] = {'mtime': 0, 'data': {}, 'aliases': {}, 'media': []} + + +def _load_sport_feds() -> tuple[dict, dict, list]: + """Return (feds, aliases, local_media) — refreshed when JSON changes.""" + try: + st = os.stat(_SPORT_FED_PATH) + except FileNotFoundError: + return ({}, {}, []) + if st.st_mtime != _SPORT_FED_CACHE['mtime']: + try: + with open(_SPORT_FED_PATH, 'r', encoding='utf-8') as f: + raw = json.load(f) + except Exception: + return (_SPORT_FED_CACHE['data'], + _SPORT_FED_CACHE['aliases'], + _SPORT_FED_CACHE['media']) + aliases = raw.pop('_aliases', {}) if isinstance(raw, dict) else {} + media = raw.pop('_local_media_pgz', []) if isinstance(raw, dict) else [] + raw.pop('_meta', None) + _SPORT_FED_CACHE.update(mtime=st.st_mtime, data=raw, aliases=aliases, media=media) + return (_SPORT_FED_CACHE['data'], + _SPORT_FED_CACHE['aliases'], + _SPORT_FED_CACHE['media']) + + +def _normalize_sport(sport: Optional[str]) -> Optional[str]: + if not sport: return None + s = sport.strip().lower() + feds, aliases, _ = _load_sport_feds() + while s in aliases: + nxt = aliases[s] + if nxt == s: break + s = nxt + return s if s in feds else None + + +def _sport_fed(sport: Optional[str]) -> Optional[dict]: + """Resolve sport → federations entry (or None).""" + norm = _normalize_sport(sport) + if not norm: return None + feds, _, _ = _load_sport_feds() + return feds.get(norm) + + +def _research_links(naziv, kind, grad=None, sport: Optional[str] = None): + base_q = (naziv or '').strip() + q = (base_q + ' ' + grad) if grad else base_q + qenc = urllib.parse.quote(q) + out = [ + {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc}, + {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc}, + {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc}, + ] + if kind == 'klub': + out.append({'label': 'Sportilus', 'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc}) + out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'}) + + # Sport-specific federation links (replace static HNS/transfermarkt for sportas) + fed = _sport_fed(sport) if sport else None + if kind == 'sportas': + if fed and isinstance(fed.get('national'), dict): + nat = fed['national'] + search = (nat.get('search_url') or nat.get('url') or '').replace('{q}', qenc) + if search: + out.append({'label': nat.get('name', 'Nacionalni savez'), + 'icon': '🏆', 'url': search}) + if fed and isinstance(fed.get('pgz'), dict): + pgz = fed['pgz'] + url = pgz.get('search_url') or pgz.get('url') or '' + if url: + out.append({'label': pgz.get('name', 'PGŽ savez'), + 'icon': '🏟', 'url': url.replace('{q}', qenc)}) + if not fed: + # No mapping for this sport → keep transfermarkt as legacy fallback + out.append({'label': 'HNS Semafor', 'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc}) + out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc}) + # Local PGŽ media for any sportas + _, _, media = _load_sport_feds() + for m in media: + url = (m.get('search_url') or '').replace('{q}', qenc) + if url: + out.append({'label': m.get('name', 'Lokalni medij'), + 'icon': '📰', 'url': url}) + if kind == 'savez': + out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'}) + return out + + +# ─── Proposal pipelines ───────────────────────────────────────────────── +def _name_tokens(naziv: str) -> list[str]: + """Significant tokens from entity name (≥4 chars, deaccented).""" + import unicodedata + s = unicodedata.normalize('NFKD', naziv or '').encode('ascii', 'ignore').decode('ascii').lower() + toks = [t for t in re.split(r'[^a-z0-9]+', s) if len(t) >= 4] + stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni', + 'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'} + return [t for t in toks if t not in stop] or toks + + +def _is_relevant(source: dict, tokens: list[str]) -> bool: + """A source is 'relevant' only if the page actually mentions the entity name.""" + if not tokens: return True + import unicodedata + blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '') + blob = unicodedata.normalize('NFKD', blob.lower()).encode('ascii', 'ignore').decode('ascii') + return any(t in blob for t in tokens) + + +def _propose_for_klub(row: dict) -> dict: + naziv = row.get('naziv') or '' + primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') + sources, evidence = [], [] + pdoc = _fetch_primary_site(primary) if primary else None + if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '') + wiki = _wiki_summary(naziv) + if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '') + spz = _sport_pgz_search(naziv) + if spz: sources.append(spz); evidence.append(spz.get('raw_text') or spz.get('extract') or '') + + tokens = _name_tokens(naziv) + relevant = [s for s in sources if _is_relevant(s, tokens)] + relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant) + + proposed: dict[str, Any] = {} + # web/email/telefon: ONLY from sources actually mentioning the entity + if not row.get('web'): + u = _find_official_web(relevant_blob, naziv) + if u: proposed['web'] = u + if not row.get('email'): + e = _find_email(relevant_blob) + if e: proposed['email'] = e + if not row.get('telefon'): + t = _find_phone(relevant_blob) + if t: proposed['telefon'] = t + if not row.get('opis_djelatnosti'): + descr_evidence = [(s.get('raw_text') or s.get('extract') or '') for s in relevant] or evidence + descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence) + if not descr: + for s in (relevant or sources): + if s.get('extract') and len(s['extract']) >= 80: + descr = s['extract']; break + if descr: proposed['opis_djelatnosti'] = descr.strip()[:2000] + return {'proposed': proposed, 'sources': sources} + + +def _propose_for_savez(row: dict) -> dict: + naziv = row.get('naziv') or '' + primary = row.get('web') or row.get('source_url') + sources, evidence = [], [] + pdoc = _fetch_primary_site(primary) if primary else None + if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or '') + wiki = _wiki_summary(naziv) + if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '') + spz = _sport_pgz_search(naziv) + if spz: sources.append(spz); evidence.append(spz.get('raw_text') or '') + + tokens = _name_tokens(naziv) + relevant = [s for s in sources if _is_relevant(s, tokens)] + relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant) + + proposed: dict[str, Any] = {} + if not row.get('web'): + u = _find_official_web(relevant_blob, naziv) + if u: proposed['web'] = u + if not row.get('email'): + e = _find_email(relevant_blob) + if e: proposed['email'] = e + if not row.get('telefon'): + t = _find_phone(relevant_blob) + if t: proposed['telefon'] = t + return {'proposed': proposed, 'sources': sources} + + +# ─── HNS Semafor parsing ──────────────────────────────────────────────── +_HNS_BASE = 'https://semafor.hns.family' + +def _slugify(name: str) -> str: + import unicodedata + s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower() + return re.sub(r'[^a-z0-9]+', '-', s).strip('-') + +def _hns_url_from_row(row: dict) -> Optional[str]: + """Try to build a semafor.hns.family /igraci/ URL for this row.""" + # 1) Already-set columns + for k in ('profile_url', 'source_url'): + u = row.get(k) + if u and 'semafor.hns.family/igraci/' in (u or ''): + return u + # 2) hns_igrac_id column + pid = row.get('hns_igrac_id') + if pid: + slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()) + return f'{_HNS_BASE}/igraci/{int(pid)}/{slug}/' + # 3) vanjski_id JSONB → hns_comet + vid = row.get('vanjski_id') or {} + if isinstance(vid, dict): + comet = vid.get('hns_comet') or vid.get('hns_pid') + slug = vid.get('hns_slug') or _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()) + if comet: + try: + return f'{_HNS_BASE}/igraci/{int(comet)}/{slug}/' + except Exception: + pass + # 4) source='hns_semafor' + source_id + if (row.get('source') or '').startswith('hns_') and row.get('source_id'): + try: + slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()) + return f'{_HNS_BASE}/igraci/{int(row["source_id"])}/{slug}/' + except Exception: + pass + return None + + +def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]: + """Extract structured fields from a semafor.hns.family player page.""" + if not html_doc: return None + try: + from bs4 import BeautifulSoup + except Exception: + return _parse_hns_player_regex(html_doc, url) + soup = BeautifulSoup(html_doc, 'html.parser') + out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url} + + # hns_igrac_id from URL + m = re.search(r'/igraci/(\d+)/', url) + if m: out['hns_igrac_id'] = int(m.group(1)) + + title = soup.find('title') + if title: out['title'] = title.get_text(strip=True)[:300] + + # Photo + photo = soup.find('div', class_='photo') + if photo: + img = photo.find('img') + if img and img.get('src'): + src = img['src'] + if not src.startswith('http'): + src = urllib.parse.urljoin(url, src) + out['slika_url'] = src + + # Player number (jersey) + pn = soup.find('div', class_='playerName') + if pn: + h3 = pn.find('h3') + if h3: + t = h3.get_text(strip=True) + if t.isdigit(): + out['broj_dresa'] = int(t) + + # Datum rodjenja + li = soup.find('li', class_='dob') + if li: + h4 = li.find('h4') + if h4: + t = h4.get_text(' ', strip=True) + mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t) + if mm: + from datetime import date as _date + try: + out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat() + except Exception: + pass + + # Mjesto rodjenja + li = soup.find('li', class_='pob') + if li: + h4 = li.find('h4') + if h4: + out['mjesto_rodenja'] = h4.get_text(strip=True) + + # Trenutni klub (info only — we don't reassign klub_id from here) + klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/')) + if klub_link: + h4 = klub_link.find('h4') + if h4: + out['trenutni_klub'] = h4.get_text(strip=True) + m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '') + if m: out['hns_klub_id'] = int(m.group(1)) + + # Description (meta) + meta_d = soup.find('meta', attrs={'name': 'description'}) + if meta_d and meta_d.get('content'): + out['description'] = meta_d['content'][:600] + + # Make a clean text blob for relevance / DeepSeek + text = soup.get_text(' ', strip=True) + out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000] + out['extract'] = (out.get('description') + or (out['raw_text'][:500] if out.get('raw_text') else None)) + return out + + +def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]: + """BS4-free fallback parser.""" + out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url} + m = re.search(r'/igraci/(\d+)/', url) + if m: out['hns_igrac_id'] = int(m.group(1)) + m = re.search(r'
.*?

(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S) + if m: + from datetime import date as _date + try: + out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat() + except Exception: + pass + m = re.search(r'
  • ([^<]+)

    ', html_doc) + if m: out['mjesto_rodenja'] = m.group(1).strip() + m = re.search(r'

    (\d+)

    ', html_doc) + if m: out['broj_dresa'] = int(m.group(1)) + m = re.search(r' Optional[dict]: + body = _http_get(url, timeout=8) + if not body: + # Try Playwright fallback + if _HAS_PW and _pw_scraper is not None: + r = _pw_scraper.fetch_rendered(url, timeout_ms=15000) + if r and r.get('html_len', 0) > 2000: + # We didn't store html in fetch_rendered — re-fetch text only is enough + # but we need html for parse. Do a simple HTTP retry with longer timeout. + body = _http_get(url, timeout=15) + return _parse_hns_player(body, url) if body else None + + +# ─── Generic sport-federation scraper ─────────────────────────────────── +def _fed_url_from_row(row: dict) -> Optional[str]: + """If the row already points to a federation profile (source_url / + profile_url on a known fed host), return it.""" + feds, _, _ = _load_sport_feds() + fed_hosts = set() + for entry in feds.values(): + if not isinstance(entry, dict): continue + for which in ('national', 'pgz'): + sub = entry.get(which) or {} + for k in ('url', 'search_url', 'profile_url_pattern'): + v = sub.get(k) + if v: + try: + h = urllib.parse.urlparse(v.replace('{q}', 'x').replace('{slug}', 'x').replace('{hns_pid}', '1')).hostname + if h: fed_hosts.add(h) + except Exception: + pass + for k in ('source_url', 'profile_url'): + u = row.get(k) + if not u: continue + try: + h = urllib.parse.urlparse(u).hostname or '' + except Exception: + continue + if h in fed_hosts: + return u + return None + + +def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]: + """Best-effort parser for a generic sport-federation profile page. + + Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub, + extract, raw_text}. Tolerant of varied page structures. + """ + if not html_doc: return None + host = urllib.parse.urlparse(url).hostname or '' + out: dict[str, Any] = { + 'source': host, + 'url': url, + } + # Title + m = re.search(r']*>([^<]+)', html_doc, re.I) + if m: out['title'] = html.unescape(m.group(1).strip())[:300] + # Meta description + m = re.search(r'= 3: + name_tokens.append(re.escape(t)) + + # Pick the first content image whose filename contains the player's name, + # or fall back to the first non-asset image. + img_candidates = re.findall(r']+src=["\']([^"\']+)["\']', html_doc, re.I) + chosen_img = None + for src in img_candidates: + low = src.lower() + if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader', + 'sprite', '/themes/', '/icons/', 'gdpr', 'banner', + 'header', 'footer', 'placeholder', 'avatar-default')): + continue + if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')): + continue + # Prefer matches on player name in URL + if name_tokens and any(re.search(t, src, re.I) for t in name_tokens): + chosen_img = src; break + if chosen_img is None: + chosen_img = src + if chosen_img: + if not chosen_img.startswith('http'): + chosen_img = urllib.parse.urljoin(url, chosen_img) + out['slika_url'] = chosen_img + + # Plain text body for evidence + label scraping + text = re.sub(r']*>.*?', ' ', html_doc, flags=re.S | re.I) + text = re.sub(r']*>.*?', ' ', text, flags=re.S | re.I) + text = re.sub(r'<[^>]+>', ' ', text) + text = html.unescape(re.sub(r'\s+', ' ', text)).strip() + out['raw_text'] = text[:4000] + out['extract'] = (out.get('description') + or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500] + or text[:500]) + + # Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …") + m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I) + if m: + try: + from datetime import date as _date + d = re.split(r'[.\-/]', m.group(1)) + out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat() + except Exception: + pass + if 'datum_rodenja' not in out: + m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I) + if m: + try: + from datetime import date as _date + out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat() + except Exception: + pass + m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text) + if m: out['mjesto_rodenja'] = m.group(1).strip() + m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I) + if m: out['klub_naziv'] = m.group(1).strip().rstrip('.') + + return out + + +def _slugify_simple(s: str) -> str: + import unicodedata + s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower() + return re.sub(r'[^a-z0-9]+', '-', s).strip('-') + + +def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]: + """Try to find and parse the athlete's federation profile page.""" + fed = _sport_fed(sport) if sport else None + if not fed: return None + nat = (fed or {}).get('national') or {} + full_name = (ime + ' ' + prezime).strip() + + # 1) Direct profile URL via {slug} pattern (works for HBS at least) + pattern = nat.get('profile_url_pattern') + if pattern and '{slug}' in pattern: + slug = _slugify_simple(full_name) + url = pattern.replace('{slug}', slug) + body = _http_get(url, timeout=8) + if body and prezime.lower() in body.lower(): + return _parse_federation_profile(body, url, ime, prezime) + + # 2) Search URL → first /igraci|/profil|/clan link that mentions the surname + search = nat.get('search_url') + if search: + body = _http_get(search.replace('{q}', urllib.parse.quote(full_name)), timeout=10) + if body: + for href_re in (r'href="([^"]*?/igraci/[^"]+)"', + r'href="([^"]*?/igrac/[^"]+)"', + r'href="([^"]*?/sportasi/[^"]+)"', + r'href="([^"]*?/clanovi/[^"]+)"', + r'href="([^"]*?/profil/[^"]+)"'): + for m in re.finditer(href_re, body, re.I): + cand = m.group(1) + if not cand.startswith('http'): + cand = urllib.parse.urljoin(nat.get('url', search), cand) + if _slugify_simple(prezime) in _slugify_simple(cand): + b2 = _http_get(cand, timeout=8) + if b2: + return _parse_federation_profile(b2, cand, ime, prezime) + return None + + +def _propose_for_sportas(row: dict) -> dict: + naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip() + ime, prezime = (row.get('ime') or ''), (row.get('prezime') or '') + sport = row.get('sport') + sources, evidence = [], [] + proposed: dict[str, Any] = {} + + # 1) HNS Semafor — only meaningful when sport is football OR row already + # carries an HNS link. + hns_doc: Optional[dict] = None + if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row): + hns_url = _hns_url_from_row(row) + if hns_url: + hns_doc = _hns_fetch_player(hns_url) + if hns_doc: + sources.append(hns_doc) + evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '') + + # 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing + # source_url/profile_url if it points at a known federation host. + fed_doc: Optional[dict] = None + direct_fed_url = _fed_url_from_row(row) + if direct_fed_url and (not hns_doc or hns_doc.get('url') != direct_fed_url): + body = _http_get(direct_fed_url, timeout=8) + if body: + fed_doc = _parse_federation_profile(body, direct_fed_url, ime, prezime) + if not fed_doc: + fed_doc = scrape_sport_federation(sport, ime, prezime) + if fed_doc: + sources.append(fed_doc) + evidence.append(fed_doc.get('raw_text') or fed_doc.get('extract') or '') + + # Helper: pick from hns_doc first then fed_doc + def _pick(field): + if hns_doc and hns_doc.get(field): return hns_doc[field] + if fed_doc and fed_doc.get(field): return fed_doc[field] + return None + + if not row.get('profile_url'): + v = _pick('url') or (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url')) + if v: proposed['profile_url'] = v + if not row.get('source_url'): + v = (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url')) + if v: proposed['source_url'] = v + if not row.get('slika_url'): + v = _pick('slika_url') + if v: proposed['slika_url'] = v + if not row.get('hns_igrac_id') and hns_doc and hns_doc.get('hns_igrac_id'): + proposed['hns_igrac_id'] = hns_doc['hns_igrac_id'] + if not row.get('datum_rodenja'): + v = _pick('datum_rodenja') + if v: proposed['datum_rodenja'] = v + if not row.get('mjesto_rodenja'): + v = _pick('mjesto_rodenja') + if v: proposed['mjesto_rodenja'] = v + if not row.get('broj_dresa') and hns_doc and hns_doc.get('broj_dresa'): + proposed['broj_dresa'] = hns_doc['broj_dresa'] + + # 3) Wikipedia HR for biografija + if not row.get('biografija'): + wiki = _wiki_summary(naziv) + if wiki: + sources.append(wiki) + evidence.append(wiki.get('extract') or '') + + # Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet + if not row.get('biografija'): + descr = _deepseek_describe(naziv, f'sportaš ({sport})' if sport else 'sportaš', evidence) if evidence else None + if not descr: + for s in sources: + ext = s.get('extract') + if ext and len(ext) >= 80: + descr = ext; break + if descr: + proposed['biografija'] = descr.strip()[:2000] + + return {'proposed': proposed, 'sources': sources} + + +# ─── Endpoints ────────────────────────────────────────────────────────── +# ─── R4 — POST /v2/enrich/forensic/{finding_id} ───────────────────────── +def _extract_pep_name(finding: dict) -> Optional[str]: + """Pull the primary person name from a forensic_findings row.""" + title = (finding.get('title') or '').strip() + desc = (finding.get('description') or '').strip() + payload = finding.get('raw_data') or {} + if isinstance(payload, str): + try: payload = json.loads(payload) + except Exception: payload = {} + if isinstance(payload, dict): + for k in ('person_name', 'name', 'osoba'): + v = payload.get(k) + if v: return str(v).strip() + # Try entities_involved.entity_name + ents = finding.get('entities_involved') or [] + if isinstance(ents, str): + try: ents = json.loads(ents) + except Exception: ents = [] + if isinstance(ents, list): + for e in ents: + if isinstance(e, dict) and e.get('person_name'): + return str(e['person_name']).strip() + if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''): + # Some entries store person names as entity_name when entity_type='person' + if (e.get('entity_type') or '').lower() in ('person','osoba'): + return str(e['entity_name']).strip() + # Fallback: extract a "Ime Prezime" from the title + m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc) + if m: return f"{m.group(1)} {m.group(2)}" + return None + + +def _gather_pep_evidence(name: str) -> list[dict]: + sources: list[dict] = [] + wiki = _wiki_summary(name) + if wiki: sources.append(wiki) + # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs) + ddg = 'https://html.duckduckgo.com/html/?q=' + urllib.parse.quote(f'"{name}" PGŽ Hrvatska') + page = _http_get(ddg, timeout=8) + if page: + # First result block + m = re.search(r']+class="result__a"[^>]+href="([^"]+)"[^>]*>([^<]{6,200})', page) + snippet_m = re.search(r']+class="result__snippet"[^>]*>(.*?)', page, re.S) + if m: + sources.append({ + 'source': 'duckduckgo', + 'url': html.unescape(m.group(1))[:500], + 'title': html.unescape(m.group(2)).strip()[:300], + 'extract': re.sub(r'<[^>]+>', ' ', snippet_m.group(1)).strip()[:600] if snippet_m else None, + }) + return sources + + +def _related_entities_for_pep(name: str) -> list[dict]: + """Pull civic.persons + their entity links so we have the structured graph.""" + out: list[dict] = [] + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute("""SELECT id, name, function, party, county, city, oib, trust_tier + FROM civic.persons + WHERE upper(name) ILIKE upper(%s) + ORDER BY oib NULLS LAST, id LIMIT 10""", ('%'+name+'%',)) + for p in cur.fetchall(): + p = dict(p) + entry = { + 'kind': 'person', + 'person_id': p['id'], 'person_name': p['name'], + 'function': p.get('function'), 'party': p.get('party'), + 'county': p.get('county'), 'city': p.get('city'), + 'oib': p.get('oib'), 'trust_tier': p.get('trust_tier'), + 'entities': [], + } + if p.get('oib'): + cur.execute("""SELECT pel.entity_id, pel.roles, e.name AS entity_name, + e.oib AS entity_oib, e.entity_type, e.city, e.risk_score + FROM civic.person_entity_links pel + LEFT JOIN civic.entities e ON e.id = pel.entity_id + WHERE pel.person_oib=%s LIMIT 30""", (p['oib'],)) + for r in cur.fetchall(): + entry['entities'].append(dict(r)) + out.append(entry) + return out + + +@router.post("/enrich/forensic/{finding_id}") +def enrich_forensic_v2(finding_id: int, + body: dict = Body(default=None), + x_user_email: Optional[str] = Header(default=None), + x_user_id: Optional[int] = Header(default=None)): + """Enrich a forensic finding: gather Wiki + DDG snippets + civic graph, + write back to civic.forensic_findings.related_entities, and seal the + payload hash on Polygon (or queue for sealing). + """ + body = body or {} + explicit_name = (body.get('name') or '').strip() or None + + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute("""SELECT id, finding_type, severity, title, description, + entities_involved, raw_data, related_entities, enrichment_metadata + FROM civic.forensic_findings WHERE id=%s""", (finding_id,)) + finding = cur.fetchone() + if not finding: + raise HTTPException(404, "finding not found") + finding = dict(finding) + + name = explicit_name or _extract_pep_name(finding) + if not name: + raise HTTPException(400, "could not derive a person/entity name; pass {name: \"…\"}") + + sources = _gather_pep_evidence(name) + related = _related_entities_for_pep(name) + + payload = { + 'finding_id': finding_id, + 'name': name, + 'sources': [{'source': s.get('source'), 'url': s.get('url'), + 'title': s.get('title')} for s in sources], + 'related_entities': related, + 'enriched_at': datetime.now(timezone.utc).isoformat(), + } + + # Persist back to the finding + enrichment_meta = finding.get('enrichment_metadata') or {} + if not isinstance(enrichment_meta, dict): enrichment_meta = {} + history = enrichment_meta.get('history') or [] + history.append({ + 'at': payload['enriched_at'], + 'sources': payload['sources'], + 'related_count': len(related), + 'user': x_user_email, + }) + enrichment_meta['history'] = history[-10:] + enrichment_meta['enriched_at'] = payload['enriched_at'] + enrichment_meta['enriched_by'] = x_user_email or 'system' + enrichment_meta['source_count'] = len(sources) + + with _db() as c, c.cursor() as cur: + cur.execute("""UPDATE civic.forensic_findings + SET related_entities = %s::jsonb, + enrichment_metadata = %s::jsonb + WHERE id=%s + RETURNING id""", + (json.dumps(related, default=str, ensure_ascii=False), + json.dumps(enrichment_meta, default=str, ensure_ascii=False), + finding_id)) + cur.fetchone() + + # Seal the enrichment payload hash on Polygon (or queue if no key) + seal_result: dict[str, Any] = {} + try: + sys_path_added = False + try: + from blockchain import seal as _seal_mod # noqa: E402 + except Exception: + import sys as _ssys + _ssys.path.insert(0, '/opt/pgz-sport') + from blockchain import seal as _seal_mod # noqa: E402 + sys_path_added = True + del sys_path_added # silence linters + h = _seal_mod.hash_payload(payload) + seal_result = _seal_mod.seal_to_polygon( + data_hash=h, + ref_id=str(finding_id), + action='forensic.enriched', + ref_type='forensic_finding', + payload=payload, + user_id=x_user_id, + user_email=x_user_email, + ) + except Exception as e: + seal_result = {'error': f'{type(e).__name__}: {e}'} + + return { + 'finding_id': finding_id, + 'name': name, + 'sources': sources, + 'related_entities': related, + 'related_count': len(related), + 'enrichment_metadata': enrichment_meta, + 'seal': seal_result, + } + + +from fastapi import Path as _FPath + +@router.post("/enrich/{kind:str}/{eid:int}") +def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid: int = 0): + row = _load_row(kind, eid) + if kind == 'klub': res = _propose_for_klub(row) + elif kind == 'savez': res = _propose_for_savez(row) + else: res = _propose_for_sportas(row) + + if kind == 'klub': + keys = ['oib','sport','grad','predsjednik','tajnik','web','email','telefon', + 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti'] + elif kind == 'savez': + keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka'] + else: + keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija', + 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg', + 'dominantna_noga','oib'] + + naziv = _display_name(kind, row) + grad = row.get('grad') if kind == 'klub' else None + primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url') + + filled = sum(1 for k in keys if row.get(k)) + coverage = round(filled / len(keys) * 100) + missing = [k for k in keys if not row.get(k)] + + proposed = res['proposed'] + current = {k: row.get(k) for k in proposed.keys()} + meta = row.get('metadata') or {} + if not isinstance(meta, dict): meta = {} + + return { + 'kind': kind, 'id': eid, 'naziv': naziv, + 'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys), + 'missing_fields': missing, + 'live_snippet': _fetch_title(primary) if primary else None, + 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')), + 'sport': row.get('sport'), + 'sport_federation': (lambda f: { + 'national': (f.get('national') or {}).get('name') if f else None, + 'national_url': (f.get('national') or {}).get('url') if f else None, + 'pgz': (f.get('pgz') or {}).get('name') if f else None, + })(_sport_fed(row.get('sport'))), + 'sources': res['sources'], + 'current': current, + 'proposed': proposed, + 'last_enriched_at': meta.get('enriched_at'), + 'last_enrichment_source': meta.get('enrichment_source'), + 'enriched_at': int(time.time()), + 'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply', + } + + +_TABLE_MAP = { + 'klub': ('pgz_sport.klubovi', + {'web','email','telefon','predsjednik','tajnik', + 'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}), + 'savez': ('pgz_sport.savezi', + {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}), + 'sportas': ('pgz_sport.clanovi', + {'biografija','profile_url','source_url','slika_url','hns_igrac_id', + 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm', + 'tezina_kg','dominantna_noga','oib'}), +} + + +def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]): + if kind not in _TABLE_MAP: + raise HTTPException(400, "kind must be klub|savez|sportas") + table, allowed = _TABLE_MAP[kind] + + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,)) + before = cur.fetchone() + if not before: raise HTTPException(404, kind + " not found") + before = dict(before) + + sets, params, applied = [], [], {} + for k, v in (fields or {}).items(): + if k not in allowed: continue + if v is None or str(v).strip() == '': continue + if before.get(k): + continue # never overwrite existing + sets.append(f"{k} = %s") + params.append(v); applied[k] = v + + meta_in = before.get('metadata') or {} + if not isinstance(meta_in, dict): meta_in = {} + now_iso = datetime.now(timezone.utc).isoformat() + meta_in['enriched_at'] = now_iso + meta_in['enrichment_source'] = [s.get('source') for s in (sources or []) if s.get('source')] + history = meta_in.get('enrichment_history') or [] + history.append({ + 'at': now_iso, + 'fields': list(applied.keys()), + 'sources': meta_in['enrichment_source'], + 'urls': [s.get('url') for s in (sources or []) if s.get('url')], + 'user': user_email, + }) + meta_in['enrichment_history'] = history[-10:] + sets.append("metadata = %s::jsonb") + params.append(json.dumps(meta_in, ensure_ascii=False, default=str)) + + params.append(eid) + cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params) + after = dict(cur.fetchone()) + + cur.execute( + """INSERT INTO pgz_sport.enrichment_log + (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email) + VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""", + (kind, eid, + ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None, + (sources[0].get('url') if sources else None), + list(applied.keys()) or None, + json.dumps({k: before.get(k) for k in (list(applied.keys()) + ['metadata'])}, + ensure_ascii=False, default=str), + json.dumps({k: after.get(k) for k in (list(applied.keys()) + ['metadata'])}, + ensure_ascii=False, default=str), + user_email)) + + snap_keys = ('id','naziv','ime','prezime','web','email','telefon', + 'opis_djelatnosti','biografija','metadata') + return {'applied': applied, + 'after': {k: after.get(k) for k in snap_keys if k in after}} + + +@router.post("/enrich/{kind:str}/{eid:int}/apply") +def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), + eid: int = 0, + body: dict = Body(default=None), + x_user_email: Optional[str] = Header(default=None), + x_user_id: Optional[int] = Header(default=None)): + body = body or {} + fields = body.get('fields') + sources = body.get('sources') + if not fields: + row = _load_row(kind, eid) + if kind == 'klub': res = _propose_for_klub(row) + elif kind == 'savez': res = _propose_for_savez(row) + else: res = _propose_for_sportas(row) + fields = res['proposed'] + sources = res['sources'] + out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email) + applied = out.get('applied') or {} + # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events + try: + from audit_seal_router import audit_log as _audit_log + if applied: + _audit_log( + action='enrich.apply', + target_type=kind, + target_id=eid, + payload={'applied': applied, + 'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]}, + user_id=x_user_id, + user_email=x_user_email, + ) + except Exception: + pass + return { + 'status': 'success' if applied else 'no_changes', + 'kind': kind, + 'id': eid, + 'applied_count': len(applied), + 'applied_fields': list(applied.keys()), + **out, + } + + +@router.get("/enrich/log") +def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50): + where, params = [], [] + if kind: where.append("kind=%s"); params.append(kind) + if target_id: where.append("target_id=%s"); params.append(target_id) + sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at " + "FROM pgz_sport.enrichment_log " + + ("WHERE " + " AND ".join(where) + " " if where else "") + + "ORDER BY id DESC LIMIT %s") + params.append(min(int(limit or 50), 200)) + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(sql, params) + rows = [dict(r) for r in cur.fetchall()] + for r in rows: + if r.get('created_at'): r['created_at'] = r['created_at'].isoformat() + return {'count': len(rows), 'rows': rows} + + +# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ─────────────────── +@router.get("/search/suggest") +def search_suggest(q: str = '', type: str = '', limit: int = 10): + """ + Autocomplete suggestions for the Mreža search inputs. + type ∈ {person, club, company, ''} — empty means all. + Returns: {query, results: [{id, label, type, sub}]} + """ + q = (q or '').strip() + if len(q) < 2: + return {'query': q, 'results': []} + limit = max(1, min(50, int(limit))) + out = [] + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + if type in ('', 'club'): + cur.execute(""" + SELECT id, naziv AS label, sport, grad + FROM pgz_sport.klubovi + WHERE naziv ILIKE %s AND aktivan=TRUE + ORDER BY length(naziv), naziv LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'klub:'+str(r['id']), 'label': r['label'], 'type':'club', + 'sub': (r.get('sport') or '')+' · '+(r.get('grad') or '')}) + cur.execute(""" + SELECT id, naziv AS label, sport + FROM pgz_sport.savezi + WHERE naziv ILIKE %s AND aktivan=TRUE + ORDER BY length(naziv), naziv LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'savez:'+str(r['id']), 'label': r['label'], 'type':'savez', + 'sub': r.get('sport') or 'savez'}) + if type in ('', 'person'): + cur.execute(""" + SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv + FROM pgz_sport.clanovi c + LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id + WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s + ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime + LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'sportas:'+str(r['id']), + 'label': (r.get('ime') or '')+' '+(r.get('prezime') or ''), + 'type':'person', + 'sub': (r.get('sport') or 'sportaš')+(r.get('klub_naziv') and ' · '+r['klub_naziv'] or '')}) + cur.execute(""" + SELECT id, name AS label, function, oib, county + FROM civic.persons + WHERE name ILIKE %s + ORDER BY oib NULLS LAST, length(name) LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'civic_person:'+str(r['id']), + 'label': r['label'], 'type':'person', + 'sub': (r.get('function') or 'civic')+' · '+(r.get('county') or '')}) + if type in ('', 'company'): + cur.execute(""" + SELECT id, name AS label, oib, city, entity_type + FROM civic.entities + WHERE name ILIKE %s + ORDER BY length(name) LIMIT %s + """, ('%'+q+'%', limit)) + for r in cur.fetchall(): + out.append({'id':'civic_entity:'+str(r['id']), + 'label': r['label'], 'type':'company', + 'sub': (r.get('entity_type') or 'tvrtka')+' · '+(r.get('city') or '')}) + return {'query': q, 'results': out[:limit*2]} + + +# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ────────────── +@router.post("/forensic/findings/{finding_id}/enrich") +def enrich_forensic(finding_id: int): + """ + Look up the forensic finding, derive the PEP person name from + entities_involved or title, hit Wikipedia HR for a summary, and persist + the enriched payload into civic.forensic_findings.ai_analysis (or back into + raw_data.enrichment). + """ + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(""" + SELECT id, finding_type, severity, title, description, entities_involved, + raw_data, ai_analysis + FROM civic.forensic_findings WHERE id=%s + """, (finding_id,)) + f = cur.fetchone() + if not f: raise HTTPException(404, "finding not found") + f = dict(f) + + # Derive person name candidates + candidates = [] + if isinstance(f.get('entities_involved'), (list, dict)): + ei = f['entities_involved'] + if isinstance(ei, dict): + for k in ('person','name','osoba','PEP','pep'): + if ei.get(k): candidates.append(str(ei[k])) + # Also try persons: [...] list + for p in (ei.get('persons') or ei.get('osobe') or []): + if isinstance(p, dict) and p.get('name'): candidates.append(p['name']) + elif isinstance(p, str): candidates.append(p) + elif isinstance(ei, list): + for it in ei: + if isinstance(it, dict): + for k in ('name','person','label'): + if it.get(k): candidates.append(str(it[k])); break + elif isinstance(it, str): + candidates.append(it) + if not candidates and f.get('title'): + # Heuristic: extract first capitalised "Ime Prezime" pair + m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', f['title']) + if m: candidates.append(m.group(0)) + + wiki = None + used_query = None + for q in candidates[:3]: + wiki = _wiki_summary(q) + if wiki: + used_query = q + break + + # Build enrichment payload + enrichment = { + 'queried': candidates[:5], + 'used_query': used_query, + 'wiki': wiki, + 'enriched_at': datetime.now(timezone.utc).isoformat(), + } + + # Persist into raw_data.enrichment + raw = f.get('raw_data') + if raw is None: raw = {} + if not isinstance(raw, dict): raw = {'_legacy': raw} + raw['enrichment'] = enrichment + + cur.execute(""" + UPDATE civic.forensic_findings + SET raw_data = %s::jsonb, + ai_analysis = COALESCE(ai_analysis, %s) + WHERE id = %s + """, (json.dumps(raw, default=str, ensure_ascii=False), + (wiki or {}).get('extract'), + finding_id)) + c.commit() + + return { + 'finding_id': finding_id, + 'queried': candidates[:5], + 'used_query': used_query, + 'wiki': wiki, + 'persisted': True, + } + + +# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ─────────────────── +@router.post("/forensic/scan") +def forensic_scan(req: dict = Body(...)): + name = (req.get('name') or '').strip() + if len(name) < 3: + raise HTTPException(400, "name must be at least 3 chars") + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(""" + SELECT id, name, function, party, county, city, oib, trust_tier + FROM civic.persons + WHERE upper(name) ILIKE upper(%s) + ORDER BY oib NULLS LAST, id LIMIT 25 + """, ('%' + name + '%',)) + persons = [dict(r) for r in cur.fetchall()] + for p in persons: + p['links'] = []; p['findings'] = [] + if p.get('oib'): + cur.execute(""" + SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib, + e.entity_type, e.city, e.risk_score + FROM civic.person_entity_links pel + LEFT JOIN civic.entities e ON e.id = pel.entity_id + WHERE pel.person_oib = %s LIMIT 50 + """, (p['oib'],)) + p['links'] = [dict(r) for r in cur.fetchall()] + cur.execute(""" + SELECT id, finding_type, severity, title, severity_score, created_at + FROM civic.forensic_findings + WHERE entities_involved::text ILIKE %s + ORDER BY severity_score DESC, created_at DESC LIMIT 30 + """, ('%' + p['oib'] + '%',)) + p['findings'] = [dict(r) for r in cur.fetchall()] + if not p['findings']: + cur.execute(""" + SELECT id, finding_type, severity, title, severity_score, created_at + FROM civic.forensic_findings + WHERE title ILIKE %s OR description ILIKE %s + ORDER BY severity_score DESC, created_at DESC LIMIT 30 + """, ('%' + p['name'] + '%', '%' + p['name'] + '%')) + p['findings'] = [dict(r) for r in cur.fetchall()] + total_links = total_findings = crit_findings = 0 + for p in persons: + total_links += len(p.get('links') or []) + for f in p.get('findings') or []: + total_findings += 1 + if f.get('severity') in ('CRITICAL', 'HIGH'): crit_findings += 1 + score = 0 + if (p.get('function') or '').strip(): score += 30 + if (p.get('party') or '').strip(): score += 15 + score += min(40, len(p.get('links') or []) * 5) + score += min(40, len(p.get('findings') or []) * 10) + score += sum(20 for f in (p.get('findings') or []) if f.get('severity') in ('CRITICAL', 'HIGH')) + p['risk_score'] = min(100, score) + overall = max((p.get('risk_score', 0) for p in persons), default=0) + return {'query': name, 'matched_persons': len(persons), + 'overall_risk_score': overall, 'total_links': total_links, + 'total_findings': total_findings, 'critical_findings': crit_findings, + 'persons': persons, 'scanned_at': int(time.time())} + + + +# ─── SB-3 — Bulk enrichment ───────────────────────────────────────────── +_BULK_KEY_MAP = { + 'klub': ('pgz_sport.klubovi', + ('oib','sport','grad','predsjednik','tajnik','web','email','telefon', + 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')), + 'savez': ('pgz_sport.savezi', + ('oib','sport','predsjednik','tajnik','email','telefon','web', + 'adresa','godina_osnutka')), + 'sportas': ('pgz_sport.clanovi', + ('sport','profile_url','slika_url','hns_igrac_id','biografija', + 'datum_rodenja','mjesto_rodenja','broj_dresa')), +} + + +def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str: + parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)" + for k in keys] + return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})" + + +def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]: + if kind not in _BULK_KEY_MAP: + raise HTTPException(400, "kind must be klub|savez|sportas") + table, keys = _BULK_KEY_MAP[kind] + cov = _coverage_sql('', keys) + extra_where = '' + if kind == 'klub': + extra_where = "AND aktivan = TRUE" + elif kind == 'sportas': + extra_where = "AND aktivan = TRUE" + sql = (f"SELECT id FROM {table} " + f"WHERE 1=1 {extra_where} " + f"AND {cov} < %s " + f"ORDER BY random() LIMIT %s") + with _db() as c, c.cursor() as cur: + cur.execute(sql, (coverage_max, limit)) + return [r[0] for r in cur.fetchall()] + + +@router.post("/enrich/bulk") +def enrich_bulk(body: dict = Body(default=None), + x_user_email: Optional[str] = Header(default=None), + x_user_id: Optional[int] = Header(default=None)): + """Run preview+apply over N random under-enriched rows of one kind. + + Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70} + Returns aggregate stats. Synchronous (use polling, not SSE). + """ + body = body or {} + kind = (body.get('kind') or '').strip().lower() + if kind not in _BULK_KEY_MAP: + raise HTTPException(400, "kind must be klub|savez|sportas") + limit = max(1, min(int(body.get('limit') or 50), 200)) + coverage_max = max(0, min(int(body.get('coverage_max') or 70), 100)) + + ids = _bulk_pick(kind, limit, coverage_max) + items: list[dict] = [] + fields_total = 0 + started = time.time() + + for eid in ids: + try: + row = _load_row(kind, eid) + if kind == 'klub': res = _propose_for_klub(row) + elif kind == 'savez': res = _propose_for_savez(row) + else: res = _propose_for_sportas(row) + proposed = res.get('proposed') or {} + srcs = res.get('sources') or [] + if not proposed: + items.append({'id': eid, 'applied': 0, 'fields': []}) + continue + out = _apply_to_db(kind, eid, proposed, srcs, x_user_email) + applied = out.get('applied') or {} + fields_total += len(applied) + items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())}) + try: + from audit_seal_router import audit_log as _audit_log + if applied: + _audit_log(action='enrich.bulk.apply', + target_type=kind, target_id=eid, + payload={'applied': applied}, + user_id=x_user_id, user_email=x_user_email) + except Exception: + pass + except HTTPException as e: + items.append({'id': eid, 'error': e.detail}) + except Exception as e: + items.append({'id': eid, 'error': f'{type(e).__name__}: {e}'}) + + return { + 'status': 'success', + 'kind': kind, + 'requested': limit, + 'processed': len(items), + 'fields_total': fields_total, + 'elapsed_s': round(time.time() - started, 1), + 'items': items, + } + + +# ─── SB-4 — Worker status / control ───────────────────────────────────── +_REDIS_KEYS = { + 'heartbeat': 'cc:pgz-enricher:heartbeat', + 'pause': 'cc:pgz-enricher:pause', + 'run_now': 'cc:pgz-enricher:run_now', + 'last_cycle': 'cc:pgz-enricher:last_cycle', + 'confidence': 'cc:pgz-enricher:confidence', + 'fields_24h': 'cc:pgz-enricher:fields_24h', +} + + +def _redis_client(): + try: + import redis + except Exception: + return None + host = os.environ.get('REDIS_HOST', 'localhost') + port = int(os.environ.get('REDIS_PORT', '6379')) + pwd = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None + # Try with password first (prod); fall back to anonymous (dev box) on AUTH failure. + for p in (pwd, None): + try: + r = redis.Redis(host=host, port=port, password=p, + decode_responses=True, socket_connect_timeout=2) + r.ping() + return r + except Exception: + continue + return None + + +@router.get("/enrich/worker/status") +def enrich_worker_status(): + r = _redis_client() + out = {'available': bool(r)} + if not r: + return out + try: + hb = r.get(_REDIS_KEYS['heartbeat']) + out['heartbeat'] = int(hb) if hb else None + out['heartbeat_age_s'] = (int(time.time()) - int(hb)) if hb else None + out['paused'] = (r.get(_REDIS_KEYS['pause']) or '0') == '1' + out['run_now_pending'] = (r.get(_REDIS_KEYS['run_now']) or '0') == '1' + last = r.get(_REDIS_KEYS['last_cycle']) + if last: + try: out['last_cycle'] = json.loads(last) + except: out['last_cycle'] = last + conf = r.get(_REDIS_KEYS['confidence']) + out['confidence_threshold'] = float(conf) if conf else 0.7 + f24 = r.get(_REDIS_KEYS['fields_24h']) + out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0 + except Exception as e: + out['error'] = f'{type(e).__name__}: {e}' + # Recent enrichment_log rows for live activity + try: + with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at + FROM pgz_sport.enrichment_log + ORDER BY id DESC LIMIT 25""") + rows = [] + for rr in cur.fetchall(): + rr = dict(rr) + if rr.get('created_at'): rr['created_at'] = rr['created_at'].isoformat() + rows.append(rr) + out['recent'] = rows + except Exception: + out['recent'] = [] + return out + + +@router.post("/enrich/worker/pause") +def enrich_worker_pause(body: dict = Body(default=None)): + body = body or {} + pause = bool(body.get('paused', True)) + r = _redis_client() + if not r: raise HTTPException(503, 'redis unavailable') + r.set(_REDIS_KEYS['pause'], '1' if pause else '0') + return {'status': 'success', 'paused': pause} + + +@router.post("/enrich/worker/run-now") +def enrich_worker_run_now(): + r = _redis_client() + if not r: raise HTTPException(503, 'redis unavailable') + r.set(_REDIS_KEYS['run_now'], '1') + return {'status': 'success', 'queued': True} + + +@router.post("/enrich/worker/confidence") +def enrich_worker_confidence(body: dict = Body(...)): + try: + v = float(body.get('value')) + except Exception: + raise HTTPException(400, 'value must be number 0..1') + if not (0.0 <= v <= 1.0): + raise HTTPException(400, 'value out of range 0..1') + r = _redis_client() + if not r: raise HTTPException(503, 'redis unavailable') + r.set(_REDIS_KEYS['confidence'], str(v)) + return {'status': 'success', 'confidence_threshold': v} diff --git a/_backups/sprint_1777940323/sport2.html b/_backups/sprint_1777940323/sport2.html new file mode 100644 index 0000000..abcceb7 --- /dev/null +++ b/_backups/sprint_1777940323/sport2.html @@ -0,0 +1,2950 @@ + + + + + +PGŽ SPORT — Platforma + + + + + + + + + + + + + +
    + + +
    +
    +
    +
    Dashboard
    +
    Pregled stanja
    +
    +
    + API live · sport.rinet.one +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    Detalji
    +
    ×
    +
    +
    +
    + + + + diff --git a/routers/enrich_router.py b/routers/enrich_router.py index ccc0e7c..64152dc 100644 --- a/routers/enrich_router.py +++ b/routers/enrich_router.py @@ -381,11 +381,27 @@ def _sport_fed(sport: Optional[str]) -> Optional[dict]: return feds.get(norm) -def _research_links(naziv, kind, grad=None, sport: Optional[str] = None): +def _research_links(naziv, kind, grad=None, sport: Optional[str] = None, row: Optional[dict] = None): base_q = (naziv or '').strip() q = (base_q + ' ' + grad) if grad else base_q qenc = urllib.parse.quote(q) - out = [ + out = [] + # Prefer DIRECT profile/source link if entity already has one (e.g. HNS Semafor) + if row: + direct = row.get('profile_url') or row.get('source_url') or row.get('scrape_url') or row.get('web') or row.get('web_stranica') + if direct and isinstance(direct, str) and direct.startswith(('http://','https://')): + try: + host = urllib.parse.urlparse(direct).hostname or '' + except Exception: + host = '' + label = 'Vanjski profil' + icon = '🔗' + if 'hns' in host: label, icon = 'HNS profil', '⚽' + elif 'transfermarkt' in host: label, icon = 'Transfermarkt', '⚽' + elif 'wikipedia' in host: label, icon = 'Wikipedia', '📚' + elif host.endswith('.hr') or host.endswith('.com'): label, icon = 'Službena stranica', '🌐' + out.append({'label': label, 'icon': icon, 'url': direct, 'is_direct': True}) + out += [ {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc}, {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc}, {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc}, @@ -445,11 +461,128 @@ def _is_relevant(source: dict, tokens: list[str]) -> bool: return any(t in blob for t in tokens) + +# ─── Klub domain guesser (HR slug → candidate URLs → HEAD probe) ──────── +import re as _re_klg + +def _slugify_klub(naziv: str) -> str: + if not naziv: return "" + s = naziv.lower() + repl = (("č","c"),("ć","c"),("ž","z"),("š","s"),("đ","d"), + ('"',''),("'",""),("(",""),(")",""),(",",""),(".",""), + ("/",""),("\\","")) + for a,b in repl: s = s.replace(a,b) + s = _re_klg.sub(r"[^a-z0-9]+", "-", s).strip("-") + return s + +def _klub_domain_candidates(naziv: str) -> list[str]: + """Generate ranked candidate URLs from club name.""" + if not naziv: return [] + s = _slugify_klub(naziv) + # Strip common prefixes for cleaner domains + base = s + for pref in ("hnk-","nk-","rk-","kk-","ok-","bk-","gk-","tk-","ak-","hbk-"): + if base.startswith(pref): + base = base[len(pref):]; break + # also try short prefix-ed variants + short = base.split("-")[0] if base else "" + candidates = [] + sports_prefixes = ["nk-","hnk-","rk-","kk-","bk-","ok-","ak-","tk-"] + # full slug with original prefix + for tld in (".hr",".com",".eu",".info"): + candidates.append(f"https://{s}{tld}") + candidates.append(f"https://www.{s}{tld}") + # base-only + for tld in (".hr",".com"): + candidates.append(f"https://{base}{tld}") + candidates.append(f"https://www.{base}{tld}") + # try sport prefixes if name doesn't already have one + if not any(s.startswith(p) for p in sports_prefixes): + for sp in sports_prefixes[:5]: + for tld in (".hr",".com"): + candidates.append(f"https://{sp}{base}{tld}") + # dedup, preserve order + seen, out = set(), [] + for c in candidates: + if c not in seen: + seen.add(c); out.append(c) + return out[:20] + +def _probe_klub_url(url: str, naziv_tokens: list, timeout: int = 5) -> Optional[dict]: + """HEAD/GET probe; return doc with raw_text if URL is alive AND mentions club tokens.""" + try: + import requests + r = requests.get(url, timeout=timeout, allow_redirects=True, + headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"}) + if r.status_code != 200: return None + if len(r.text) < 200: return None + text = r.text.lower() + # Must mention at least one distinctive token from name + toks = [t.lower() for t in (naziv_tokens or []) if len(t) > 2] + if toks and not any(t in text for t in toks): + return None + return {"source": "domain_probe", "url": r.url, "raw_text": r.text[:50000]} + except Exception: + return None + +def _guess_klub_domains(naziv: str, tokens: list) -> Optional[dict]: + """Parallel probe candidates (5 workers, 4s timeout each); first hit wins.""" + from concurrent.futures import ThreadPoolExecutor, as_completed + candidates = _klub_domain_candidates(naziv) + if not candidates: return None + with ThreadPoolExecutor(max_workers=8) as ex: + futs = {ex.submit(_probe_klub_url, url, tokens, 4): url for url in candidates[:16]} + for fut in as_completed(futs, timeout=10): + try: + doc = fut.result() + if doc: + # Cancel remaining (best effort) + for f in futs: + if not f.done(): f.cancel() + return doc + except Exception: + continue + return None + +def _scrape_klub_subpages(base_url: str, tokens: list) -> str: + """Fetch /kontakt /uprava /o-nama /o-klubu and concat texts.""" + if not base_url: return "" + import requests + base = base_url.rstrip("/") + paths = ["/kontakt","/uprava","/o-nama","/o-klubu","/predsjednik","/klub","/contact","/about"] + accum = [] + for path in paths: + try: + r = requests.get(base + path, timeout=4, allow_redirects=True, + headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"}) + if r.status_code == 200 and len(r.text) > 200: + accum.append(r.text[:30000]) + except Exception: + pass + return "\n\n".join(accum) + + def _propose_for_klub(row: dict) -> dict: naziv = row.get('naziv') or '' - primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') + # Only consider HTTP(S) URLs as valid primary sources — skip placeholder strings like 'godisnjak_2025' + raw_primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') + primary = raw_primary if (raw_primary and isinstance(raw_primary, str) and raw_primary.startswith(('http://','https://'))) else None sources, evidence = [], [] + tokens_pre = _name_tokens(naziv) pdoc = _fetch_primary_site(primary) if primary else None + if not pdoc: + # No valid web in DB — try to guess domain from club name + pdoc = _guess_klub_domains(naziv, tokens_pre) + if pdoc: + # Also fetch subpages for richer evidence + sub = _scrape_klub_subpages(pdoc.get('url',''), tokens_pre) + if sub: + pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000] + elif pdoc: + # Have primary site — also fetch its subpages + sub = _scrape_klub_subpages(pdoc.get('url') or primary, tokens_pre) + if sub: + pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000] if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '') wiki = _wiki_summary(naziv) if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '') @@ -1121,7 +1254,7 @@ def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid: 'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys), 'missing_fields': missing, 'live_snippet': _fetch_title(primary) if primary else None, - 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')), + 'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row), 'sport': row.get('sport'), 'sport_federation': (lambda f: { 'national': (f.get('national') or {}).get('name') if f else None, diff --git a/static/app.html b/static/app.html index c94f455..9f7ae72 100644 --- a/static/app.html +++ b/static/app.html @@ -265,7 +265,7 @@ table tbody tr:hover{background:var(--bg3)}