2678 lines
117 KiB
Python
2678 lines
117 KiB
Python
"""
enrich_router.py — v3 enrichment + forensic scan
Author: dradulic@outlook.com / damir@rinet.one
Date:   2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)

POST /v2/enrich/{kind}/{eid}
    Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
    primary club URL if any), regex-extract candidate fields (web/email/
    telefon), optionally synthesise descriptions via DeepSeek, and return
    a *preview* shape with `proposed` updates the operator can apply.

POST /v2/enrich/{kind}/{eid}/apply
    Body shapes:
        None / {}          → re-run preview, apply every proposed field
        {"fields": {...}}  → apply ONLY those (whitelist + emptiness still enforced)
    Performs UPDATE on the matching table, sets metadata.enriched_at and
    metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
    returns the after snapshot.

GET /v2/enrich/log?kind=&target_id=&limit=
    Read recent enrichment-log entries.

POST /v2/forensic/scan
    Search civic.persons by name, return entity links + findings + risk score.

Kinds: klub | savez | sportas
"""
|
||
from __future__ import annotations
|
||
import os, re, json, time, html, urllib.parse, urllib.request
|
||
from datetime import datetime, timezone
|
||
from typing import Any, Optional
|
||
|
||
import psycopg2, psycopg2.extras
|
||
from fastapi import APIRouter, HTTPException, Header, Body
|
||
|
||
# FastAPI router object exposed by this module.
router = APIRouter()

# ─── Database connection settings ────────────────────────────────────────
# Host/port come from PG_HOST / PG_PORT. When PG_HOST points at the local
# machine ('localhost' / '127.0.0.1') the DB_HOST / DB_PORT variables are
# consulted instead, with the same defaults.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))
if _pgh in ('localhost', '127.0.0.1'):
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))
# Keyword arguments passed verbatim to psycopg2.connect(**DB).
# NOTE(review): the PG_PASS fallback below is a real-looking credential
# committed to source. It should be rotated and the default removed once
# every deployment is confirmed to set PG_PASS explicitly.
DB = dict(host=_pgh, port=_pgp,
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))

# User-Agent header sent with outgoing scrape requests.
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
# Default per-request timeout for outgoing HTTP fetches.
TIMEOUT = 6  # seconds — fail-soft
|
||
|
||
# ---- Sport-aware source selection (Damir 2026-05-10, task 06) ----
# Generic enrichment was only fitted to football. Other sports need their
# own federation roster + terminology. This map drives the per-sport source
# list used by enrich pipelines.
#
# Keys are lower-case sport names WITH Croatian diacritics; values are source
# identifiers in priority order. Lookups go through select_sources_for_sport().
SPORT_FEDERATION_MAP: "dict[str, list[str]]" = {
    "nogomet": ["hns_semafor", "hns_cff", "sport_pgz"],
    "košarka": ["hks_cbf", "kosarkapgz", "sport_pgz"],
    "rukomet": ["hrs", "sport_pgz"],
    "vaterpolo": ["hvs", "sport_pgz"],
    "atletika": ["has", "sport_pgz"],
    "plivanje": ["hps", "sport_pgz"],
    "hokej": ["hsh", "sport_pgz"],
    "šah": ["hss_chess", "sport_pgz"],
    "odbojka": ["hos", "sport_pgz"],
    "tenis": ["hts", "sport_pgz"],
    "biciklizam": ["hbs", "sport_pgz"],
}

# Per-sport field schemas - used by callers of fields_for_sport() and
# by the frontend (data-sport-fields="...") to render the right fields.
# Keyed the same way as SPORT_FEDERATION_MAP.
SPORT_FIELD_SCHEMA: "dict[str, list[str]]" = {
    "nogomet": ["position", "dominant_foot", "height_cm", "jersey_no", "club"],
    "košarka": ["position", "height_cm", "weight_kg", "hand", "stats"],
    "rukomet": ["position", "hand", "stats"],
    "atletika": ["discipline", "category", "personal_records"],
    "šah": ["elo_rating", "fide_id", "title"],
    "vaterpolo": ["position", "hand", "stats"],
    "plivanje": ["discipline", "personal_records"],
    "hokej": ["position", "hand", "jersey_no"],
    "odbojka": ["position", "height_cm", "stats"],
    "tenis": ["dominant_hand", "atp_wta_rank", "personal_records"],
    "biciklizam": ["discipline", "team", "category"],
}

# Fallbacks returned when the sport is empty or matches no key above.
_DEFAULT_SOURCES = ["sport_pgz", "google", "wikipedia"]
_DEFAULT_FIELDS = ["club", "position", "stats"]
|
||
|
||
|
||
def select_sources_for_sport(sport):
    """Resolve a free-form sport string to its prioritized source list.

    The input is lower-cased and stripped, then matched against
    SPORT_FEDERATION_MAP: first as an exact key, otherwise by the longest
    key that the input starts with. Diacritics are significant, so
    "košarka" matches but "kosarka" does not. Anything unmatched (or an
    empty/None sport) gets a copy of _DEFAULT_SOURCES.

    A fresh list is returned every time, so callers may mutate it freely.
    """
    if sport:
        wanted = sport.lower().strip()
        exact = SPORT_FEDERATION_MAP.get(wanted)
        if exact is not None:
            return list(exact)
        # Collect every non-empty key that prefixes the input and keep the
        # longest one.
        prefix_hits = [k for k in SPORT_FEDERATION_MAP if k and wanted.startswith(k)]
        if prefix_hits:
            return list(SPORT_FEDERATION_MAP[max(prefix_hits, key=len)])
    return list(_DEFAULT_SOURCES)
|
||
|
||
|
||
def fields_for_sport(sport):
    """Return the per-sport field list (UI hint + enrichment scope).

    Matching mirrors select_sources_for_sport(): the input is lower-cased
    and stripped, looked up as an exact key of SPORT_FIELD_SCHEMA, and
    otherwise resolved to the LONGEST key it starts with. (Previously the
    first prefix hit in dict order won, which would pick a shorter, less
    specific key as soon as two keys share a prefix.)

    Args:
        sport: free-form sport name; may be None or empty.

    Returns:
        A new list of field names; a copy of _DEFAULT_FIELDS when the sport
        is empty or matches nothing.
    """
    if not sport:
        return list(_DEFAULT_FIELDS)
    s = sport.lower().strip()
    if s in SPORT_FIELD_SCHEMA:
        return list(SPORT_FIELD_SCHEMA[s])
    # Longest-prefix wins so that a more specific key beats a generic one.
    best = max(
        (key for key in SPORT_FIELD_SCHEMA if key and s.startswith(key)),
        key=len,
        default=None,
    )
    if best is not None:
        return list(SPORT_FIELD_SCHEMA[best])
    return list(_DEFAULT_FIELDS)
|
||
# ---- end sport-aware section ----
|
||
|
||
|
||
# Optional JS-aware fallback (Playwright). Never required: the import is
# attempted once at module load and any failure simply disables the fallback.
import sys as _sys
# Make the sibling `enrichment` package importable from its install location.
_sys.path.insert(0, '/opt/pgz-sport')
try:
    from enrichment import playwright_scraper as _pw_scraper
    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
    # Missing package / missing Playwright → callers check _HAS_PW and skip.
    _pw_scraper = None
    _HAS_PW = False

# DeepSeek chat-completions credentials/endpoint. An empty DEEPSEEK_KEY means
# no API key was configured in the environment.
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                              'https://api.deepseek.com/v1/chat/completions')
|
||
|
||
|
||
# ─── DB helpers ──────────────────────────────────────────────────────────
|
||
def _db():
    """Open a new psycopg2 connection using the DB settings, in autocommit mode.

    The caller owns the returned connection and is responsible for closing it.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
||
|
||
def _fetch_one(sql, p):
    """Execute `sql` with parameters `p` and return the first row as a dict.

    Args:
        sql: SQL statement with psycopg2 placeholders.
        p: parameter sequence/mapping for the placeholders.

    Returns:
        A plain dict for the first result row, or None when there is none.

    The connection is always closed before returning. The previous
    implementation relied on `with connection:`, which in psycopg2 only
    commits/rolls back the transaction and leaves the connection open, so
    every call leaked one connection.
    """
    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            row = cur.fetchone()
    finally:
        conn.close()
    return dict(row) if row else None
|
||
|
||
|
||
# ─── HTTP helpers ────────────────────────────────────────────────────────
|
||
def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch `url` over HTTP(S) and return at most ~150 kB of decoded text.

    Fail-soft: any network/HTTP error, an empty URL, or a non-http(s) URL
    yields None instead of raising.

    Args:
        url: absolute http:// or https:// URL.
        timeout: socket timeout in seconds.

    Returns:
        The response body as str, or None on any failure.

    Decoding: UTF-8 is tried first. Because the body is capped at 150000
    bytes, the cut can land inside a multi-byte character; in that case only
    the dangling tail bytes are dropped. Previously such a cut made the whole
    document fall back to latin-1, garbling every diacritic. A body that is
    genuinely not UTF-8 still falls back to latin-1.
    """
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(150000)
    except Exception:
        # Deliberate best-effort: callers treat None as "source unavailable".
        return None
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError as exc:
        # An incomplete sequence at the very end is a truncation artefact,
        # not evidence of a different encoding.
        if exc.end == len(data) and len(data) - exc.start < 4:
            return data[:exc.start].decode('utf-8')
        return data.decode('latin-1', 'ignore')
|
||
|
||
|
||
|
||
# ─── Redis cache + slow-enrichment telemetry (added 2026-05-10) ──────────
# Default lifetime, in seconds, of cached enrichment values.
ENRICH_CACHE_TTL = int(os.environ.get('ENRICH_CACHE_TTL_SECONDS', '86400'))  # 24h default
# File that _enrich_slow_log() appends to.
ENRICH_SLOW_LOG = '/var/log/enrich_slow.log'
# Duration (seconds) at or above which _enrich_slow_log() also sends a Telegram alert.
ENRICH_SLOW_TG_THRESHOLD = float(os.environ.get('ENRICH_SLOW_TG_THRESHOLD', '20.0'))
# Duration threshold (seconds) for slow-log records; not read in this part of
# the file — presumably applied by the callers of _enrich_slow_log().
ENRICH_SLOW_LOG_THRESHOLD = float(os.environ.get('ENRICH_SLOW_LOG_THRESHOLD', '10.0'))
|
||
|
||
# Process-wide Redis client, created on first successful connection.
_enrich_redis_singleton = None


def _enrich_redis():
    """Singleton Redis connection for enrichment cache. Returns None on failure (fail-soft).

    Renamed from _redis() to avoid collision with existing _redis_client() function
    elsewhere in this module.

    Connection settings come from REDIS_HOST / REDIS_PORT and
    REDIS_PASS (or REDIS_PASSWORD, surrounding quotes stripped). The
    configured password is tried first, then a password-less connection.
    Only a client that answered PING is cached; failures are not cached, so
    a later call retries.

    Returns:
        A connected `redis.Redis` client, or None when the redis package is
        missing, the port is malformed, or no connection attempt succeeds.
    """
    global _enrich_redis_singleton
    if _enrich_redis_singleton is not None:
        return _enrich_redis_singleton
    try:
        import redis
    except Exception:
        return None
    host = os.environ.get('REDIS_HOST', 'localhost')
    try:
        port = int(os.environ.get('REDIS_PORT', '6379'))
    except ValueError:
        # A malformed port must not break enrichment; behave as "no cache".
        return None
    pwd = (os.environ.get('REDIS_PASS') or os.environ.get('REDIS_PASSWORD') or '').strip().strip("'").strip('"') or None
    # Without a configured password there is only one distinct attempt;
    # trying None twice just doubled the connect timeout when Redis is down.
    attempts = (pwd, None) if pwd else (None,)
    for p in attempts:
        try:
            r = redis.Redis(host=host, port=port, password=p,
                            decode_responses=True, socket_connect_timeout=2,
                            socket_timeout=2)
            r.ping()
        except Exception:
            continue
        _enrich_redis_singleton = r
        return r
    return None
|
||
|
||
def _cache_get(key: str):
    """Read `key` from the enrichment cache and JSON-decode it.

    Fail-soft: returns None when Redis is unavailable, the key is absent,
    or the stored value cannot be read/parsed.
    """
    client = _enrich_redis()
    if not client:
        return None
    try:
        raw = client.get(key)
        return None if raw is None else json.loads(raw)
    except Exception:
        return None
|
||
|
||
def _cache_set(key: str, value, ttl: int = ENRICH_CACHE_TTL):
    """Store `value` as JSON under `key` with an expiry of `ttl` seconds.

    Non-JSON-native objects are stringified (default=str). Fail-soft:
    returns True on success and False when Redis is unavailable or the
    write/serialisation fails.
    """
    client = _enrich_redis()
    if not client:
        return False
    try:
        payload = json.dumps(value, default=str, ensure_ascii=False)
        client.setex(key, ttl, payload)
    except Exception:
        return False
    return True
|
||
|
||
def _cache_delete(key: str):
    """Remove `key` from the enrichment cache; silently does nothing on any failure."""
    client = _enrich_redis()
    if client:
        try:
            client.delete(key)
        except Exception:
            pass
|
||
|
||
def _enrich_slow_log(kind: str, eid: int, duration_s: float, cached: bool, error: str = None):
    """Record one slow enrichment run and, for extreme durations, alert via Telegram.

    A single line (UTC timestamp, kind, id, duration, cached flag, optional
    error) is appended to ENRICH_SLOW_LOG; a failed write is only logged as
    a warning. When `duration_s` reaches ENRICH_SLOW_TG_THRESHOLD and both a
    bot token and a chat id are present in the environment, a message is
    posted to the Telegram Bot API. Every failure of the alert is swallowed.
    """
    import logging as _lg

    stamp = datetime.now(timezone.utc).isoformat()
    suffix = ' error=' + error if error else ''
    record = f"{stamp} kind={kind} id={eid} duration={duration_s:.2f}s cached={cached}{suffix}\n"
    try:
        with open(ENRICH_SLOW_LOG, "a", encoding="utf-8") as fh:
            fh.write(record)
    except Exception as exc:
        _lg.getLogger("enrich").warning(f"slow-log write failed: {exc}")

    # Below the alert threshold there is nothing more to do.
    if not (duration_s >= ENRICH_SLOW_TG_THRESHOLD):
        return
    try:
        env = os.environ
        token = env.get('TG_BOT_TOKEN') or env.get('TELEGRAM_BOT_TOKEN')
        chat_id = env.get('TG_CHAT_ID') or env.get('TELEGRAM_CHAT_ID')
        if not (token and chat_id):
            return
        text = f"\u26a0\ufe0f enrich slow: {kind}/{eid} took {duration_s:.1f}s (cached={cached})"
        payload = urllib.parse.urlencode({'chat_id': chat_id, 'text': text}).encode('utf-8')
        urllib.request.urlopen(
            f"https://api.telegram.org/bot{token}/sendMessage",
            data=payload,
            timeout=3
        ).read()
    except Exception:
        pass
|
||
|
||
|
||
def _strip_tags(s: str) -> str:
    """Reduce an HTML fragment to plain text.

    <script> and <style> elements are dropped together with their content,
    remaining tags are replaced by spaces, HTML entities are decoded, and
    whitespace runs are collapsed to single spaces. Falsy input gives ''.
    """
    if not s:
        return ''
    # Remove elements whose content is code, not text, before touching tags.
    for block_pattern in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
        s = re.sub(block_pattern, ' ', s, flags=re.S | re.I)
    plain = html.unescape(re.sub(r'<[^>]+>', ' ', s))
    return re.sub(r'\s+', ' ', plain).strip()
|
||
|
||
|
||
def _extract_meta(html_doc: str, url: str) -> dict:
    """Pull the page title and description out of raw HTML.

    Returns {} for empty input. Otherwise the dict always carries 'url' and
    'fetched_at' (Unix seconds), plus 'title' (max 300 chars) and
    'description' (max 600 chars) when found. The description comes from
    <meta name="description">, falling back to og:description.
    """
    if not html_doc:
        return {}
    info = {'url': url, 'fetched_at': int(time.time())}

    title_hit = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if title_hit:
        info['title'] = html.unescape(title_hit.group(1).strip())[:300]

    desc_hit = (
        re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
        or re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
    )
    if desc_hit:
        info['description'] = html.unescape(desc_hit.group(1).strip())[:600]
    return info
|
||
|
||
|
||
def _fetch_title(url, timeout=5):
    """Fetch `url` and return its title/description metadata.

    On a failed fetch the result is {'url': url, 'error': 'fetch failed'},
    or None when `url` itself is empty.
    """
    body = _http_get(url, timeout=timeout)
    if body:
        return _extract_meta(body, url)
    if url:
        return {'url': url, 'error': 'fetch failed'}
    return None
|
||
|
||
|
||
# ─── Field extractors ───────────────────────────────────────────────────
# E-mail address, case-insensitive.
RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
# Phone candidate: optional "+", "385" country code (or a leading "0"),
# followed by digits mixed with spaces, dashes or slashes. The pattern has no
# digit boundaries, so it can also match inside longer digit runs.
RE_PHONE = re.compile(r'(?:\+?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
# Absolute http(s) URL, terminated by whitespace, quotes, angle brackets,
# ")" or "]".
RE_URL = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)
|
||
|
||
def _find_email(text: str) -> Optional[str]:
    """Return the first plausible e-mail address in `text`, lower-cased.

    Addresses containing any of the known placeholder / tooling markers
    (example domains, asset names such as '@2x.', noreply senders, ...) are
    skipped. Returns None when nothing usable is found.
    """
    if not text:
        return None
    junk_markers = ('@example.', '@test.', '@email.', 'wixpress.com',
                    'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
    for hit in RE_EMAIL.finditer(text):
        address = hit.group(0).lower().rstrip('.,;:)')
        if not any(marker in address for marker in junk_markers):
            return address
    return None
|
||
|
||
def _find_phone(text: str) -> Optional[str]:
    """Return the first plausible Croatian phone number found in `text`.

    Candidates come from RE_PHONE and must contain 8–13 digits. Whitespace
    runs inside the number are collapsed to single spaces. Numbers written
    with the '+385' or '00385' country prefix are normalised to
    '+385 <rest>'; everything else is returned as written.

    Fix: the prefixed branches used to be built from the raw match rather
    than the whitespace-normalised string, so newlines and double spaces
    (e.g. '+385\\n51  123 456' or '+385 - 51 ...') leaked into the result.

    Returns:
        The normalised number, or None when `text` is empty or has no match.
    """
    if not text:
        return None
    for m in RE_PHONE.finditer(text):
        raw = m.group(0).strip()
        digits = re.sub(r'\D', '', raw)
        if not (8 <= len(digits) <= 13):
            continue
        cleaned = re.sub(r'\s+', ' ', raw)
        for prefix in ('+385', '00385'):
            if cleaned.startswith(prefix):
                # Drop any separator between the country code and the number.
                return '+385 ' + cleaned[len(prefix):].lstrip(' -/')
        return cleaned
    return None
|
||
|
||
def _find_official_web(text: str, hint: str = '') -> Optional[str]:
    """Pick the most likely official website URL mentioned in `text`.

    URLs whose host contains any blocked fragment (encyclopedias, social
    networks, aggregators, government/registry sites, our own domains) are
    discarded. If `hint` is given, its first 8 ASCII alphanumerics are used
    as a slug and the first URL whose host (dots/dashes removed) contains
    that slug wins; otherwise the first surviving URL is returned.
    """
    if not text:
        return None
    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
               'rinet.one', 'pgz.hr')
    # Keep (url, host) pairs so the hint pass does not have to re-parse.
    kept: list[tuple[str, str]] = []
    for hit in RE_URL.finditer(text):
        link = hit.group(0).rstrip('.,;:)\'"')
        try:
            host = urllib.parse.urlparse(link).hostname or ''
        except Exception:
            continue
        if host and not any(fragment in host for fragment in blocked):
            kept.append((link, host))
    if not kept:
        return None
    if hint:
        slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8]
        if slug:
            for link, host in kept:
                if slug in host.replace('-', '').replace('.', ''):
                    return link
    return kept[0][0]
|
||
|
||
|
||
# ─── External sources ────────────────────────────────────────────────────
|
||
def _wiki_variants(query: str) -> list[str]:
    """List the Wikipedia HR page titles worth trying for `query`.

    The REST summary endpoint needs an exact title, and clubs are commonly
    filed under their abbreviation. So besides the trimmed query itself, a
    name of the form "<sport adjective> klub <rest>" also yields
    "<ABBR> <rest>", with ALL-CAPS words in <rest> turned into Capitalised
    ones (e.g. "Košarkaški klub KVARNER 2010" → "KK Kvarner 2010").
    """
    if not query:
        return []
    title = query.strip()
    variants = [title] if title else []

    abbreviations = {
        'košarkaški': 'KK', 'kosarkaski': 'KK',
        'nogometni': 'NK', 'rukometni': 'RK',
        'odbojkaški': 'OK', 'odbojkaski': 'OK',
        'vaterpolski': 'VK', 'plivacki': 'PK', 'plivački': 'PK',
        'boćarski': 'BK', 'bocarski': 'BK',
    }
    words = title.split()
    if len(words) >= 3 and words[1].lower() == 'klub':
        abbr = abbreviations.get(words[0].lower())
        if abbr is not None:
            tail = ' '.join(w.capitalize() if w.isupper() else w for w in words[2:])
            short = abbr + ' ' + tail
            if short not in variants:
                variants.append(short)
    return variants
|
||
|
||
def _wiki_summary(query: str) -> Optional[dict]:
    """Look up `query` on Croatian Wikipedia via the REST summary endpoint.

    Each title variant from _wiki_variants() is tried in order; the first
    one that returns a real article (not a disambiguation page and with a
    non-empty extract) is returned as a source document. None if none match.
    """
    for variant in _wiki_variants(query):
        slug = urllib.parse.quote(variant.replace(' ', '_'), safe='')
        raw = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{slug}', timeout=5)
        if not raw:
            continue
        try:
            page = json.loads(raw)
        except Exception:
            continue
        unusable = page.get('type') in ('disambiguation', 'no-extract') or not page.get('extract')
        if unusable:
            continue
        desktop_urls = page.get('content_urls', {}).get('desktop', {})
        return {
            'source': 'wikipedia.hr',
            'url': desktop_urls.get('page'),
            'title': page.get('title'),
            'extract': page.get('extract'),
            'description': page.get('description'),
            'matched_variant': variant,
        }
    return None
|
||
|
||
|
||
def _sport_pgz_search(query: str) -> Optional[dict]:
    """Search sport-pgz.hr for `query` and describe the first hit.

    The site search page is fetched and the first result link is taken
    (an <article> bookmark link, else any sport-pgz.hr anchor). The linked
    page is then fetched for its title, description and stripped text.
    Whenever plain HTTP yields nothing usable, the optional Playwright
    scraper is used instead, if available. Returns None when nothing is found.
    """
    def _js_fallback():
        # JS-rendered scrape, only when the optional Playwright module loaded.
        if _HAS_PW and _pw_scraper is not None:
            return _pw_scraper.scrape_sport_pgz_klub(query)
        return None

    if not query:
        return None
    listing = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
    if not listing:
        return _js_fallback()

    found = re.search(
        r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
        listing, re.S | re.I)
    if not found:
        found = re.search(
            r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>',
            listing, re.I)
    if not found:
        return _js_fallback()

    link = found.group(1)
    detail = _http_get(link, timeout=6)
    if not detail:
        # Detail page unreachable: report what the search listing told us.
        return {'source': 'sport-pgz.hr', 'url': link, 'title': html.unescape(found.group(2).strip())}

    plain = _strip_tags(detail)[:4000]
    meta = _extract_meta(detail, link)
    return {
        'source': 'sport-pgz.hr',
        'url': link,
        'title': meta.get('title') or html.unescape(found.group(2).strip()),
        'extract': meta.get('description') or plain[:500],
        'raw_text': plain,
    }
|
||
|
||
|
||
# ─── kosarkapgz.com (Košarkaški savez PGŽ) ──────────────────────────────
# Added 2026-05-10. WordPress site with permalinks DISABLED — REST API is
# reachable only via `?rest_route=/...` (the `/wp-json/...` paths 404).
# Use as enrichment source for PGŽ basketball entities (savez/klub/sportaš).
# Base URL without trailing slash; callers append '/' or '/?…' themselves.
_KOSARKAPGZ_BASE = 'https://kosarkapgz.com'
|
||
|
||
def _kosarkapgz_is_basketball_pgz(naziv: str, sport: Optional[str]) -> bool:
    """Decide whether an entity looks like basketball, i.e. worth a kosarkapgz.com lookup.

    Region is not checked: per the original note, pgz_sport.klubovi/savezi
    are PGŽ-scoped by design (klubovi.is_pgz default true, savezi.region
    default 'PGŽ'). A basketball signal in either the sport or the name is
    enough — the stem 'kosark' after folding diacritics, or a name starting
    with the word 'KK' (Košarkaški klub).
    """
    fold_diacritics = str.maketrans({'ć': 'c', 'š': 's', 'ž': 'z', 'č': 'c', 'đ': 'd'})
    name = (naziv or '').lower().translate(fold_diacritics)
    discipline = (sport or '').lower().translate(fold_diacritics)
    if 'kosark' in discipline or 'kosark' in name:
        return True
    return re.match(r'^\s*kk\b', name) is not None
|
||
|
||
def _kosarkapgz_savez_meta() -> Optional[dict]:
    """Fetch homepage + key WP pages from kosarkapgz.com. Cached 24h.

    Also runs structured extractors on /?page_id=234 (Upravni odbor) and
    /?page_id=70 (Kontakt), and persists the result into civic.entities +
    civic.persons on every cache miss. See _kosarkapgz_persist_kspgz.

    Returns:
        A source document (source/url/title/extract/pages/structured/
        raw_text), or None when the homepage cannot be fetched.

    Fix: an unreachable homepage is now cached as an empty dict for 15
    minutes and recognised on read. Before, None was stored, which
    _cache_get() cannot tell apart from a missing key, so the negative
    cache never took effect and every call hit the site again.
    """
    cache_key = 'kosarkapgz:savez_meta:v1'
    cached = _cache_get(cache_key)
    if cached is not None:
        # {} is the "site was unreachable" marker written below.
        return cached or None

    home = _http_get(_KOSARKAPGZ_BASE + '/', timeout=8)
    if not home:
        _cache_set(cache_key, {}, ttl=900)
        return None
    meta = _extract_meta(home, _KOSARKAPGZ_BASE + '/')

    page_ids = {'povijest': 232, 'upravni_odbor': 234, 'kontakt': 70, 'objave': 217}
    page_html: dict[str, str] = {}
    page_text: dict[str, str] = {}
    parts: list[str] = [meta.get('description', '') or '']
    for slug, pid in page_ids.items():
        body = _http_get(f'{_KOSARKAPGZ_BASE}/?page_id={pid}', timeout=8)
        if not body:
            continue
        page_html[slug] = body
        text = _strip_tags(body)
        if text:
            page_text[slug] = text[:5000]
            parts.append(f'[{slug}] ' + text[:5000])

    # Structured extraction + idempotent persistence.
    osobe = _kosarkapgz_extract_osobe(page_html.get('upravni_odbor', '')) or []
    adresa = _kosarkapgz_extract_adresa(page_html.get('kontakt', '')) or {}
    persist = _kosarkapgz_persist_kspgz(osobe, adresa) if (osobe or adresa) else {}

    result = {
        'source': 'kosarkapgz.com',
        'url': _KOSARKAPGZ_BASE + '/',
        'title': meta.get('title') or 'Košarkaški savez PGŽ',
        'extract': (meta.get('description') or '')[:600],
        'pages': page_text,
        'structured': {
            'upravni_odbor': osobe,
            'adresa': adresa,
            'persist': persist,
        },
        'raw_text': ('\n\n'.join(p for p in parts if p))[:80000],
    }
    _cache_set(cache_key, result, ttl=86400)
    return result
|
||
|
||
def _kosarkapgz_search_posts(query: str, limit: int = 5) -> Optional[dict]:
    """WP REST search for posts mentioning `query`. Source-doc shape.

    Args:
        query: search phrase; fewer than 3 characters after trimming is ignored.
        limit: maximum number of posts, clamped to 1..20.

    Returns:
        A source document with 'articles' (title/excerpt/date/link each) and
        a combined 'raw_text', or None when there is no usable result.
        Hits are cached for 24h, misses for 1h.

    Fixes over the previous version:
      * The cache key now includes a digest of the query and the clamped
        limit. The old key kept only [a-z0-9], so queries differing only in
        diacritics shared one entry, and a result cached for a small limit
        was served for a larger one.
      * The clamped limit is used for slicing as well, so limit <= 0 no
        longer produces an empty or oddly sliced article list.
      * Titles/excerpts are no longer HTML-unescaped twice (_strip_tags
        already decodes entities).
      * Non-list responses and non-object entries are treated as "no result".
    """
    import hashlib  # function-scope: only needed for the cache key

    if not query:
        return None
    q = query.strip()
    if len(q) < 3:
        return None

    per_page = max(1, min(limit, 20))
    folded = q.lower()
    slug = re.sub(r'[^a-z0-9]+', '_', folded)[:60]  # human-readable part
    digest = hashlib.sha256(folded.encode('utf-8', 'replace')).hexdigest()[:12]
    cache_key = f'kosarkapgz:search:{slug}:{digest}:{per_page}:v2'
    cached = _cache_get(cache_key)
    if cached is not None:
        # [] is the cached "nothing found" marker.
        return cached or None

    url = (f'{_KOSARKAPGZ_BASE}/?rest_route=/wp/v2/posts'
           f'&search={urllib.parse.quote(q)}&per_page={per_page}'
           f'&_fields=id,date,slug,title,excerpt,link,categories')
    body = _http_get(url, timeout=8)
    posts = None
    if body and body.lstrip().startswith('['):
        try:
            posts = json.loads(body)
        except Exception:
            posts = None

    articles = []
    if isinstance(posts, list):
        for p in posts[:per_page]:
            if not isinstance(p, dict):
                continue
            title_html = ((p.get('title') or {}).get('rendered')) or ''
            excerpt_html = ((p.get('excerpt') or {}).get('rendered')) or ''
            articles.append({
                'title': _strip_tags(title_html)[:300],
                'excerpt': _strip_tags(excerpt_html)[:400],
                'date': p.get('date'),
                'link': p.get('link'),
            })
    if not articles:
        _cache_set(cache_key, [], ttl=3600)
        return None

    blob_lines = [
        f"{(a.get('date') or '')[:10]} — {a.get('title','')}\n"
        f"{a.get('excerpt','')}\n{a.get('link','')}"
        for a in articles
    ]
    result = {
        'source': 'kosarkapgz.com',
        'url': _KOSARKAPGZ_BASE + '/',
        'title': f'kosarkapgz.com — {len(articles)} mention(s) for "{q[:60]}"',
        'extract': blob_lines[0] if blob_lines else '',
        'articles': articles,
        'raw_text': ('\n\n'.join(blob_lines))[:60000],
    }
    _cache_set(cache_key, result, ttl=86400)
    return result
|
||
|
||
|
||
|
||
# ─── kosarkapgz.com — structured extractors (page_id=234, page_id=70) ────
# Added 2026-05-10. Cheap, deterministic parsers for two specific WP pages
# whose layout is known:
#   ?page_id=234 → "Upravni odbor" — entry-content is a single <ul> of names,
#                  optionally suffixed with " – funkcija" (en/em dash).
#   ?page_id=70  → "Kontakt" — the OG description packs ulica/grad/email/
#                  tel/web in a single string ("Adresa Verdijeva 11/III51000
#                  RijekaHrvatska E-mail: ... Mob: ... Tel: ... http://...").
# Period of office is not published on either page, so persons.period is
# always None today; the field is kept so future scrapers (kszpg.hr archive)
# can fill it in without a schema change.

# Name/role separator. Note that the class also contains the plain ASCII
# hyphen, so it matches the hyphen inside a hyphenated surname as well.
_KPGZ_DASH_RE = re.compile(r'\s*[\u2013\u2014\-]\s*')  # en/em/ascii dash
# One <li>…</li> item (content captured, may span lines).
_KPGZ_LI_RE = re.compile(r'<li[^>]*>(.*?)</li>', re.I | re.S)
# Any HTML tag.
_KPGZ_TAG_RE = re.compile(r'<[^>]+>')
# content="…" of the og:description meta tag (property must precede content).
_KPGZ_OG_DESC = re.compile(r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)', re.I)
_KPGZ_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
# Number following a "Tel" / "Mob" label (captured without the label).
_KPGZ_TEL_RE = re.compile(r'(?i)Tel\s*[:\.\-]?\s*(\+?[\d\s().\-/]{6,})')
_KPGZ_MOB_RE = re.compile(r'(?i)Mob\s*[:\.\-]?\s*(\+?[\d\s().\-/]{6,})')
_KPGZ_WEB_RE = re.compile(r'https?://[\w.\-]+(?:/[\w.\-/?#=&%]*)?', re.I)
# "Verdijeva 11/III" → ulica="Verdijeva", broj="11/III"
_KPGZ_ULICA_RE = re.compile(r'(?i)Adresa\s+([A-ZČĆĐŠŽa-zčćđšž\s]+?)\s+([\dA-Za-z][\w./\-]*)')
# "51000 Rijeka" — the city stops before the next known label or end of text.
_KPGZ_POSTAL_RE = re.compile(r'(\d{5})\s+([A-ZČĆĐŠŽa-zčćđšž][A-ZČĆĐŠŽa-zčćđšž\s\-]+?)(?=\s+Hrvatska|\s+E-mail|\s+Mob|\s+Tel|$)')
|
||
|
||
# Separator between a member's name and their role: an en/em dash with any
# spacing, or an ASCII hyphen with whitespace on at least one side. A hyphen
# with no surrounding whitespace is part of a hyphenated name, not a separator.
_KPGZ_NAME_ROLE_SPLIT_RE = re.compile(r'\s*[\u2013\u2014]\s*|\s+-\s*|\s*-\s+')


def _kosarkapgz_extract_osobe(html_doc: str) -> list[dict]:
    """Return [{'ime':, 'prezime':, 'funkcija':, 'period':}] for ?page_id=234.

    Only <li> items inside the first <div class="entry-content…"> are read.
    Each item is "Name Surname" optionally followed by a dash and the role;
    the role is lower-cased, 'period' is always None. Items with fewer than
    two name tokens are skipped.

    Fix: the name/role split no longer fires on a bare ASCII hyphen, so a
    hyphenated surname such as "Brkić-Tomić" stays intact instead of being
    cut into a name plus a bogus role. A malformed opening tag without '>'
    now yields [] instead of scanning from the start of the document.

    Args:
        html_doc: raw HTML of the "Upravni odbor" page.

    Returns:
        One dict per recognised person, in page order; [] if none.
    """
    if not html_doc:
        return []
    # Find the entry-content div by hand (depth-balanced) to avoid grabbing
    # JSON-LD / scripts outside the article body.
    i = html_doc.find('class="entry-content')
    if i < 0:
        return []
    tag_end = html_doc.find('>', i)
    if tag_end < 0:
        return []
    j = tag_end + 1
    depth, k = 1, j
    while k < len(html_doc) and depth > 0:
        no = html_doc.find('<div', k)
        nc = html_doc.find('</div>', k)
        if nc < 0:
            break
        if 0 <= no < nc:
            depth += 1
            k = no + 4
        else:
            depth -= 1
            k = nc + 6
    # Balanced: stop right before the closing </div>; otherwise take the rest.
    body = html_doc[j:k - 6] if depth == 0 else html_doc[j:]

    out: list[dict] = []
    for raw in _KPGZ_LI_RE.findall(body):
        text = html.unescape(_KPGZ_TAG_RE.sub(' ', raw)).strip()
        text = re.sub(r'\s+', ' ', text)
        if not text:
            continue
        parts = _KPGZ_NAME_ROLE_SPLIT_RE.split(text, maxsplit=1)
        person = parts[0].strip()
        funkcija = parts[1].strip().lower() if len(parts) > 1 else None
        # Split person → first name(s) + last name on last whitespace.
        toks = person.split()
        if len(toks) < 2:
            continue
        out.append({
            'ime': ' '.join(toks[:-1]), 'prezime': toks[-1],
            'funkcija': funkcija, 'period': None,
        })
    return out
|
||
|
||
def _kosarkapgz_extract_adresa(html_doc: str) -> dict:
    """Return {'ulica','broj','grad','postal_code','telefon','mobitel','email','web'}.

    Prefers the og:description meta on /?page_id=70 — it is one clean string
    the site re-renders verbatim from the page body. Falls back to stripping
    the page itself if the meta is missing.

    Only the keys that could be extracted are present; when more than one
    distinct e-mail address is found, the rest are returned under
    'email_extra'. Empty input, or a page with no text, gives {}.
    """
    if not html_doc:
        return {}
    desc_match = _KPGZ_OG_DESC.search(html_doc)
    text = html.unescape(desc_match.group(1)) if desc_match else _strip_tags(html_doc)
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return {}
    # The og:description glues fields without whitespace
    # ("11/III51000 RijekaHrvatska E-mail: a@b.com, c@d.comMob: ...").
    # Inject separators so the field regexes don't gobble across boundaries.
    # The three substitutions below are order-dependent:
    #   1. space before a 5-digit postal code glued to a non-digit;
    text = re.sub(r'(?<=\D)(?=\d{5}\s)', ' ', text)
    #   2. space at every lower→UPPER letter transition ("RijekaHrvatska");
    text = re.sub(r'(?<=[a-zćčđšž])(?=[A-ZČĆĐŠŽ])', ' ', text)
    #   3. exactly one space before each known label / URL start.
    # NOTE(review): pattern 3 has no word boundary, so it also splits inside
    # longer words or addresses containing "tel", "mob", "http", …; pattern 2
    # likewise splits mixed-case e-mail addresses. Fine for the known page
    # layout — re-check if the page content changes.
    text = re.sub(r'(?i)\s*(Hrvatska|E-?mail|Mob|Tel|IBAN|http)', r' \1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    out: dict = {}
    m = _KPGZ_ULICA_RE.search(text)
    if m:
        out['ulica'] = m.group(1).strip()
        out['broj'] = m.group(2).strip()
    m = _KPGZ_POSTAL_RE.search(text)
    if m:
        out['postal_code'] = m.group(1)
        out['grad'] = m.group(2).strip()
    # De-duplicate while keeping first-seen order.
    emails = list(dict.fromkeys(_KPGZ_EMAIL_RE.findall(text)))
    if emails:
        out['email'] = emails[0]
        if len(emails) > 1:
            out['email_extra'] = emails[1:]
    m = _KPGZ_TEL_RE.search(text)
    if m:
        out['telefon'] = re.sub(r'\s+', ' ', m.group(1)).strip()
    m = _KPGZ_MOB_RE.search(text)
    if m:
        out['mobitel'] = re.sub(r'\s+', ' ', m.group(1)).strip()
    # Pick the first http(s) URL that isn't a mailto / schema.org probe.
    for u in _KPGZ_WEB_RE.findall(text):
        if 'schema.org' in u: continue
        out['web'] = u.rstrip('.,')
        break
    return out
|
||
|
||
# Canonical civic.entities name for the PGŽ county basketball federation.
# (Per the original note: civic.entities had rows for many county basketball
# savezi but NOT for the PGŽ one as of 2026-05-10 — it is inserted on the
# first persist call.)
_KPGZ_KSPGZ_NAME = 'Košarkaški savez Primorsko-goranske županije'
# Lower-case spellings, with and without diacritics, that identify it.
_KPGZ_KSPGZ_ALIASES = (
    _KPGZ_KSPGZ_NAME.lower(),
    'košarkaški savez pgž',
    'kosarkaski savez pgz',
    'košarkaški savez primorsko-goranske županije',
    'kosarkaski savez primorsko-goranske zupanije',
)


def _kosarkapgz_is_kspgz(naziv: str) -> bool:
    """Tell whether `naziv` names the PGŽ county basketball federation itself.

    True when the lower-cased name contains one of the known aliases, or —
    more permissively — mentions "košarkaški savez" together with a county
    token (pgž / pgz / primorsko). Other federations such as
    "... Grada Rijeke" or the national HKS therefore give False.
    """
    if not naziv:
        return False
    name = naziv.lower().strip()
    for alias in _KPGZ_KSPGZ_ALIASES:
        if alias in name:
            return True
    mentions_savez = 'košarkaški savez' in name or 'kosarkaski savez' in name
    mentions_county = any(token in name for token in ('pgž', 'pgz', 'primorsko'))
    return mentions_savez and mentions_county
|
||
|
||
def _kosarkapgz_street_address(adresa: dict) -> Optional[str]:
    """Join street and house number; None unless both were extracted."""
    if adresa.get('ulica') and adresa.get('broj'):
        return f"{adresa['ulica']} {adresa['broj']}"
    return None


def _kosarkapgz_persist_entity(cur, adresa: dict, res: dict) -> None:
    """Insert or patch the KSPGŽ row in civic.entities; records id/action in `res`."""
    cur.execute(
        "SELECT id, address, city, county, phone, email, website "
        "FROM civic.entities WHERE lower(name) = lower(%s) LIMIT 1",
        [_KPGZ_KSPGZ_NAME],
    )
    row = cur.fetchone()
    addr_full = _kosarkapgz_street_address(adresa)
    if not row:
        cur.execute(
            "INSERT INTO civic.entities "
            "  (name, short_name, entity_type, address, city, county, "
            "   postal_code, phone, email, website, metadata, sources) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
            "RETURNING id",
            [_KPGZ_KSPGZ_NAME, 'KSPGŽ', 'sportski_savez',
             addr_full, adresa.get('grad'), 'Primorsko-goranska',
             adresa.get('postal_code'), adresa.get('telefon'),
             adresa.get('email'), 'https://kosarkapgz.com/',
             json.dumps({'sport': 'košarka', 'level': 'regional',
                         'aliases': ['KSPGŽ', 'Košarkaški savez PGŽ']}),
             ['kosarkapgz.com page_id=70']],
        )
        res['entity_id'] = cur.fetchone()['id']
        res['entity_action'] = 'inserted'
        return

    eid = row['id']
    # Patch any missing address-ish field. Never overwrite existing
    # non-null values — they may have been hand-curated.
    set_clauses, params = [], []
    if addr_full and not row.get('address'):
        set_clauses.append('address = %s'); params.append(addr_full)
    if adresa.get('grad') and not row.get('city'):
        set_clauses.append('city = %s'); params.append(adresa['grad'])
    if not row.get('county'):
        set_clauses.append('county = %s'); params.append('Primorsko-goranska')
    if adresa.get('telefon') and not row.get('phone'):
        set_clauses.append('phone = %s'); params.append(adresa['telefon'])
    if adresa.get('email') and not row.get('email'):
        set_clauses.append('email = %s'); params.append(adresa['email'])
    if adresa.get('postal_code') and adresa.get('grad'):
        # coalesce keeps an existing postal code; the old code overwrote it
        # unconditionally, contradicting the rule above.
        set_clauses.append('postal_code = coalesce(postal_code, %s)')
        params.append(adresa['postal_code'])
    if not row.get('website'):
        set_clauses.append('website = %s'); params.append('https://kosarkapgz.com/')
    # Always extend sources[] with our marker.
    set_clauses.append(
        "sources = (SELECT array_agg(DISTINCT x) FROM unnest(coalesce(sources, ARRAY[]::text[]) || ARRAY['kosarkapgz.com page_id=70']) AS x)"
    )
    set_clauses.append('updated_at = now()')
    params.append(eid)
    cur.execute(
        f"UPDATE civic.entities SET {', '.join(set_clauses)} WHERE id = %s",
        params,
    )
    res['entity_id'] = eid
    res['entity_action'] = 'updated'


def _kosarkapgz_persist_persons(cur, eid, osobe, adresa: dict, res: dict) -> None:
    """Upsert one civic.persons row per board member, keyed on name+entity+source."""
    source_key = 'kosarkapgz.com page_id=234'
    for p in osobe or []:
        fullname = f"{p['ime']} {p['prezime']}".strip()
        if not fullname:
            continue
        cur.execute(
            "SELECT id, function, metadata FROM civic.persons "
            "WHERE lower(name) = lower(%s) AND entity_id = %s "
            "  AND coalesce(metadata->>'source','') = %s "
            "LIMIT 1",
            [fullname, eid, source_key],
        )
        existing = cur.fetchone()
        meta = {
            'source': source_key,
            'source_url': 'https://kosarkapgz.com/?page_id=234',
            'ime': p.get('ime'),
            'prezime': p.get('prezime'),
            'period': p.get('period'),
            'scraped_at': int(time.time()),
        }
        if existing:
            cur.execute(
                "UPDATE civic.persons SET function = %s, metadata = %s, "
                "  updated_at = now() WHERE id = %s",
                [p.get('funkcija') or existing.get('function'),
                 json.dumps(meta), existing['id']],
            )
            res['persons_updated'] += 1
        else:
            cur.execute(
                "INSERT INTO civic.persons "
                "  (name, function, level, entity_id, county, city, metadata, trust_tier) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
                [fullname, p.get('funkcija'), 'savez',
                 eid, 'Primorsko-goranska', adresa.get('grad'),
                 json.dumps(meta), 3],
            )
            res['persons_inserted'] += 1


def _kosarkapgz_persist_savez(cur, osobe, adresa: dict, res: dict) -> None:
    """Sync predsjednik/email/telefon/web onto the KSPGŽ row of pgz_sport.savezi."""
    # Predsjednik = first Upravni odbor member whose funkcija ~ 'predsjednik'
    predsj = None
    for p in (osobe or []):
        if (p.get('funkcija') or '').lower().startswith('predsj'):
            predsj = f"{p.get('ime','')} {p.get('prezime','')}".strip()
            break

    cur.execute("SELECT id, predsjednik, tajnik, email, telefon, web "
                "FROM pgz_sport.savezi WHERE lower(naziv) IN "
                "  (lower(%s), lower(%s)) LIMIT 1",
                ['Košarkaški savez PGŽ', 'Košarkaški savez Primorsko-goranske županije'])
    srow = cur.fetchone()
    if not srow:
        res['savez_updates'] = {'note': 'KSPGŽ row not found in pgz_sport.savezi'}
        return

    sid = srow['id']
    sets, params, applied = [], [], {}

    def _set_if_changed(col, new_val):
        # Skip empty values and values equal to the stored one (modulo spaces).
        if not new_val:
            return
        cur_val = srow.get(col)
        if cur_val and str(cur_val).strip() == str(new_val).strip():
            return
        sets.append(f"{col} = %s")
        params.append(new_val)
        applied[col] = {'before': cur_val, 'after': new_val}

    _set_if_changed('predsjednik', predsj)
    # tajnik not reliably published; skip unless future scrape adds it
    _set_if_changed('email', adresa.get('email'))
    _set_if_changed('telefon', adresa.get('telefon'))
    # Web: prefer kosarkapgz.com over generic sport-pgz.hr aggregator
    if not srow.get('web') or 'sport-pgz.hr' in (srow.get('web') or ''):
        _set_if_changed('web', 'https://kosarkapgz.com/')

    if sets:
        params.append(sid)
        cur.execute(
            f"UPDATE pgz_sport.savezi SET {', '.join(sets)} WHERE id = %s",
            params,
        )
        res['savez_updates'] = {'savez_id': sid, 'fields': applied}
    else:
        res['savez_updates'] = {'savez_id': sid, 'fields': {}, 'note': 'no diff'}


def _kosarkapgz_persist_kspgz(osobe: list[dict], adresa: dict) -> dict:
    """Persist scraped KSPGŽ data: civic.entities, civic.persons, pgz_sport.savezi.

    Upserts the civic.entities row for KSPGŽ and one civic.persons row per
    Upravni odbor member, then updates predsjednik/email/telefon/web on the
    matching pgz_sport.savezi row (looked up by naziv). Everything runs in
    one transaction with the provenance.* settings applied via SET LOCAL.

    Idempotent: persons keyed on (lower(name), entity_id, metadata->>'source').

    Args:
        osobe: board members as produced by _kosarkapgz_extract_osobe().
        adresa: contact fields as produced by _kosarkapgz_extract_adresa().

    Returns:
        A report dict: entity_id, entity_action ('noop'/'inserted'/'updated'),
        persons_inserted, persons_updated, savez_updates, errors. Never raises;
        failures are listed under 'errors'.

    Fixes over the previous version:
      * The savezi step runs inside a SAVEPOINT. A SQL error there used to
        be swallowed while leaving the transaction aborted, so the final
        COMMIT silently discarded the entity/person writes that the report
        still claimed.
      * When the transaction fails, the report is reset so it no longer
        lists writes that were rolled back.
      * An existing postal_code is no longer overwritten.
      * Rollback/close no longer rely on a NameError being swallowed when
        the connection could not be opened.
    """
    # # KSPGZ-SAVEZI-PATCH-20260510
    from datetime import date as _date

    res = {
        'entity_id': None, 'entity_action': 'noop',
        'persons_inserted': 0, 'persons_updated': 0,
        'savez_updates': {},  # pgz_sport.savezi field changes
        'errors': [],
    }
    conn = None
    try:
        conn = _db()
        conn.autocommit = False  # need a transaction for SET LOCAL
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # ── provenance attribution for this whole transaction ─────────
            cur.execute("SET LOCAL provenance.source = 'kosarkapgz.com'")
            cur.execute("SET LOCAL provenance.source_url = 'https://kosarkapgz.com/'")
            cur.execute("SET LOCAL provenance.edited_by = 'kosarkapgz_scraper@enrich_router'")
            cur.execute("SET LOCAL provenance.trust_tier = '2'")  # 1=visit, 2=verified site, 3=oss
            cur.execute("SET LOCAL provenance.source_date = %s", (_date.today().isoformat(),))

            _kosarkapgz_persist_entity(cur, adresa, res)
            _kosarkapgz_persist_persons(cur, res['entity_id'], osobe, adresa, res)

            # ── pgz_sport.savezi update ───────────────────────────────────
            # Best-effort: isolate it so a failure cannot poison the
            # entity/person writes made above.
            cur.execute("SAVEPOINT kspgz_savez")
            try:
                _kosarkapgz_persist_savez(cur, osobe, adresa, res)
            except Exception as e:
                cur.execute("ROLLBACK TO SAVEPOINT kspgz_savez")
                res['savez_updates'] = {}
                res['errors'].append(f"savez_update: {e}")
            else:
                cur.execute("RELEASE SAVEPOINT kspgz_savez")
        conn.commit()
    except Exception as e:
        if conn is not None:
            try:
                conn.rollback()
            except Exception:
                pass
        # Nothing was committed: do not report the writes attempted above.
        if res['entity_action'] == 'inserted':
            res['entity_id'] = None
        res['entity_action'] = 'noop'
        res['persons_inserted'] = 0
        res['persons_updated'] = 0
        res['savez_updates'] = {}
        res['errors'].append(str(e))
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass
        # Bust the kosarkapgz cache so next call re-fetches and re-persists
        try:
            _cache_delete('kosarkapgz:savez_meta:v1')
        except Exception:
            pass
    return res
|
||
|
||
|
||
def _fetch_primary_site(url: str) -> Optional[dict]:
    """Download *url* and package it as a source document.

    Returns None when `_http_get` yields nothing. Otherwise returns a dict
    with `source` (hostname, or the URL itself when no hostname parses),
    `url`, `title`, `extract` (meta description, else the first 500 chars of
    the tag-stripped text) and `raw_text` (first 8000 chars of that text).
    """
    page = _http_get(url, timeout=6)
    if not page:
        return None
    plain = _strip_tags(page)
    page_meta = _extract_meta(page, url)
    host = urllib.parse.urlparse(url).hostname
    summary = page_meta.get('description')
    if not summary:
        summary = plain[:500]
    return dict(
        source=host or url,
        url=url,
        title=page_meta.get('title'),
        extract=summary,
        raw_text=plain[:8000],
    )
|
||
|
||
|
||
# ─── Anti-halucinacija safeguard (added 2026-05-10 per Damir directive) ────
|
||
# When LLM-synthesised descriptions mention famous Croatian sports/political
|
||
# names, those mentions MUST be backed by source text — otherwise we strip
|
||
# the offending sentence and log to civic.fact_check_log. Triggered after
|
||
# Damir caught a "Toni Kukoč briljirao" fabrication for KK Kvarner 2010
|
||
# (Kukoč played for Jugoplastika Split, not Kvarner).
|
||
# Names whose appearance in LLM output must be backed by the scraped
# evidence. Matching is a plain case-sensitive substring test.
FAMOUS_PEOPLE_REQUIRE_PROOF = (
    'Kukoč', 'Petrović', 'Modrić', 'Šuker', 'Boban', 'Kovač',
    'Dalić', 'Mandžukić', 'Rakitić', 'Lovren', 'Vida', 'Perišić',
    'Tuđman', 'Račan', 'Mesić', 'Josipović', 'Milanović',
    'Bilić', 'Štimac', 'Prosinečki', 'Suker',  # Suker = Šuker without diacritic
    'Kostelić', 'Janica', 'Blanka Vlašić', 'Sara Kolak',
    'Sandra Perković', 'Filip Hrgović',
)


def _fact_check_log_insert(kind, famous, logged_text, validated, n_sources, naziv, mode):
    """Best-effort insert of one audit row into civic.fact_check_log.

    Never raises: any failure is swallowed so the caller's text filtering is
    unaffected. The connection is always closed, even when the INSERT fails.
    """
    conn = None
    try:
        conn = _db()
        cur = conn.cursor()
        try:
            cur.execute(
                "INSERT INTO civic.fact_check_log "
                "(entity_kind, halucinated_text, famous_person, source_validated, sources_searched, metadata) "
                "VALUES (%s, %s, %s, %s, %s, %s::jsonb)",
                (kind, logged_text, famous, validated, n_sources,
                 json.dumps({'naziv': naziv, 'mode': mode}, ensure_ascii=False)),
            )
        finally:
            cur.close()
        # Commit explicitly: closing a connection discards uncommitted work.
        # NOTE(review): harmless if _db() already returns an autocommit
        # connection — _db() is not visible from this block, confirm.
        conn.commit()
    except Exception:
        pass
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


def _fact_check_response(text: str, evidence: list[str], kind: Optional[str], naziv: Optional[str]) -> str:
    """Strip sentences mentioning famous people not present in source evidence.

    For every entry of FAMOUS_PEOPLE_REQUIRE_PROOF found in *text*:
      * if the same substring occurs in the joined *evidence*, the text is
        kept and a `kept_validated` row is logged;
      * otherwise every sentence containing the name is removed and a
        `stripped_no_proof` row is logged with the first removed sentence.

    Logging is fail-soft: if the DB write fails the strip still happens.
    Whitespace is normalised on the way out, so newlines in *text* become
    single spaces. Empty/None *text* is returned unchanged.
    """
    if not text:
        return text
    sources = evidence or []
    blob = ' '.join(e for e in sources if e)
    out = text
    for famous in FAMOUS_PEOPLE_REQUIRE_PROOF:
        if famous not in out:
            continue
        if famous in blob:
            # Mentioned in sources: keep, and record which names DO have proof.
            _fact_check_log_insert(kind, famous, '(in sources)', True,
                                   len(sources), naziv, 'kept_validated')
            continue
        # Famous mention without source backing: drop the whole sentence(s).
        pattern = re.compile(r'[^.!?]*' + re.escape(famous) + r'[^.!?]*[.!?]?', re.UNICODE)
        first_hit = pattern.search(out)
        stripped_text = first_hit.group(0) if first_hit else ''
        out = pattern.sub('', out)
        _fact_check_log_insert(kind, famous, (stripped_text or text)[:1000], False,
                               len(sources), naziv, 'stripped_no_proof')
    # Collapse double spaces / orphaned punctuation left after sentence removal.
    out = re.sub(r'\s+', ' ', out)
    out = re.sub(r'\s+([.,;:])', r'\1', out)
    return out.strip()
|
||
|
||
|
||
# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
|
||
def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a 3-5 sentence Croatian description built from *evidence*.

    Returns None when no API key is configured, when there is no usable
    evidence, when the request or response parsing fails for any reason, or
    when the answer is empty after `_fact_check_response` has filtered it.
    Only the first 6000 characters of the joined evidence are sent.
    """
    if not DEEPSEEK_KEY or not evidence:
        return None
    corpus = "\n---\n".join(filter(None, evidence))[:6000]
    if not corpus.strip():
        return None

    user_msg = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
                f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
                f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{corpus}")
    # Strict-fact system prompt; unbacked famous-name mentions are additionally
    # stripped after the response by _fact_check_response().
    system_msg = (
        "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom. "
        "STRICT FACT RULE: NE SPOMINJI specifične povijesne osobe, datume ili dostignuća "
        "osim ako su DOSLOVNO prisutni u priloženim IZVORIMA. Nikada ne nagađaj o slavnim "
        "sportašima (Kukoč, Petrović, Modrić, Šuker, Boban, Kostelić, Vlašić, ...) "
        "osim ako su izrijekom navedeni u IZVORIMA. Ako nisi siguran, IZOSTAVI. "
        "Ako izvori ne sadrže povijesnu informaciju, drži se sadašnjeg djelovanja kluba."
    )
    request_body = json.dumps({
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        "max_tokens": 280, "temperature": 0.3,
    }).encode('utf-8')
    request = urllib.request.Request(
        DEEPSEEK_URL,
        data=request_body,
        headers={'Authorization': 'Bearer ' + DEEPSEEK_KEY,
                 'Content-Type': 'application/json',
                 'User-Agent': UA},
        method='POST',
    )
    try:
        with urllib.request.urlopen(request, timeout=20) as resp:
            reply = json.loads(resp.read().decode('utf-8'))
        answer = reply.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
        if not answer:
            return None
        # Post-LLM validator: drop famous-name sentences lacking source backing.
        answer = _fact_check_response(answer, evidence, kind, naziv)
        return answer or None
    except Exception:
        return None
|
||
|
||
|
||
# ─── Row loaders & display name ─────────────────────────────────────────
|
||
def _load_row(kind: str, eid: int) -> dict:
    """Load the DB row for entity *eid* of the given *kind*.

    Raises HTTPException(400) for an unknown kind and HTTPException(404) when
    no row is returned. For `sportas`, an empty `sport` is filled from the
    joined club's sport (`klub_sport`).
    """
    queries = {
        'klub': """SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                          web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
                          sjediste, godina_osnutka, savez_id, scrape_url, source_url,
                          metadata
                   FROM pgz_sport.klubovi WHERE id=%s""",
        'savez': """SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
                           adresa, godina_osnutka, source_url, metadata
                    FROM pgz_sport.savezi WHERE id=%s""",
        'sportas': """SELECT c.id, c.ime, c.prezime, c.sport, c.klub_id, c.profile_url,
                             c.slika_url, c.source_url, c.source, c.source_id,
                             c.hns_igrac_id, c.biografija,
                             c.datum_rodenja, c.mjesto_rodenja, c.broj_dresa,
                             c.visina_cm, c.tezina_kg, c.dominantna_noga, c.oib,
                             c.vanjski_id, c.metadata,
                             k.sport AS klub_sport, k.naziv AS klub_naziv
                      FROM pgz_sport.clanovi c
                      LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE c.id=%s""",
    }
    sql = queries.get(kind)
    if sql is None:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    row = _fetch_one(sql, (eid,))
    if not row:
        raise HTTPException(404, kind + " not found")
    # Fall back to the club's sport when the member row has none.
    if kind == 'sportas' and not row.get('sport') and row.get('klub_sport'):
        row['sport'] = row['klub_sport']
    return row
|
||
|
||
|
||
def _display_name(kind: str, row: dict) -> str:
    """Return the human-readable name of a row.

    Athletes (`sportas`) are shown as "ime prezime" with missing parts
    skipped; every other kind uses `naziv`. Never returns None.
    """
    if kind != 'sportas':
        return row.get('naziv', '') or ''
    first = row.get('ime') or ''
    last = row.get('prezime') or ''
    return (first + ' ' + last).strip()
|
||
|
||
|
||
# ─── Sport federations map (loaded once, refresh on file mtime) ─────────
|
||
# Location of the sport → federation mapping and its in-process cache.
# The cache is keyed on the file's mtime so edits are picked up without restart.
_SPORT_FED_PATH = '/opt/pgz-sport/data/sport_federations.json'
_SPORT_FED_CACHE: dict[str, Any] = {'mtime': 0, 'data': {}, 'aliases': {}, 'media': []}


def _load_sport_feds() -> tuple[dict, dict, list]:
    """Return (feds, aliases, local_media) — refreshed when JSON changes.

    * Missing file → three empty containers (the cache is left untouched).
    * Unreadable / invalid JSON → whatever is currently cached.
    * The reserved top-level keys `_aliases`, `_local_media_pgz` and `_meta`
      are split off; everything else is the federation map.
    * A JSON document whose root is not an object, or whose reserved keys
      have the wrong type, is treated as empty instead of raising.
    """
    cache = _SPORT_FED_CACHE
    try:
        st = os.stat(_SPORT_FED_PATH)
    except FileNotFoundError:
        return ({}, {}, [])
    if st.st_mtime != cache['mtime']:
        try:
            with open(_SPORT_FED_PATH, 'r', encoding='utf-8') as f:
                raw = json.load(f)
        except Exception:
            # Keep serving the last good copy; mtime is not updated, so the
            # next call retries the parse.
            return (cache['data'], cache['aliases'], cache['media'])
        if not isinstance(raw, dict):
            raw = {}
        aliases = raw.pop('_aliases', {})
        media = raw.pop('_local_media_pgz', [])
        raw.pop('_meta', None)
        if not isinstance(aliases, dict):
            aliases = {}
        if not isinstance(media, list):
            media = []
        cache.update(mtime=st.st_mtime, data=raw, aliases=aliases, media=media)
    return (cache['data'], cache['aliases'], cache['media'])
|
||
|
||
|
||
def _normalize_sport(sport: Optional[str]) -> Optional[str]:
    """Map a free-text sport name to its canonical key in the federation map.

    The input is stripped and lower-cased, then followed through the alias
    table until it stops changing. Returns the resulting key if it exists in
    the federation map, otherwise None (also for empty/None input).

    Alias chains that loop back on themselves (a→a, a→b→a, ...) terminate at
    the last key reached before the repeat instead of spinning forever.
    """
    if not sport:
        return None
    s = sport.strip().lower()
    feds, aliases, _ = _load_sport_feds()
    visited = {s}
    while s in aliases:
        nxt = aliases[s]
        if nxt in visited:
            break
        visited.add(nxt)
        s = nxt
    return s if s in feds else None
|
||
|
||
|
||
def _sport_fed(sport: Optional[str]) -> Optional[dict]:
    """Look up the federation entry for *sport*.

    The name is first canonicalised with `_normalize_sport`; None is returned
    when it does not resolve to a known sport.
    """
    key = _normalize_sport(sport)
    if key:
        return _load_sport_feds()[0].get(key)
    return None
|
||
|
||
|
||
def _research_links(naziv, kind, grad=None, sport: Optional[str] = None, row: Optional[dict] = None):
    """Build the list of outbound research links for an entity.

    Args:
        naziv: display name used as the search query.
        kind: 'klub', 'savez' or 'sportas'; selects the kind-specific links.
        grad: optional town appended to the query.
        sport: optional sport name; resolved through `_sport_fed` to add
            federation links for athletes.
        row: optional DB row. Its first http(s) URL among profile_url,
            source_url, scrape_url, web, web_stranica becomes a direct link;
            for athletes without a federation mapping its `hns_igrac_id`,
            `ime` and `prezime` are used to build the HNS Semafor profile URL.

    Returns:
        List of dicts with `label`, `icon`, `url` (plus `is_direct` on the
        direct link), in display order.
    """
    base_q = (naziv or '').strip()
    q = (base_q + ' ' + grad) if grad else base_q
    qenc = urllib.parse.quote(q)
    out = []

    # Prefer a DIRECT profile/source link if the entity already has one.
    if row:
        direct = (row.get('profile_url') or row.get('source_url') or row.get('scrape_url')
                  or row.get('web') or row.get('web_stranica'))
        if direct and isinstance(direct, str) and direct.startswith(('http://', 'https://')):
            try:
                host = urllib.parse.urlparse(direct).hostname or ''
            except Exception:
                host = ''
            label, icon = 'Vanjski profil', '🔗'
            if 'hns' in host:
                label, icon = 'HNS profil', '⚽'
            elif 'transfermarkt' in host:
                label, icon = 'Transfermarkt', '⚽'
            elif 'wikipedia' in host:
                label, icon = 'Wikipedia', '📚'
            elif host.endswith('.hr') or host.endswith('.com'):
                label, icon = 'Službena stranica', '🌐'
            out.append({'label': label, 'icon': icon, 'url': direct, 'is_direct': True})

    out += [
        {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
    ]
    if kind == 'klub':
        out.append({'label': 'Sportilus', 'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
        out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})

    # Sport-specific federation links (replace static HNS/transfermarkt for sportas).
    fed = _sport_fed(sport) if sport else None
    if kind == 'sportas':
        if fed and isinstance(fed.get('national'), dict):
            nat = fed['national']
            search = (nat.get('search_url') or nat.get('url') or '').replace('{q}', qenc)
            if search:
                out.append({'label': nat.get('name', 'Nacionalni savez'),
                            'icon': '🏆', 'url': search})
        if fed and isinstance(fed.get('pgz'), dict):
            pgz = fed['pgz']
            url = pgz.get('search_url') or pgz.get('url') or ''
            if url:
                out.append({'label': pgz.get('name', 'PGŽ savez'),
                            'icon': '🏟', 'url': url.replace('{q}', qenc)})
        if not fed:
            # No mapping for this sport: keep HNS/transfermarkt as legacy fallback.
            # The athlete's data comes from `row`; the previous code looked at an
            # undefined `clan` name and therefore never produced the profile link.
            player = row if isinstance(row, dict) else {}
            hns_id = player.get('hns_igrac_id')
            if hns_id:
                # Slugify ime+prezime: "Franko Andrijašević" → "franko-andrijasevic"
                slug = ((player.get('ime') or '') + ' ' + (player.get('prezime') or '')).strip().lower()
                for old_c, new_c in [('č', 'c'), ('ć', 'c'), ('ž', 'z'), ('š', 's'), ('đ', 'd'), (' ', '-')]:
                    slug = slug.replace(old_c, new_c)
                slug = re.sub(r'[^a-z0-9-]', '', slug)
                out.append({'label': 'HNS Semafor (profil)', 'icon': '⚽',
                            'url': f'https://semafor.hns.family/igraci/{hns_id}/{slug}/'})
            else:
                out.append({'label': 'HNS Semafor (pretraga)', 'icon': '⚽',
                            'url': 'https://semafor.hns.family/?s=' + qenc})
            out.append({'label': 'transfermarkt', 'icon': '⚽',
                        'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
        # Local PGŽ media for any sportas.
        _, _, media = _load_sport_feds()
        for m in media:
            url = (m.get('search_url') or '').replace('{q}', qenc)
            if url:
                out.append({'label': m.get('name', 'Lokalni medij'),
                            'icon': '📰', 'url': url})
    if kind == 'savez':
        out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
    return out
|
||
|
||
|
||
# ─── Proposal pipelines ─────────────────────────────────────────────────
|
||
def _name_tokens(naziv: str) -> list[str]:
    """Return the distinctive words of an entity name.

    The name is NFKD-folded to ASCII and lower-cased, split on anything that
    is not a letter or digit, and words shorter than 4 characters are
    dropped. Generic words (klub, savez, rijeka, ...) are removed too, unless
    that would leave nothing, in which case the unfiltered list is returned.

    Note: characters without an NFKD decomposition (e.g. "đ") are dropped by
    the ASCII fold rather than transliterated.
    """
    import unicodedata
    folded = unicodedata.normalize('NFKD', naziv or '')
    ascii_lower = folded.encode('ascii', 'ignore').decode('ascii').lower()
    words = [w for w in re.findall(r'[a-z0-9]+', ascii_lower) if len(w) >= 4]
    generic = {'klub', 'udruga', 'sportski', 'sport', 'kosarkaski', 'kosarka', 'nogometni',
               'rukometni', 'savez', 'rijeka', 'primorsko', 'goranski', 'grad', 'grada', 'centar'}
    distinctive = [w for w in words if w not in generic]
    return distinctive if distinctive else words
|
||
|
||
|
||
def _is_relevant(source: dict, tokens: list[str]) -> bool:
    """Tell whether a scraped source actually mentions the entity.

    The source's title, extract and raw text are lower-cased and NFKD-folded
    to ASCII; the source is relevant when any token occurs as a substring.
    With no tokens to check, every source counts as relevant.
    """
    if not tokens:
        return True
    import unicodedata
    parts = [source.get(field) or '' for field in ('title', 'extract', 'raw_text')]
    haystack = ' '.join(parts).lower()
    haystack = unicodedata.normalize('NFKD', haystack).encode('ascii', 'ignore').decode('ascii')
    for tok in tokens:
        if tok in haystack:
            return True
    return False
|
||
|
||
|
||
|
||
# ─── Klub domain guesser (HR slug → candidate URLs → HEAD probe) ────────
|
||
import re as _re_klg
|
||
|
||
def _slugify_klub(naziv: str) -> str:
    """Turn a club name into a lowercase ASCII, dash-separated slug.

    Croatian diacritics are transliterated (č/ć→c, ž→z, š→s, đ→d); quotes,
    parentheses, commas, dots and slashes are removed; every remaining run of
    other characters becomes a single dash. Empty/None input gives "".
    """
    if not naziv:
        return ""
    table = str.maketrans({
        'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd',
        '"': None, "'": None, '(': None, ')': None, ',': None, '.': None,
        '/': None, '\\': None,
    })
    cleaned = naziv.lower().translate(table)
    return _re_klg.sub(r"[^a-z0-9]+", "-", cleaned).strip("-")
|
||
|
||
def _klub_domain_candidates(naziv: str) -> list[str]:
    """Generate ranked candidate URLs from club name.

    Order: the full slug on .hr/.com/.eu/.info (bare and www), then the slug
    without its leading sport prefix (nk-, kk-, ...) on .hr/.com, then — only
    when the name carries no sport prefix — the first five sport prefixes
    prepended to the base on .hr/.com. Duplicates are removed keeping first
    occurrence and at most 20 URLs are returned.

    Returns [] for an empty name or a name that slugifies to nothing (e.g.
    punctuation only), so no host-less URLs such as "https://.hr" are built.
    """
    if not naziv:
        return []
    s = _slugify_klub(naziv)
    if not s:
        return []
    # Strip one common prefix for cleaner domains.
    base = s
    for pref in ("hnk-", "nk-", "rk-", "kk-", "ok-", "bk-", "gk-", "tk-", "ak-", "hbk-"):
        if base.startswith(pref):
            base = base[len(pref):]
            break
    sports_prefixes = ["nk-", "hnk-", "rk-", "kk-", "bk-", "ok-", "ak-", "tk-"]
    candidates = []
    # Full slug with its original prefix.
    for tld in (".hr", ".com", ".eu", ".info"):
        candidates += [f"https://{s}{tld}", f"https://www.{s}{tld}"]
    # Base only.
    for tld in (".hr", ".com"):
        candidates += [f"https://{base}{tld}", f"https://www.{base}{tld}"]
    # Try sport prefixes if the name doesn't already have one.
    if not any(s.startswith(p) for p in sports_prefixes):
        for sp in sports_prefixes[:5]:
            for tld in (".hr", ".com"):
                candidates.append(f"https://{sp}{base}{tld}")
    # dict preserves insertion order, so this is an order-keeping dedup.
    return list(dict.fromkeys(candidates))[:20]
|
||
|
||
def _probe_klub_url(url: str, naziv_tokens: list, timeout: int = 5) -> Optional[dict]:
    """GET-probe a candidate URL and accept it only if it looks like the club's site.

    Returns {"source": "domain_probe", "url": <final URL after redirects>,
    "raw_text": <first 50000 chars of the body>} when the response is HTTP 200,
    at least 200 characters long and mentions at least one of *naziv_tokens*
    (tokens of 2 characters or fewer are ignored; with no usable tokens the
    mention check is skipped). Returns None otherwise, including on any
    network or import error.

    Tokens are matched against both the lower-cased page text and its
    ASCII-folded form: callers pass de-accented tokens (e.g. "rjecina"),
    which would never match a page that spells the name with diacritics.
    """
    try:
        import requests
        import unicodedata
        r = requests.get(url, timeout=timeout, allow_redirects=True,
                         headers={"User-Agent": "Mozilla/5.0 RinetEnrichBot/1.0"})
        if r.status_code != 200:
            return None
        if len(r.text) < 200:
            return None
        text = r.text.lower()
        # Must mention at least one distinctive token from the name.
        toks = [t.lower() for t in (naziv_tokens or []) if len(t) > 2]
        if toks:
            folded = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
            if not any(t in text or t in folded for t in toks):
                return None
        return {"source": "domain_probe", "url": r.url, "raw_text": r.text[:50000]}
    except Exception:
        return None
|
||
|
||
def _guess_klub_domains(naziv: str, tokens: list) -> Optional[dict]:
    """Probe guessed club domains in parallel; the first hit wins.

    Up to 16 candidates from `_klub_domain_candidates` are probed with
    `_probe_klub_url` on 8 worker threads (4 s per-request timeout, 10 s
    overall budget). Returns the first truthy probe result, or None when
    there are no candidates, nothing matches, or the budget runs out.

    The pool is shut down without waiting, so a hit (or the budget expiring)
    returns immediately instead of blocking on probes still in flight; those
    finish in the background, bounded by their own request timeout.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Distinct from the builtin TimeoutError before Python 3.11.
    from concurrent.futures import TimeoutError as FuturesTimeout

    candidates = _klub_domain_candidates(naziv)
    if not candidates:
        return None
    ex = ThreadPoolExecutor(max_workers=8)
    futs = {}
    try:
        futs = {ex.submit(_probe_klub_url, url, tokens, 4): url for url in candidates[:16]}
        for fut in as_completed(futs, timeout=10):
            try:
                doc = fut.result()
            except Exception:
                continue
            if doc:
                return doc
        return None
    except FuturesTimeout:
        # Budget exhausted: graceful None instead of a 500.
        return None
    finally:
        # Best effort: drop probes that have not started yet.
        for f in futs:
            if not f.done():
                f.cancel()
        ex.shutdown(wait=False)
|
||
|
||
def _scrape_klub_subpages(base_url: str, tokens: list) -> str:
    """Fetch a fixed set of contact/about subpages and concatenate their HTML.

    The paths /kontakt, /uprava, /o-nama, /o-klubu, /predsjednik, /klub,
    /contact and /about are requested under *base_url* on 8 threads (3 s per
    request; results are collected for up to 8 s). Pages answering 200 with
    more than 200 characters contribute their first 30000 characters, joined
    by blank lines in completion order. Returns "" for an empty *base_url*.
    *tokens* is accepted but not used.
    """
    if not base_url:
        return ""
    import requests
    from concurrent.futures import ThreadPoolExecutor, as_completed

    root = base_url.rstrip("/")
    subpaths = ["/kontakt", "/uprava", "/o-nama", "/o-klubu", "/predsjednik", "/klub", "/contact", "/about"]
    ua_headers = {"User-Agent": "Mozilla/5.0 RinetEnrichBot/1.0"}

    def _grab(subpath):
        # Any failure (network, decoding) simply means "no page".
        try:
            resp = requests.get(root + subpath, timeout=3, allow_redirects=True, headers=ua_headers)
            if resp.status_code == 200 and len(resp.text) > 200:
                return resp.text[:30000]
        except Exception:
            pass
        return None

    pages = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(_grab, sp): sp for sp in subpaths}
        try:
            for done in as_completed(pending, timeout=8):
                try:
                    html_text = done.result()
                except Exception:
                    continue
                if html_text:
                    pages.append(html_text)
        except Exception:
            for job in pending:
                if not job.done():
                    job.cancel()
    return "\n\n".join(pages)
|
||
|
||
|
||
def _propose_for_klub(row: dict) -> dict:
    """Collect web evidence for a club and propose values for its empty fields.

    Sources, in order: the club's own site (from the row, else a guessed
    domain) enriched with its subpages, Wikipedia, sport-pgz.hr search and —
    for PGŽ basketball clubs — kosarkapgz.com posts. `web`, `email` and
    `telefon` are proposed only when empty in the row and only from sources
    that mention the club's name; `opis_djelatnosti` comes from DeepSeek,
    falling back to the first source extract of at least 80 characters.

    Returns {'proposed': {...}, 'sources': [...]}.
    """
    naziv = row.get('naziv') or ''

    # Only http(s) URLs count as a primary source; placeholder strings such as
    # 'godisnjak_2025' are ignored.
    candidate_url = None
    for col in ('web', 'web_stranica', 'source_url', 'scrape_url'):
        if row.get(col):
            candidate_url = row.get(col)
            break
    is_http = isinstance(candidate_url, str) and candidate_url.startswith(('http://', 'https://'))
    primary = candidate_url if is_http else None

    sources, evidence = [], []
    tokens = _name_tokens(naziv)

    site = _fetch_primary_site(primary) if primary else None
    if site:
        subpage_root = site.get('url') or primary
    else:
        # No usable site in the DB: try to guess the domain from the club name.
        site = _guess_klub_domains(naziv, tokens)
        subpage_root = site.get('url', '') if site else None
    if site:
        # Subpages give richer evidence (contacts, board members).
        extra = _scrape_klub_subpages(subpage_root, tokens)
        if extra:
            site['raw_text'] = (site.get('raw_text', '') + '\n\n' + extra)[:120000]
        sources.append(site)
        evidence.append(site.get('raw_text') or site.get('extract') or '')

    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
        evidence.append(wiki.get('extract') or '')

    spz = _sport_pgz_search(naziv)
    if spz:
        sources.append(spz)
        evidence.append(spz.get('raw_text') or spz.get('extract') or '')

    # kosarkapgz.com: only if this looks like a PGŽ basketball club.
    if _kosarkapgz_is_basketball_pgz(naziv, row.get('sport')):
        kpgz = _kosarkapgz_search_posts(naziv, limit=5)
        if kpgz:
            sources.append(kpgz)
            evidence.append(kpgz.get('raw_text') or '')

    relevant = [src for src in sources if _is_relevant(src, tokens)]
    relevant_texts = [(src.get('raw_text') or src.get('extract') or '') for src in relevant]
    relevant_blob = '\n\n'.join(relevant_texts)

    proposed: dict[str, Any] = {}
    # web/email/telefon: ONLY from sources actually mentioning the entity.
    finders = (
        ('web', lambda: _find_official_web(relevant_blob, naziv)),
        ('email', lambda: _find_email(relevant_blob)),
        ('telefon', lambda: _find_phone(relevant_blob)),
    )
    for field, find in finders:
        if row.get(field):
            continue
        hit = find()
        if hit:
            proposed[field] = hit

    if not row.get('opis_djelatnosti'):
        descr = _deepseek_describe(naziv, 'sportski klub', relevant_texts or evidence)
        if not descr:
            descr = next(
                (src['extract'] for src in (relevant or sources)
                 if src.get('extract') and len(src['extract']) >= 80),
                None,
            )
        if descr:
            proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}
|
||
|
||
|
||
def _propose_for_savez(row: dict) -> dict:
    """Collect web evidence for a federation and propose its empty contact fields.

    Sources, in order: the federation's own site (`web`, else `source_url`),
    Wikipedia, sport-pgz.hr search and — for basketball — the kosarkapgz.com
    federation meta. `web`, `email` and `telefon` are proposed only when
    empty in the row and only from sources that mention the federation name.

    Returns {'proposed': {...}, 'sources': [...]}.
    """
    naziv = row.get('naziv') or ''
    # NOTE(review): unlike _propose_for_klub this does not require an http(s)
    # URL, so a placeholder source_url is handed to the fetcher as-is — confirm
    # that is intended.
    primary = row.get('web') or row.get('source_url')

    sources = []
    site = _fetch_primary_site(primary) if primary else None
    if site:
        sources.append(site)
    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
    spz = _sport_pgz_search(naziv)
    if spz:
        sources.append(spz)
    # kosarkapgz.com: federation meta for any basketball savez.
    if _kosarkapgz_is_basketball_pgz(naziv, row.get('sport')):
        kpgz = _kosarkapgz_savez_meta()
        if kpgz:
            sources.append(kpgz)

    tokens = _name_tokens(naziv)
    relevant_blob = '\n\n'.join(
        (src.get('raw_text') or src.get('extract') or '')
        for src in sources if _is_relevant(src, tokens)
    )

    proposed: dict[str, Any] = {}
    finders = (
        ('web', lambda: _find_official_web(relevant_blob, naziv)),
        ('email', lambda: _find_email(relevant_blob)),
        ('telefon', lambda: _find_phone(relevant_blob)),
    )
    for field, find in finders:
        if row.get(field):
            continue
        hit = find()
        if hit:
            proposed[field] = hit
    return {'proposed': proposed, 'sources': sources}
|
||
|
||
|
||
# ─── HNS Semafor parsing ────────────────────────────────────────────────
|
||
# Root URL of the HNS Semafor site; player profile URLs are built from it as
# {_HNS_BASE}/igraci/{id}/{slug}/ (see _hns_url_from_row).
_HNS_BASE = 'https://semafor.hns.family'
|
||
|
||
def _slugify(name: str) -> str:
    """Return a lowercase ASCII, dash-separated slug of *name*.

    Accented letters are reduced to their base letter via NFKD folding.
    "đ"/"Đ" have no Unicode decomposition and would otherwise be silently
    dropped by the ASCII fold ("Đorđe" → "ore"), so they are transliterated
    to "d"/"D" first. Every run of non-alphanumerics becomes one dash;
    empty/None input gives "".
    """
    import unicodedata
    text = (name or '').replace('đ', 'd').replace('Đ', 'D')
    s = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii').lower()
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
|
||
|
||
def _hns_url_from_row(row: dict) -> Optional[str]:
    """Try to build a semafor.hns.family /igraci/ URL for this row.

    Sources are tried in order; the first that yields a URL wins:
      1. `profile_url` / `source_url` already pointing at an /igraci/ page;
      2. the `hns_igrac_id` column;
      3. `vanjski_id` JSON: `hns_comet` or `hns_pid`, with optional `hns_slug`;
      4. `source_id` when `source` starts with "hns_".

    An id that cannot be converted to an integer is skipped and the next
    source is tried (previously a malformed `hns_igrac_id` raised, unlike the
    other steps). Returns None when nothing usable is found.
    """
    # 1) Already-set columns
    for k in ('profile_url', 'source_url'):
        u = row.get(k)
        if u and 'semafor.hns.family/igraci/' in (u or ''):
            return u

    name_slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())

    def _player_url(pid, slug):
        # None signals "id not numeric" so the caller can fall through.
        try:
            return f'{_HNS_BASE}/igraci/{int(pid)}/{slug}/'
        except (TypeError, ValueError, OverflowError):
            return None

    # 2) hns_igrac_id column
    pid = row.get('hns_igrac_id')
    if pid:
        built = _player_url(pid, name_slug)
        if built:
            return built

    # 3) vanjski_id JSONB → hns_comet
    vid = row.get('vanjski_id') or {}
    if isinstance(vid, dict):
        comet = vid.get('hns_comet') or vid.get('hns_pid')
        if comet:
            built = _player_url(comet, vid.get('hns_slug') or name_slug)
            if built:
                return built

    # 4) source='hns_semafor' + source_id
    if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
        built = _player_url(row['source_id'], name_slug)
        if built:
            return built
    return None
|
||
|
||
|
||
def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
    """Extract structured fields from a semafor.hns.family player page.

    Returns None for an empty document. When BeautifulSoup cannot be
    imported, parsing is delegated to `_parse_hns_player_regex`. Otherwise
    the result always has `source`, `url`, `raw_text` and `extract`, plus
    whichever of `hns_igrac_id`, `title`, `slika_url`, `broj_dresa`,
    `datum_rodenja` (ISO date), `mjesto_rodenja`, `trenutni_klub`,
    `hns_klub_id` and `description` could be found.
    """
    if not html_doc:
        return None
    try:
        from bs4 import BeautifulSoup
    except Exception:
        return _parse_hns_player_regex(html_doc, url)

    soup = BeautifulSoup(html_doc, 'html.parser')
    info: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}

    # Player id is part of the profile URL.
    id_match = re.search(r'/igraci/(\d+)/', url)
    if id_match:
        info['hns_igrac_id'] = int(id_match.group(1))

    title_tag = soup.find('title')
    if title_tag:
        info['title'] = title_tag.get_text(strip=True)[:300]

    # Photo (relative paths are resolved against the page URL).
    photo_div = soup.find('div', class_='photo')
    img_tag = photo_div.find('img') if photo_div else None
    if img_tag and img_tag.get('src'):
        src = img_tag['src']
        info['slika_url'] = src if src.startswith('http') else urllib.parse.urljoin(url, src)

    # Jersey number
    name_div = soup.find('div', class_='playerName')
    number_tag = name_div.find('h3') if name_div else None
    if number_tag:
        digits = number_tag.get_text(strip=True)
        if digits.isdigit():
            info['broj_dresa'] = int(digits)

    # Date of birth, published as D.M.YYYY
    dob_li = soup.find('li', class_='dob')
    dob_h4 = dob_li.find('h4') if dob_li else None
    if dob_h4:
        dmy = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', dob_h4.get_text(' ', strip=True))
        if dmy:
            from datetime import date as _date
            day, month, year = (int(part) for part in dmy.groups())
            try:
                info['datum_rodenja'] = _date(year, month, day).isoformat()
            except Exception:
                pass

    # Place of birth
    pob_li = soup.find('li', class_='pob')
    pob_h4 = pob_li.find('h4') if pob_li else None
    if pob_h4:
        info['mjesto_rodenja'] = pob_h4.get_text(strip=True)

    # Current club (informational only; klub_id is not reassigned from here).
    club_anchor = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if club_anchor:
        club_h4 = club_anchor.find('h4')
        if club_h4:
            info['trenutni_klub'] = club_h4.get_text(strip=True)
        club_match = re.search(r'/klubovi/(\d+)/', club_anchor.get('href') or '')
        if club_match:
            info['hns_klub_id'] = int(club_match.group(1))

    # Meta description
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    if meta_tag and meta_tag.get('content'):
        info['description'] = meta_tag['content'][:600]

    # Clean text blob for relevance checks / DeepSeek.
    page_text = soup.get_text(' ', strip=True)
    info['raw_text'] = re.sub(r'\s+', ' ', page_text)[:4000]
    fallback_extract = info['raw_text'][:500] if info.get('raw_text') else None
    info['extract'] = info.get('description') or fallback_extract
    return info
|
||
|
||
|
||
def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
    """BS4-free fallback parser for a semafor.hns.family player page.

    Always returns a dict with `source` and `url`, plus whichever of
    `hns_igrac_id`, `slika_url`, `datum_rodenja` (ISO date),
    `mjesto_rodenja`, `broj_dresa` and `description` the regexes find.

    Captured text is HTML-unescaped so values match what the BeautifulSoup
    parser yields (e.g. "&amp;" → "&"). A None document is treated as
    empty instead of raising.

    NOTE(review): unlike the BeautifulSoup path this does not produce
    `title`, `raw_text` or `extract` — confirm callers tolerate that.
    """
    doc = html_doc or ''
    out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
    m = re.search(r'/igraci/(\d+)/', url)
    if m:
        out['hns_igrac_id'] = int(m.group(1))
    m = re.search(r'<div class="photo"><img src="([^"]+)"', doc)
    if m:
        src = html.unescape(m.group(1))
        if not src.startswith('http'):
            src = urllib.parse.urljoin(url, src)
        out['slika_url'] = src
    m = re.search(r'<li class="dob">.*?<h4>(\d{1,2})\.(\d{1,2})\.(\d{4})', doc, re.S)
    if m:
        from datetime import date as _date
        try:
            out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
        except Exception:
            # Impossible calendar date on the page: leave the field out.
            pass
    m = re.search(r'<li class="pob"><i></i><h4>([^<]+)</h4>', doc)
    if m:
        out['mjesto_rodenja'] = html.unescape(m.group(1)).strip()
    m = re.search(r'<div class="playerName"><h3>(\d+)</h3>', doc)
    if m:
        out['broj_dresa'] = int(m.group(1))
    m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', doc)
    if m:
        out['description'] = html.unescape(m.group(1))[:600]
    return out
|
||
|
||
|
||
def _hns_fetch_player(url: str) -> Optional[dict]:
    """Download an HNS player page and parse it.

    A plain HTTP GET (8 s) is tried first. If it yields nothing and the
    Playwright scraper is available, the page is rendered once; when that
    render reports more than 2000 characters of HTML, the plain GET is
    retried with a 15 s timeout, because the renderer's result does not carry
    the HTML itself. Returns the parsed dict, or None when no body was
    obtained.
    """
    html_doc = _http_get(url, timeout=8)
    if not html_doc and _HAS_PW and _pw_scraper is not None:
        rendered = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
        if rendered and rendered.get('html_len', 0) > 2000:
            html_doc = _http_get(url, timeout=15)
    if not html_doc:
        return None
    return _parse_hns_player(html_doc, url)
|
||
|
||
|
||
# ─── Generic sport-federation scraper ───────────────────────────────────
|
||
def _fed_url_from_row(row: dict) -> Optional[str]:
    """Return the row's existing link if it points at a known federation host.

    Known hosts are the hostnames of every `url`, `search_url` and
    `profile_url_pattern` in the `national` and `pgz` sections of the
    federation map (placeholders are substituted before parsing). The row's
    `source_url` is checked first, then `profile_url`; the host must match
    exactly. Returns None when neither matches.
    """
    feds, _, _ = _load_sport_feds()

    known_hosts = set()
    for entry in feds.values():
        if not isinstance(entry, dict):
            continue
        for scope in ('national', 'pgz'):
            section = entry.get(scope) or {}
            for field in ('url', 'search_url', 'profile_url_pattern'):
                template = section.get(field)
                if not template:
                    continue
                try:
                    # Fill placeholders so the template parses as a real URL.
                    sample = (template.replace('{q}', 'x')
                              .replace('{slug}', 'x')
                              .replace('{hns_pid}', '1'))
                    hostname = urllib.parse.urlparse(sample).hostname
                    if hostname:
                        known_hosts.add(hostname)
                except Exception:
                    pass

    for col in ('source_url', 'profile_url'):
        link = row.get(col)
        if not link:
            continue
        try:
            link_host = urllib.parse.urlparse(link).hostname or ''
        except Exception:
            continue
        if link_host in known_hosts:
            return link
    return None
|
||
|
||
|
||
def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]:
|
||
"""Best-effort parser for a generic sport-federation profile page.
|
||
|
||
Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub,
|
||
extract, raw_text}. Tolerant of varied page structures.
|
||
"""
|
||
if not html_doc: return None
|
||
host = urllib.parse.urlparse(url).hostname or ''
|
||
out: dict[str, Any] = {
|
||
'source': host,
|
||
'url': url,
|
||
}
|
||
# Title
|
||
m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
|
||
if m: out['title'] = html.unescape(m.group(1).strip())[:300]
|
||
# Meta description
|
||
m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
|
||
if m: out['description'] = html.unescape(m.group(1).strip())[:600]
|
||
|
||
name_tokens = []
|
||
for t in (ime, prezime):
|
||
if t and len(t) >= 3:
|
||
name_tokens.append(re.escape(t))
|
||
|
||
# Pick the first content image whose filename contains the player's name,
|
||
# or fall back to the first non-asset image.
|
||
img_candidates = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', html_doc, re.I)
|
||
chosen_img = None
|
||
for src in img_candidates:
|
||
low = src.lower()
|
||
if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader',
|
||
'sprite', '/themes/', '/icons/', 'gdpr', 'banner',
|
||
'header', 'footer', 'placeholder', 'avatar-default')):
|
||
continue
|
||
if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')):
|
||
continue
|
||
# Prefer matches on player name in URL
|
||
if name_tokens and any(re.search(t, src, re.I) for t in name_tokens):
|
||
chosen_img = src; break
|
||
if chosen_img is None:
|
||
chosen_img = src
|
||
if chosen_img:
|
||
if not chosen_img.startswith('http'):
|
||
chosen_img = urllib.parse.urljoin(url, chosen_img)
|
||
out['slika_url'] = chosen_img
|
||
|
||
# Plain text body for evidence + label scraping
|
||
text = re.sub(r'<script[^>]*>.*?</script>', ' ', html_doc, flags=re.S | re.I)
|
||
text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.S | re.I)
|
||
text = re.sub(r'<[^>]+>', ' ', text)
|
||
text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
|
||
out['raw_text'] = text[:4000]
|
||
out['extract'] = (out.get('description')
|
||
or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500]
|
||
or text[:500])
|
||
|
||
# Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …")
|
||
m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I)
|
||
if m:
|
||
try:
|
||
from datetime import date as _date
|
||
d = re.split(r'[.\-/]', m.group(1))
|
||
out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat()
|
||
except Exception:
|
||
pass
|
||
if 'datum_rodenja' not in out:
|
||
m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I)
|
||
if m:
|
||
try:
|
||
from datetime import date as _date
|
||
out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat()
|
||
except Exception:
|
||
pass
|
||
m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text)
|
||
if m: out['mjesto_rodenja'] = m.group(1).strip()
|
||
m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I)
|
||
if m: out['klub_naziv'] = m.group(1).strip().rstrip('.')
|
||
|
||
return out
|
||
|
||
|
||
def _slugify_simple(s: str) -> str:
|
||
import unicodedata
|
||
s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower()
|
||
return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
|
||
|
||
|
||
def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]:
    """Locate and parse an athlete's profile on the national federation site.

    Two strategies are tried against the ``national`` section of the
    federation config returned by ``_sport_fed(sport)``:

    1. ``profile_url_pattern`` containing ``{slug}``: fetch the slugified full
       name directly and accept the page if it mentions the surname.
    2. ``search_url``: run a name search and follow the first
       igraci/igrac/sportasi/clanovi/profil link whose slug contains the
       surname slug and that can be fetched.

    Returns the result of ``_parse_federation_profile`` or ``None`` when no
    federation is configured or nothing usable was found.
    """
    if not sport:
        return None
    federation = _sport_fed(sport)
    if not federation:
        return None
    national = federation.get('national') or {}
    athlete = (ime + ' ' + prezime).strip()

    # 1) Direct profile URL via {slug} pattern (works for HBS at least)
    template = national.get('profile_url_pattern')
    if template and '{slug}' in template:
        profile_url = template.replace('{slug}', _slugify_simple(athlete))
        page = _http_get(profile_url, timeout=8)
        if page and prezime.lower() in page.lower():
            return _parse_federation_profile(page, profile_url, ime, prezime)

    # 2) Search URL -> first profile-like link that mentions the surname
    search_template = national.get('search_url')
    if not search_template:
        return None
    listing = _http_get(search_template.replace('{q}', urllib.parse.quote(athlete)), timeout=10)
    if not listing:
        return None

    for segment in ('igraci', 'igrac', 'sportasi', 'clanovi', 'profil'):
        link_re = r'href="([^"]*?/' + segment + r'/[^"]+)"'
        for hit in re.finditer(link_re, listing, re.I):
            link = hit.group(1)
            if not link.startswith('http'):
                link = urllib.parse.urljoin(national.get('url', search_template), link)
            if _slugify_simple(prezime) not in _slugify_simple(link):
                continue
            profile_page = _http_get(link, timeout=8)
            if profile_page:
                return _parse_federation_profile(profile_page, link, ime, prezime)
    return None
|
||
|
||
|
||
def _propose_for_sportas(row: dict) -> dict:
    """Build the enrichment proposal for one athlete row.

    Evidence is gathered in a fixed order (the order decides which source wins
    and which extract is used as the fallback biography):

    1. HNS Semafor player page, when the row's sport normalises to football or
       the row already carries an HNS link.
    2. Federation profile, either the row's own federation URL or one
       discovered by ``scrape_sport_federation``.
    3. Wikipedia summary, only when the row has no ``biografija``.
    4. kosarkapgz.com news mentions, for PGŽ basketball.

    Only columns that are empty in ``row`` are proposed. Returns
    ``{'proposed': {...}, 'sources': [...]}``.
    """
    ime = row.get('ime') or ''
    prezime = row.get('prezime') or ''
    naziv = (ime + ' ' + prezime).strip()
    sport = row.get('sport')
    sources: list = []
    evidence: list = []
    proposed: dict[str, Any] = {}

    def _remember(doc):
        # Record a source document together with its best text evidence.
        sources.append(doc)
        evidence.append(doc.get('raw_text') or doc.get('extract') or '')

    # 1) HNS Semafor: only meaningful when sport is football OR row already
    #    carries an HNS link.
    hns_doc: Optional[dict] = None
    if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row):
        hns_url = _hns_url_from_row(row)
        if hns_url:
            hns_doc = _hns_fetch_player(hns_url)
            if hns_doc:
                _remember(hns_doc)

    # 2) Sport-aware federation scrape (HBS, HKS, etc.); an existing
    #    source_url/profile_url on a known federation host is tried first.
    fed_doc: Optional[dict] = None
    direct_fed_url = _fed_url_from_row(row)
    already_have_it = bool(hns_doc) and hns_doc.get('url') == direct_fed_url
    if direct_fed_url and not already_have_it:
        fed_html = _http_get(direct_fed_url, timeout=8)
        if fed_html:
            fed_doc = _parse_federation_profile(fed_html, direct_fed_url, ime, prezime)
    if not fed_doc:
        fed_doc = scrape_sport_federation(sport, ime, prezime)
    if fed_doc:
        _remember(fed_doc)

    def _first_value(field, docs):
        # First truthy value of `field` among `docs`, in priority order.
        for doc in docs:
            if doc and doc.get(field):
                return doc[field]
        return None

    hns_then_fed = (hns_doc, fed_doc)
    hns_only = (hns_doc,)
    # (row column, key in the source doc, docs allowed to supply it)
    plan = (
        ('profile_url', 'url', hns_then_fed),
        ('source_url', 'url', hns_then_fed),
        ('slika_url', 'slika_url', hns_then_fed),
        ('hns_igrac_id', 'hns_igrac_id', hns_only),
        ('datum_rodenja', 'datum_rodenja', hns_then_fed),
        ('mjesto_rodenja', 'mjesto_rodenja', hns_then_fed),
        ('broj_dresa', 'broj_dresa', hns_only),
    )
    for column, key, docs in plan:
        if row.get(column):
            continue  # never propose over an existing value
        value = _first_value(key, docs)
        if value:
            proposed[column] = value

    needs_bio = not row.get('biografija')

    # 3) Wikipedia HR for biografija
    if needs_bio:
        wiki = _wiki_summary(naziv)
        if wiki:
            sources.append(wiki)
            evidence.append(wiki.get('extract') or '')

    # 4) kosarkapgz.com: news mentions for PGŽ basketball players
    if _kosarkapgz_is_basketball_pgz('', sport):
        news = _kosarkapgz_search_posts(naziv, limit=5) if naziv else None
        if news:
            sources.append(news)
            evidence.append(news.get('raw_text') or '')

    # Description: prefer DeepSeek synthesis from all evidence; fall back to
    # the first source extract of at least 80 characters.
    if needs_bio:
        descr = None
        if evidence:
            label = f'sportaš ({sport})' if sport else 'sportaš'
            descr = _deepseek_describe(naziv, label, evidence)
        if not descr:
            descr = next(
                (src.get('extract') for src in sources
                 if src.get('extract') and len(src.get('extract')) >= 80),
                None)
        if descr:
            proposed['biografija'] = descr.strip()[:2000]

    return {'proposed': proposed, 'sources': sources}
|
||
|
||
|
||
# ─── Endpoints ──────────────────────────────────────────────────────────
|
||
# ─── R4 — POST /v2/enrich/forensic/{finding_id} ─────────────────────────
|
||
def _extract_pep_name(finding: dict) -> Optional[str]:
|
||
"""Pull the primary person name from a forensic_findings row."""
|
||
title = (finding.get('title') or '').strip()
|
||
desc = (finding.get('description') or '').strip()
|
||
payload = finding.get('raw_data') or {}
|
||
if isinstance(payload, str):
|
||
try: payload = json.loads(payload)
|
||
except Exception: payload = {}
|
||
if isinstance(payload, dict):
|
||
for k in ('person_name', 'name', 'osoba'):
|
||
v = payload.get(k)
|
||
if v: return str(v).strip()
|
||
# Try entities_involved.entity_name
|
||
ents = finding.get('entities_involved') or []
|
||
if isinstance(ents, str):
|
||
try: ents = json.loads(ents)
|
||
except Exception: ents = []
|
||
if isinstance(ents, list):
|
||
for e in ents:
|
||
if isinstance(e, dict) and e.get('person_name'):
|
||
return str(e['person_name']).strip()
|
||
if isinstance(e, dict) and e.get('entity_name') and ' ' in (e.get('entity_name') or ''):
|
||
# Some entries store person names as entity_name when entity_type='person'
|
||
if (e.get('entity_type') or '').lower() in ('person','osoba'):
|
||
return str(e['entity_name']).strip()
|
||
# Fallback: extract a "Ime Prezime" from the title
|
||
m = re.search(r'\b([A-ZČĆŠĐŽ][a-zčćšđž]+)\s+([A-ZČĆŠĐŽ][a-zčćšđž]+(?:-[A-ZČĆŠĐŽ][a-zčćšđž]+)?)\b', title + ' ' + desc)
|
||
if m: return f"{m.group(1)} {m.group(2)}"
|
||
return None
|
||
|
||
|
||
def _gather_pep_evidence(name: str) -> list[dict]:
    """Collect public web evidence about a person.

    Sources, in order: the ``_wiki_summary`` result (if any) and the first
    result of a DuckDuckGo html-lite search for ``"<name>" PGŽ Hrvatska``.

    Returns a list of source dicts. The DuckDuckGo entry has ``source``,
    ``url``, ``title`` and ``extract`` (``None`` when the page has no snippet).
    """
    sources: list[dict] = []
    wiki = _wiki_summary(name)
    if wiki: sources.append(wiki)
    # DDG html-lite as a "Google snippet" replacement (often OK for HR PEPs)
    ddg = 'https://html.duckduckgo.com/html/?q=' + urllib.parse.quote(f'"{name}" PGŽ Hrvatska')
    page = _http_get(ddg, timeout=8)
    if page:
        # First result block
        m = re.search(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>([^<]{6,200})</a>', page)
        snippet_m = re.search(r'<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', page, re.S)
        if m:
            extract = None
            if snippet_m:
                # Strip inline markup, then decode entities, the same
                # treatment url/title get, so "&amp;" etc. do not leak through.
                extract = html.unescape(re.sub(r'<[^>]+>', ' ', snippet_m.group(1))).strip()[:600]
            sources.append({
                'source': 'duckduckgo',
                'url': html.unescape(m.group(1))[:500],
                'title': html.unescape(m.group(2)).strip()[:300],
                'extract': extract,
            })
    return sources
|
||
|
||
|
||
def _related_entities_for_pep(name: str) -> list[dict]:
    """Return up to 10 civic.persons matching *name*, each with linked entities.

    Persons are matched by case-insensitive substring and ordered with rows
    that have an OIB first. For a person with an OIB, up to 30 rows from
    civic.person_entity_links (joined to civic.entities) are attached under
    ``entities``; persons without an OIB get an empty list.
    """
    persons_sql = """SELECT id, name, function, party, county, city, oib, trust_tier
                          FROM civic.persons
                          WHERE upper(name) ILIKE upper(%s)
                          ORDER BY oib NULLS LAST, id LIMIT 10"""
    links_sql = """SELECT pel.entity_id, pel.roles, e.name AS entity_name,
                                  e.oib AS entity_oib, e.entity_type, e.city, e.risk_score
                                  FROM civic.person_entity_links pel
                                  LEFT JOIN civic.entities e ON e.id = pel.entity_id
                                  WHERE pel.person_oib=%s LIMIT 30"""
    graph: list[dict] = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(persons_sql, ('%'+name+'%',))
        # fetchall() materialises the persons before the cursor is reused below.
        for record in cur.fetchall():
            person = dict(record)
            node = {'kind': 'person', 'person_id': person['id'], 'person_name': person['name']}
            for column in ('function', 'party', 'county', 'city', 'oib', 'trust_tier'):
                node[column] = person.get(column)
            node['entities'] = []
            if person.get('oib'):
                cur.execute(links_sql, (person['oib'],))
                node['entities'].extend(dict(link) for link in cur.fetchall())
            graph.append(node)
    return graph
|
||
|
||
|
||
@router.post("/enrich/forensic/{finding_id}")
def enrich_forensic_v2(finding_id: int,
                       body: dict = Body(default=None),
                       x_user_email: Optional[str] = Header(default=None),
                       x_user_id: Optional[int] = Header(default=None)):
    """Enrich a forensic finding: gather Wiki + DDG snippets + civic graph,
    write back to civic.forensic_findings.related_entities, and seal the
    payload hash on Polygon (or queue for sealing).

    Args:
        finding_id: primary key in civic.forensic_findings.
        body: optional JSON object; ``name`` overrides the name derived from
            the finding.
        x_user_email, x_user_id: caller identity headers, recorded in the
            enrichment history and passed to the seal call.

    Raises:
        HTTPException 404: the finding does not exist.
        HTTPException 400: no name was supplied and none could be derived.

    Returns:
        dict with ``finding_id``, ``name``, ``sources``, ``related_entities``,
        ``related_count``, ``enrichment_metadata`` and ``seal``. A sealing
        failure is reported as ``seal = {'error': ...}``, never raised.
    """
    body = body or {}
    # str() so a non-string "name" (e.g. a number) cannot crash on .strip().
    explicit_name = str(body.get('name') or '').strip() or None

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""SELECT id, finding_type, severity, title, description,
                              entities_involved, raw_data, related_entities, enrichment_metadata
                       FROM civic.forensic_findings WHERE id=%s""", (finding_id,))
        finding = cur.fetchone()
    if not finding:
        raise HTTPException(404, "finding not found")
    finding = dict(finding)

    name = explicit_name or _extract_pep_name(finding)
    if not name:
        raise HTTPException(400, "could not derive a person/entity name; pass {name: \"…\"}")

    sources = _gather_pep_evidence(name)
    related = _related_entities_for_pep(name)

    # Compact payload that gets hashed and sealed; the full source extracts are
    # only returned to the caller.
    payload = {
        'finding_id': finding_id,
        'name': name,
        'sources': [{'source': s.get('source'), 'url': s.get('url'),
                     'title': s.get('title')} for s in sources],
        'related_entities': related,
        'enriched_at': datetime.now(timezone.utc).isoformat(),
    }

    # Persist back to the finding
    enrichment_meta = finding.get('enrichment_metadata') or {}
    if not isinstance(enrichment_meta, dict): enrichment_meta = {}
    history = enrichment_meta.get('history')
    # Tolerate a malformed stored value the same way the dict guard above does.
    history = list(history) if isinstance(history, list) else []
    history.append({
        'at': payload['enriched_at'],
        'sources': payload['sources'],
        'related_count': len(related),
        'user': x_user_email,
    })
    enrichment_meta['history'] = history[-10:]  # keep the 10 most recent runs
    enrichment_meta['enriched_at'] = payload['enriched_at']
    enrichment_meta['enriched_by'] = x_user_email or 'system'
    enrichment_meta['source_count'] = len(sources)

    with _db() as c, c.cursor() as cur:
        cur.execute("""UPDATE civic.forensic_findings
                       SET related_entities = %s::jsonb,
                           enrichment_metadata = %s::jsonb
                       WHERE id=%s
                       RETURNING id""",
                    (json.dumps(related, default=str, ensure_ascii=False),
                     json.dumps(enrichment_meta, default=str, ensure_ascii=False),
                     finding_id))
        cur.fetchone()

    # Seal the enrichment payload hash on Polygon (or queue if no key)
    seal_result: dict[str, Any] = {}
    try:
        try:
            from blockchain import seal as _seal_mod  # noqa: E402
        except Exception:
            # Fall back to the deployment directory; add it to sys.path only
            # once so repeated calls do not keep growing the path.
            import sys as _ssys
            if '/opt/pgz-sport' not in _ssys.path:
                _ssys.path.insert(0, '/opt/pgz-sport')
            from blockchain import seal as _seal_mod  # noqa: E402
        h = _seal_mod.hash_payload(payload)
        seal_result = _seal_mod.seal_to_polygon(
            data_hash=h,
            ref_id=str(finding_id),
            action='forensic.enriched',
            ref_type='forensic_finding',
            payload=payload,
            user_id=x_user_id,
            user_email=x_user_email,
        )
    except Exception as e:
        # Sealing is best-effort: the enrichment is already persisted.
        seal_result = {'error': f'{type(e).__name__}: {e}'}

    return {
        'finding_id': finding_id,
        'name': name,
        'sources': sources,
        'related_entities': related,
        'related_count': len(related),
        'enrichment_metadata': enrichment_meta,
        'seal': seal_result,
    }
|
||
|
||
|
||
from fastapi import Path as _FPath
|
||
|
||
@router.post("/enrich/{kind:str}/{eid:int}")
def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                   eid: int = 0,
                   nocache: int = 0):
    """Enrichment preview with a cache in front and slow-call telemetry.

    Pass ?nocache=1 to bypass the cache lookup (the fresh result is still
    stored). The response reports field coverage of the row, the proposed
    values next to the current ones, the sources consulted and the apply URL.
    Cache hits carry ``_cache == 'hit'``, fresh results ``'miss'``.
    """
    started = time.time()
    cache_key = f"enrich:v1:{kind}:{eid}"

    if not nocache:
        hit = _cache_get(cache_key)
        if hit is not None:
            hit['_cache'] = 'hit'
            hit['_cache_ttl_s'] = ENRICH_CACHE_TTL
            try:
                _enrich_slow_log(kind, eid, time.time() - started, cached=True)
            except Exception:
                pass  # telemetry must never break the response
            return hit

    row = _load_row(kind, eid)
    proposers = {'klub': _propose_for_klub, 'savez': _propose_for_savez}
    res = proposers.get(kind, _propose_for_sportas)(row)

    # Columns that count towards the coverage percentage, per kind.
    tracked = {
        'klub': ['oib','sport','grad','predsjednik','tajnik','web','email','telefon',
                 'sjediste','godina_osnutka','ciljevi','opis_djelatnosti'],
        'savez': ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka'],
    }
    keys = tracked.get(kind)
    if keys is None:
        keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija',
                'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg',
                'dominantna_noga','oib']

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    primary = None
    for column in ('web', 'web_stranica', 'source_url', 'scrape_url', 'profile_url'):
        if row.get(column):
            primary = row.get(column)
            break

    missing = [k for k in keys if not row.get(k)]
    filled = len(keys) - len(missing)
    coverage = round(filled / len(keys) * 100)

    proposed = res['proposed']
    current = {k: row.get(k) for k in proposed.keys()}
    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}

    live_snippet = _fetch_title(primary) if primary else None
    research_links = _research_links(naziv, kind, grad, sport=row.get('sport'), row=row)
    fed = _sport_fed(row.get('sport'))
    fed_national = (fed.get('national') or {}) if fed else {}
    fed_pgz = (fed.get('pgz') or {}) if fed else {}

    result = {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': live_snippet,
        'research_links': research_links,
        'sport': row.get('sport'),
        'sport_federation': {
            'national': fed_national.get('name'),
            'national_url': fed_national.get('url'),
            'pgz': fed_pgz.get('name'),
        },
        'sources': res['sources'],
        'current': current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
        '_cache': 'miss',
        '_cache_ttl_s': ENRICH_CACHE_TTL,
    }

    elapsed = time.time() - started
    try:
        _cache_set(cache_key, result, ttl=ENRICH_CACHE_TTL)
    except Exception:
        pass  # caching is best-effort
    try:
        if elapsed >= ENRICH_SLOW_LOG_THRESHOLD:
            _enrich_slow_log(kind, eid, elapsed, cached=False)
    except Exception:
        pass  # telemetry is best-effort
    return result
|
||
|
||
|
||
_TABLE_MAP = {
|
||
'klub': ('pgz_sport.klubovi',
|
||
{'web','email','telefon','predsjednik','tajnik',
|
||
'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
|
||
'savez': ('pgz_sport.savezi',
|
||
{'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
|
||
'sportas': ('pgz_sport.clanovi',
|
||
{'biografija','profile_url','source_url','slika_url','hns_igrac_id',
|
||
'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
|
||
'tezina_kg','dominantna_noga','oib'}),
|
||
}
|
||
|
||
|
||
def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Write whitelisted enrichment fields to one row and log the attempt.

    Inside a single transaction the row is locked (``SELECT ... FOR UPDATE``),
    every field that is whitelisted for ``kind``, non-blank and currently
    empty on the row is set, ``metadata`` receives ``enriched_at``,
    ``enrichment_source`` and a 10-entry ``enrichment_history``, and one row
    is inserted into pgz_sport.enrichment_log.

    On a unique-constraint violation the update is rolled back, nothing is
    reported as applied, a best-effort ``metadata.enrichment_block`` marker is
    written and the log row carries the constraint name in ``error``.

    Args:
        kind: ``klub`` | ``savez`` | ``sportas``.
        eid: primary key of the target row.
        fields: column -> proposed value.
        sources: source dicts (``source``, ``url``) recorded in metadata and
            the log; entries that are not dicts are ignored.
        user_email: caller identity for the history and the log.

    Raises:
        HTTPException 400: unknown ``kind``.
        HTTPException 404: the row does not exist.

    Returns:
        ``{'applied': {...}, 'after': {...}}`` where ``after`` is a snapshot
        of a few well-known columns of the row after the operation.
    """
    import logging

    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, allowed = _TABLE_MAP[kind]
    log = logging.getLogger("enrich")

    # `sources` can come straight from a request body: drop anything that is
    # not a dict so the .get() calls below cannot raise.
    if isinstance(sources, (list, tuple)):
        sources = [s for s in sources if isinstance(s, dict)]
    else:
        sources = []

    # Use a manual transaction so SELECT ... FOR UPDATE actually holds the row
    # lock until we COMMIT. _db() returns autocommit=True connections, so we
    # flip it off locally for this function only.
    c = _db()
    c.autocommit = False
    error_msg = None  # populated on UniqueViolation, recorded in enrichment_log
    try:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
            before = cur.fetchone()
            if not before:
                raise HTTPException(404, kind + " not found")
            before = dict(before)

            sets, params, applied = [], [], {}
            for k, v in (fields or {}).items():
                if k not in allowed:
                    continue  # whitelist also keeps column names safe to interpolate
                if v is None or str(v).strip() == '':
                    continue
                if before.get(k):
                    continue  # never overwrite existing
                sets.append(f"{k} = %s")
                params.append(v)
                applied[k] = v

            meta_in = before.get('metadata') or {}
            if not isinstance(meta_in, dict):
                meta_in = {}
            now_iso = datetime.now(timezone.utc).isoformat()
            meta_in['enriched_at'] = now_iso
            meta_in['enrichment_source'] = [s.get('source') for s in sources if s.get('source')]
            history = meta_in.get('enrichment_history')
            # A malformed stored value must not break the apply path.
            history = list(history) if isinstance(history, list) else []
            history.append({
                'at': now_iso,
                'fields': list(applied.keys()),
                'sources': meta_in['enrichment_source'],
                'urls': [s.get('url') for s in sources if s.get('url')],
                'user': user_email,
            })
            meta_in['enrichment_history'] = history[-10:]
            sets.append("metadata = %s::jsonb")
            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))

            params.append(eid)
            try:
                cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
                after = dict(cur.fetchone())
            except psycopg2.errors.UniqueViolation as _uve:
                # The UPDATE rolled back: pretend nothing was applied so the
                # log row, the API response, and the row state all agree.
                c.rollback()
                failed_fields = list(applied.keys())
                applied = {}  # truth-in-reporting
                constraint = getattr(getattr(_uve, 'diag', None), 'constraint_name', None)
                error_msg = f"unique_violation: {constraint or 'unknown'}"
                log.warning("UniqueViolation table=%s id=%s blocked_fields=%s constraint=%s",
                            table, eid, failed_fields, constraint)
                # Fetch current row state for the audit log.
                cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
                row = cur.fetchone()
                after = dict(row) if row else {}
                # Park a do-not-retry marker on the row so workers stop hammering it.
                # A failed statement aborts the whole PostgreSQL transaction, so
                # the marker runs under a savepoint; otherwise its failure would
                # also kill the enrichment_log INSERT below.
                cur.execute("SAVEPOINT enrich_block_marker")
                try:
                    block_meta = {
                        'reason': 'unique_violation',
                        'constraint': constraint,
                        'fields': failed_fields,
                        'at': now_iso,
                    }
                    cur.execute(
                        f"UPDATE {table} "
                        f"SET metadata = COALESCE(metadata,'{{}}'::jsonb) || %s::jsonb "
                        f"WHERE id=%s",
                        (json.dumps({'enrichment_block': block_meta}, default=str), eid))
                    cur.execute("RELEASE SAVEPOINT enrich_block_marker")
                except Exception as _be:
                    # Marker is best-effort; never let it block the apply path.
                    cur.execute("ROLLBACK TO SAVEPOINT enrich_block_marker")
                    log.warning("enrichment_block write failed table=%s id=%s: %s", table, eid, _be)

            # Always log: on success error_msg is NULL, on UniqueViolation it
            # carries the constraint name and applied is empty.
            logged_keys = list(applied.keys()) + ['metadata']
            cur.execute(
                """INSERT INTO pgz_sport.enrichment_log
                   (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email, error)
                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s,%s)""",
                (kind, eid,
                 ','.join(meta_in['enrichment_source'])[:120] if meta_in['enrichment_source'] else None,
                 (sources[0].get('url') if sources else None),
                 list(applied.keys()) or None,
                 json.dumps({k: before.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 json.dumps({k: after.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 user_email,
                 error_msg))
        c.commit()
    except Exception:
        # Covers HTTPException too: undo whatever was done, then propagate.
        try:
            c.rollback()
        except Exception:
            pass
        raise
    finally:
        try:
            c.close()
        except Exception:
            pass

    snap_keys = ('id','naziv','ime','prezime','web','email','telefon',
                 'opis_djelatnosti','biografija','metadata')
    return {'applied': applied,
            'after': {k: after.get(k) for k in snap_keys if k in after}}
|
||
|
||
|
||
@router.post("/enrich/{kind:str}/{eid:int}/apply")
def enrich_apply(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'),
                 eid: int = 0,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None),
                 x_user_id: Optional[int] = Header(default=None)):
    """Apply enrichment to a row.

    Body shapes:
        None / {}         -> re-run the preview and apply every proposed field
        {"fields": {...}} -> apply only those fields (whitelist and the
                             never-overwrite rule are enforced by _apply_to_db);
                             an optional "sources" list is recorded with them

    After the write, an ``enrich.apply`` audit entry is attempted when
    something was applied, and the preview cache entry for the row is dropped.

    Raises:
        HTTPException 400: ``fields`` is present but not a JSON object.

    Returns:
        dict with ``status`` (``success`` | ``no_changes``), ``kind``, ``id``,
        ``applied_count``, ``applied_fields`` plus the ``applied`` and
        ``after`` keys from _apply_to_db.
    """
    body = body or {}
    fields = body.get('fields')
    sources = body.get('sources')
    # Reject a malformed payload up front instead of failing later on .items().
    if fields and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    # Client-supplied sources are informational; ignore anything that is not a list.
    if not isinstance(sources, list):
        sources = None
    if not fields:
        row = _load_row(kind, eid)
        if kind == 'klub': res = _propose_for_klub(row)
        elif kind == 'savez': res = _propose_for_savez(row)
        else: res = _propose_for_sportas(row)
        fields = res['proposed']
        sources = res['sources']
    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    applied = out.get('applied') or {}
    # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
    try:
        from audit_seal_router import audit_log as _audit_log
        if applied:
            _audit_log(
                action='enrich.apply',
                target_type=kind,
                target_id=eid,
                payload={'applied': applied,
                         'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]},
                user_id=x_user_id,
                user_email=x_user_email,
            )
    except Exception:
        pass  # auditing is best-effort; the DB write already succeeded
    # Invalidate cache so next preview reflects the new DB state
    try:
        _cache_delete(f"enrich:v1:{kind}:{eid}")
    except Exception:
        pass  # a stale cache entry is not worth failing the request for
    return {
        'status': 'success' if applied else 'no_changes',
        'kind': kind,
        'id': eid,
        'applied_count': len(applied),
        'applied_fields': list(applied.keys()),
        **out,
    }
|
||
|
||
|
||
@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """Read recent enrichment-log entries, newest first.

    Args:
        kind: optional filter on the ``kind`` column.
        target_id: optional filter on the ``target_id`` column.
        limit: number of rows, clamped to 1..200; 0 or missing means 50.

    Returns:
        ``{'count': n, 'rows': [...]}`` with ``created_at`` as an ISO string.
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    # Lower bound matters: a negative LIMIT is a PostgreSQL error.
    params.append(max(1, min(int(limit or 50), 200)))
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(sql, params)
        rows = [dict(r) for r in cur.fetchall()]
    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}
|
||
|
||
|
||
# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
|
||
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete suggestions for the Mreža search inputs.

    type ∈ {person, club, company, ''}; empty means all.
    Queries shorter than 2 characters return no results. ``limit`` is clamped
    to 1..50 and applies per table; the combined list is cut to ``limit*2``.

    Returns: {query, results: [{id, label, type, sub}]}
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))
    like = '%'+q+'%'
    wanted = lambda group: type in ('', group)
    results = []

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        if wanted('club'):
            # Clubs first, then federations (savezi).
            cur.execute("""
                SELECT id, naziv AS label, sport, grad
                FROM pgz_sport.klubovi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (like, limit))
            results.extend(
                {'id':'klub:'+str(rec['id']), 'label': rec['label'], 'type':'club',
                 'sub': (rec.get('sport') or '')+' · '+(rec.get('grad') or '')}
                for rec in cur.fetchall())
            cur.execute("""
                SELECT id, naziv AS label, sport
                FROM pgz_sport.savezi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (like, limit))
            results.extend(
                {'id':'savez:'+str(rec['id']), 'label': rec['label'], 'type':'savez',
                 'sub': rec.get('sport') or 'savez'}
                for rec in cur.fetchall())

        if wanted('person'):
            # Registered athletes/members, then civic persons.
            cur.execute("""
                SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
                FROM pgz_sport.clanovi c
                LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
                ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
                LIMIT %s
            """, (like, limit))
            for rec in cur.fetchall():
                club_suffix = (' · '+rec['klub_naziv']) if rec.get('klub_naziv') else ''
                results.append({'id':'sportas:'+str(rec['id']),
                                'label': (rec.get('ime') or '')+' '+(rec.get('prezime') or ''),
                                'type':'person',
                                'sub': (rec.get('sport') or 'sportaš')+club_suffix})
            cur.execute("""
                SELECT id, name AS label, function, oib, county
                FROM civic.persons
                WHERE name ILIKE %s
                ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """, (like, limit))
            results.extend(
                {'id':'civic_person:'+str(rec['id']),
                 'label': rec['label'], 'type':'person',
                 'sub': (rec.get('function') or 'civic')+' · '+(rec.get('county') or '')}
                for rec in cur.fetchall())

        if wanted('company'):
            cur.execute("""
                SELECT id, name AS label, oib, city, entity_type
                FROM civic.entities
                WHERE name ILIKE %s
                ORDER BY length(name) LIMIT %s
            """, (like, limit))
            results.extend(
                {'id':'civic_entity:'+str(rec['id']),
                 'label': rec['label'], 'type':'company',
                 'sub': (rec.get('entity_type') or 'tvrtka')+' · '+(rec.get('city') or '')}
                for rec in cur.fetchall())

    return {'query': q, 'results': results[:limit*2]}
|
||
|
||
|
||
# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
|
||
@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Enrich a forensic finding with a Wikipedia summary.

    Candidate person names are taken from ``entities_involved`` (dict or list
    form) or, failing that, from the first capitalised "Ime Prezime" pair in
    the title. The first three candidates are tried against ``_wiki_summary``
    until one returns a result. The outcome is stored under
    ``raw_data.enrichment``; ``ai_analysis`` is filled with the wiki extract
    only when it is currently NULL.

    Raises HTTPException 404 when the finding does not exist.
    """
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, finding_type, severity, title, description, entities_involved,
                   raw_data, ai_analysis
            FROM civic.forensic_findings WHERE id=%s
        """, (finding_id,))
        record = cur.fetchone()
        if not record:
            raise HTTPException(404, "finding not found")
        record = dict(record)

        # Derive person name candidates
        names = []
        involved = record.get('entities_involved')
        if isinstance(involved, dict):
            names.extend(str(involved[key])
                         for key in ('person','name','osoba','PEP','pep')
                         if involved.get(key))
            # Also try persons: [...] list
            for person in (involved.get('persons') or involved.get('osobe') or []):
                if isinstance(person, dict) and person.get('name'):
                    names.append(person['name'])
                elif isinstance(person, str):
                    names.append(person)
        elif isinstance(involved, list):
            for item in involved:
                if isinstance(item, str):
                    names.append(item)
                elif isinstance(item, dict):
                    # Only the first populated key of each entry is used.
                    for key in ('name','person','label'):
                        if item.get(key):
                            names.append(str(item[key]))
                            break
        if not names and record.get('title'):
            # Heuristic: extract first capitalised "Ime Prezime" pair
            pair = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})', record['title'])
            if pair:
                names.append(pair.group(0))

        wiki = None
        used_query = None
        for attempt in names[:3]:
            wiki = _wiki_summary(attempt)
            if wiki:
                used_query = attempt
                break

        # Build enrichment payload
        enrichment = {
            'queried': names[:5],
            'used_query': used_query,
            'wiki': wiki,
            'enriched_at': datetime.now(timezone.utc).isoformat(),
        }

        # Persist into raw_data.enrichment; a non-dict raw_data is preserved
        # under the '_legacy' key rather than discarded.
        stored = record.get('raw_data')
        if stored is None:
            stored = {}
        elif not isinstance(stored, dict):
            stored = {'_legacy': stored}
        stored['enrichment'] = enrichment

        cur.execute("""
            UPDATE civic.forensic_findings
            SET raw_data = %s::jsonb,
                ai_analysis = COALESCE(ai_analysis, %s)
            WHERE id = %s
        """, (json.dumps(stored, default=str, ensure_ascii=False),
              (wiki or {}).get('extract'),
              finding_id))
        c.commit()

    return {
        'finding_id': finding_id,
        'queried': names[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }
|
||
|
||
|
||
# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
|
||
def _forensic_like_pattern(text: str) -> str:
    """Return a ``%text%`` ILIKE pattern with LIKE metacharacters escaped.

    Backslash is PostgreSQL's default LIKE escape character, so a backslash,
    ``%`` or ``_`` inside *text* is prefixed with it and matches only itself.
    """
    escaped = text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    return f'%{escaped}%'


@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """Search civic.persons by name and attach entity links, findings and a risk score.

    Body: {"name": "<at least 3 characters>"}

    For each matched person (max 25):
      * with an OIB: load up to 50 entity links and up to 30 findings whose
        ``entities_involved`` text contains the OIB;
      * if no findings were found that way, fall back to findings whose title
        or description contains the person's name.

    Returns the matched persons plus aggregate counters and the highest
    per-person risk score.

    Raises:
        HTTPException(400): ``name`` is missing, not a string, or shorter
            than 3 characters after stripping.
    """
    raw_name = req.get('name')
    # A non-string value is rejected the same way as a too-short one instead
    # of failing on .strip().
    name = raw_name.strip() if isinstance(raw_name, str) else ''
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")

    findings_cols = "id, finding_type, severity, title, severity_score, created_at"
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        # The pattern is escaped so that user-typed % / _ are literal and
        # cannot widen the match to the whole table.
        cur.execute("""
            SELECT id, name, function, party, county, city, oib, trust_tier
            FROM civic.persons
            WHERE upper(name) ILIKE upper(%s)
            ORDER BY oib NULLS LAST, id LIMIT 25
        """, (_forensic_like_pattern(name),))
        persons = [dict(r) for r in cur.fetchall()]

        for p in persons:
            p['links'] = []
            p['findings'] = []
            if p.get('oib'):
                cur.execute("""
                    SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                           e.entity_type, e.city, e.risk_score
                    FROM civic.person_entity_links pel
                    LEFT JOIN civic.entities e ON e.id = pel.entity_id
                    WHERE pel.person_oib = %s LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]

                cur.execute(f"""
                    SELECT {findings_cols}
                    FROM civic.forensic_findings
                    WHERE entities_involved::text ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (_forensic_like_pattern(str(p['oib'])),))
                p['findings'] = [dict(r) for r in cur.fetchall()]

            if not p['findings']:
                # Fallback: no OIB-based hit, so look for the name in free text.
                name_pattern = _forensic_like_pattern(p['name'])
                cur.execute(f"""
                    SELECT {findings_cols}
                    FROM civic.forensic_findings
                    WHERE title ILIKE %s OR description ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (name_pattern, name_pattern))
                p['findings'] = [dict(r) for r in cur.fetchall()]

    total_links = total_findings = crit_findings = 0
    for p in persons:
        links = p.get('links') or []
        findings = p.get('findings') or []
        severe = sum(1 for f in findings if f.get('severity') in ('CRITICAL', 'HIGH'))

        total_links += len(links)
        total_findings += len(findings)
        crit_findings += severe

        # Per-person score: public function (+30), party (+15), links (5 each,
        # capped at 40), findings (10 each, capped at 40), plus 20 per
        # CRITICAL/HIGH finding; the total is capped at 100.
        score = 0
        if (p.get('function') or '').strip():
            score += 30
        if (p.get('party') or '').strip():
            score += 15
        score += min(40, len(links) * 5)
        score += min(40, len(findings) * 10)
        score += 20 * severe
        p['risk_score'] = min(100, score)

    overall = max((p.get('risk_score', 0) for p in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall, 'total_links': total_links,
            'total_findings': total_findings, 'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}
|
||
|
||
|
||
|
||
# ─── SB-3 — Bulk enrichment ─────────────────────────────────────────────
|
||
_BULK_KEY_MAP = {
|
||
'klub': ('pgz_sport.klubovi',
|
||
('oib','sport','grad','predsjednik','tajnik','web','email','telefon',
|
||
'sjediste','godina_osnutka','ciljevi','opis_djelatnosti')),
|
||
'savez': ('pgz_sport.savezi',
|
||
('oib','sport','predsjednik','tajnik','email','telefon','web',
|
||
'adresa','godina_osnutka')),
|
||
'sportas': ('pgz_sport.clanovi',
|
||
('sport','profile_url','slika_url','hns_igrac_id','biografija',
|
||
'datum_rodenja','mjesto_rodenja','broj_dresa')),
|
||
}
|
||
|
||
|
||
def _coverage_sql(prefix: str, keys: tuple[str, ...]) -> str:
|
||
parts = [f"(CASE WHEN {prefix}{k} IS NOT NULL AND ({prefix}{k}::text) <> '' THEN 1 ELSE 0 END)"
|
||
for k in keys]
|
||
return f"((({' + '.join(parts)})::numeric * 100) / {len(keys)})"
|
||
|
||
|
||
def _bulk_pick(kind: str, limit: int, coverage_max: int) -> list[int]:
    """Return up to *limit* random row ids of *kind* whose coverage is below *coverage_max*.

    Coverage is the percentage produced by ``_coverage_sql`` over the columns
    listed for the kind in ``_BULK_KEY_MAP``. For 'klub' and 'sportas' only
    rows with ``aktivan = TRUE`` are considered.

    Raises:
        HTTPException(400): *kind* is not a key of ``_BULK_KEY_MAP``.
    """
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")

    table, keys = _BULK_KEY_MAP[kind]
    coverage_expr = _coverage_sql('', keys)
    # Both kinds that get a filter use the same one.
    active_filter = "AND aktivan = TRUE" if kind in ('klub', 'sportas') else ''

    query = (f"SELECT id FROM {table} "
             f"WHERE 1=1 {active_filter} "
             f"AND {coverage_expr} < %s "
             f"ORDER BY random() LIMIT %s")

    with _db() as c, c.cursor() as cur:
        cur.execute(query, (coverage_max, limit))
        return [record[0] for record in cur.fetchall()]
|
||
|
||
|
||
def _bulk_int_param(body: dict, key: str, default: int, lo: int, hi: int) -> int:
    """Read integer *key* from *body*, defaulting when absent, clamped to [lo, hi].

    Only a missing/None/empty-string value falls back to *default*; an
    explicit ``0`` is kept (and then clamped). A value that cannot be
    converted raises HTTPException(400).
    """
    raw = body.get(key)
    if raw is None or raw == '':
        raw = default
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError):
        raise HTTPException(400, f"{key} must be an integer") from None
    return max(lo, min(value, hi))


@router.post("/enrich/bulk")
def enrich_bulk(body: dict = Body(default=None),
                x_user_email: Optional[str] = Header(default=None),
                x_user_id: Optional[int] = Header(default=None)):
    """Run preview+apply over N random under-enriched rows of one kind.

    Body: {kind: 'klub'|'savez'|'sportas', limit: 50, coverage_max: 70}
      * limit is clamped to 1..200, coverage_max to 0..100.
    Returns aggregate stats. Synchronous (use polling, not SSE).

    Per-row failures do not abort the run: they are reported as
    ``{'id': ..., 'error': ...}`` entries in ``items``.

    Raises:
        HTTPException(400): unknown ``kind`` or non-integer
            ``limit`` / ``coverage_max``.
    """
    body = body or {}
    kind = (body.get('kind') or '').strip().lower()
    if kind not in _BULK_KEY_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    limit = _bulk_int_param(body, 'limit', 50, 1, 200)
    coverage_max = _bulk_int_param(body, 'coverage_max', 70, 0, 100)

    if kind == 'klub':
        propose = _propose_for_klub
    elif kind == 'savez':
        propose = _propose_for_savez
    else:
        propose = _propose_for_sportas

    # Audit logging is best-effort: resolve the hook once; if the module is
    # unavailable the bulk run proceeds without it.
    try:
        from audit_seal_router import audit_log as _audit_log
    except Exception:
        _audit_log = None

    ids = _bulk_pick(kind, limit, coverage_max)
    items: list[dict] = []
    fields_total = 0
    started = time.time()

    for eid in ids:
        try:
            row = _load_row(kind, eid)
            res = propose(row)
            proposed = res.get('proposed') or {}
            srcs = res.get('sources') or []
            if not proposed:
                items.append({'id': eid, 'applied': 0, 'fields': []})
                continue
            out = _apply_to_db(kind, eid, proposed, srcs, x_user_email)
            applied = out.get('applied') or {}
            fields_total += len(applied)
            items.append({'id': eid, 'applied': len(applied), 'fields': list(applied.keys())})
            if applied and _audit_log is not None:
                try:
                    _audit_log(action='enrich.bulk.apply',
                               target_type=kind, target_id=eid,
                               payload={'applied': applied},
                               user_id=x_user_id, user_email=x_user_email)
                except Exception:
                    # The row is already applied; an audit failure must not
                    # turn it into an error item.
                    pass
        except HTTPException as e:
            items.append({'id': eid, 'error': e.detail})
        except Exception as e:
            items.append({'id': eid, 'error': f'{type(e).__name__}: {e}'})

    return {
        'status': 'success',
        'kind': kind,
        'requested': limit,
        'processed': len(items),
        'fields_total': fields_total,
        'elapsed_s': round(time.time() - started, 1),
        'items': items,
    }
|
||
|
||
|
||
# ─── SB-4 — Worker status / control ─────────────────────────────────────
|
||
_REDIS_KEYS = {
|
||
'heartbeat': 'cc:pgz-enricher:heartbeat',
|
||
'pause': 'cc:pgz-enricher:pause',
|
||
'run_now': 'cc:pgz-enricher:run_now',
|
||
'last_cycle': 'cc:pgz-enricher:last_cycle',
|
||
'confidence': 'cc:pgz-enricher:confidence',
|
||
'fields_24h': 'cc:pgz-enricher:fields_24h',
|
||
}
|
||
|
||
|
||
def _redis_client():
|
||
try:
|
||
import redis
|
||
except Exception:
|
||
return None
|
||
host = os.environ.get('REDIS_HOST', 'localhost')
|
||
port = int(os.environ.get('REDIS_PORT', '6379'))
|
||
pwd = (os.environ.get('REDIS_PASS') or '').strip().strip("'").strip('"') or None
|
||
# Try with password first (prod); fall back to anonymous (dev box) on AUTH failure.
|
||
for p in (pwd, None):
|
||
try:
|
||
r = redis.Redis(host=host, port=port, password=p,
|
||
decode_responses=True, socket_connect_timeout=2)
|
||
r.ping()
|
||
return r
|
||
except Exception:
|
||
continue
|
||
return None
|
||
|
||
|
||
@router.get("/enrich/worker/status")
def enrich_worker_status():
    """Report enrichment-worker state from Redis plus recent enrichment_log rows.

    Returns ``{'available': False}`` when Redis cannot be reached. Otherwise
    the result carries heartbeat, heartbeat_age_s, paused, run_now_pending,
    last_cycle (when set), confidence_threshold (default 0.7), fields_24h and
    ``recent`` (latest 25 log rows, empty on database error). A failure while
    reading Redis values is reported under ``error`` instead of raising.
    """
    r = _redis_client()
    out = {'available': bool(r)}
    if not r:
        return out

    try:
        hb = r.get(_REDIS_KEYS['heartbeat'])
        out['heartbeat'] = int(hb) if hb else None
        out['heartbeat_age_s'] = (int(time.time()) - int(hb)) if hb else None
        out['paused'] = (r.get(_REDIS_KEYS['pause']) or '0') == '1'
        out['run_now_pending'] = (r.get(_REDIS_KEYS['run_now']) or '0') == '1'

        last = r.get(_REDIS_KEYS['last_cycle'])
        if last:
            try:
                out['last_cycle'] = json.loads(last)
            except ValueError:
                # Not valid JSON (JSONDecodeError is a ValueError): expose the
                # raw stored string instead.
                out['last_cycle'] = last

        conf = r.get(_REDIS_KEYS['confidence'])
        out['confidence_threshold'] = float(conf) if conf else 0.7

        f24 = r.get(_REDIS_KEYS['fields_24h'])
        out['fields_24h'] = int(f24) if f24 and f24.isdigit() else 0
    except Exception as e:
        out['error'] = f'{type(e).__name__}: {e}'

    # Recent enrichment_log rows for live activity
    try:
        with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""SELECT id, kind, target_id, source, fields_set, user_email, created_at
                           FROM pgz_sport.enrichment_log
                           ORDER BY id DESC LIMIT 25""")
            rows = []
            for rr in cur.fetchall():
                rr = dict(rr)
                if rr.get('created_at'):
                    rr['created_at'] = rr['created_at'].isoformat()
                rows.append(rr)
            out['recent'] = rows
    except Exception:
        # The activity feed is optional; status is still returned without it.
        out['recent'] = []
    return out
|
||
|
||
|
||
@router.post("/enrich/worker/pause")
def enrich_worker_pause(body: dict = Body(default=None)):
    # Store the worker pause flag in Redis as '1' / '0'.
    # Body: {"paused": <truthy|falsy>}; a missing body or key means pause.
    # Responds 503 when no Redis client can be obtained.
    payload = body or {}
    want_paused = bool(payload.get('paused', True))

    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')

    flag = '1' if want_paused else '0'
    client.set(_REDIS_KEYS['pause'], flag)
    return {'status': 'success', 'paused': want_paused}
|
||
|
||
|
||
@router.post("/enrich/worker/run-now")
def enrich_worker_run_now():
    # Set the run_now flag in Redis to '1'.
    # Responds 503 when no Redis client can be obtained.
    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')

    client.set(_REDIS_KEYS['run_now'], '1')
    return {'status': 'success', 'queued': True}
|
||
|
||
|
||
@router.post("/enrich/worker/confidence")
def enrich_worker_confidence(body: dict = Body(...)):
    # Store the confidence threshold in Redis.
    # Body: {"value": <number in 0..1>}.
    # Responds 400 when the value is not convertible to float or lies outside
    # 0..1, and 503 when no Redis client can be obtained.
    try:
        threshold = float(body.get('value'))
    except Exception:
        raise HTTPException(400, 'value must be number 0..1')

    # Keep the chained comparison: NaN fails it and is rejected here too.
    within_bounds = 0.0 <= threshold <= 1.0
    if not within_bounds:
        raise HTTPException(400, 'value out of range 0..1')

    client = _redis_client()
    if not client:
        raise HTTPException(503, 'redis unavailable')

    client.set(_REDIS_KEYS['confidence'], str(threshold))
    return {'status': 'success', 'confidence_threshold': threshold}
|