M12.1: enrich v3 — preview + /apply persists to DB (klubovi/savezi/clanovi)

- POST /v2/enrich/{kind}/{eid} now scrapes Wikipedia HR + sport-pgz.hr +
  primary site, runs relevance filter so contact info from off-topic pages
  isn't lifted, optionally calls DeepSeek for opis_djelatnosti, returns
  {current, proposed, sources, last_enriched_at} for diff UI.
- POST /v2/enrich/{kind}/{eid}/apply UPDATES klubovi/savezi/clanovi for
  whitelisted empty fields, sets metadata.enriched_at +
  metadata.enrichment_source + metadata.enrichment_history, writes a row
  to pgz_sport.enrichment_log (new table).
- GET /v2/enrich/log read-back endpoint.
- Tested on klub 3 (KK Kvarner 2010): opis_djelatnosti persisted; metadata
  carries enriched_at + sources.
- New tables/columns: pgz_sport.enrichment_log; metadata jsonb on klubovi/savezi.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
CC6 Worker
2026-05-05 00:14:17 +02:00
parent 21be7ff42b
commit 85fd51bfd9
+677 -224
View File
@@ -1,310 +1,763 @@
"""
enrich_router.py — Round-2/3B enrichment + forensic-scan endpoints
Author: dradulic@outlook.com Date: 2026-05-04 (R2), 2026-05-05 (R3B)
enrich_router.py — v3 enrichment + forensic scan
Author: dradulic@outlook.com / damir@rinet.one
Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)
Surfaces "Obogati podatke" buttons for klubovi, savezi, sportasi, plus
the Forenzika "Pokreni novu analizu" scan endpoint that searches civic.*.
POST /v2/enrich/{kind}/{eid}
Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
primary club URL if any), regex-extract candidate fields (web/email/
telefon), optionally synthesise descriptions via DeepSeek, and return
a *preview* shape with `proposed` updates the operator can apply.
Strategy:
1) Read what's already in DB and surface fields the frontend may not have shown.
2) Build curated research URLs (Google, Wikipedia HR, Sportilus, sport-pgz.hr,
HNS Semafor) so the operator can verify or expand by hand.
3) If the entity has a `web` URL set, quickly fetch the page and extract
<title> + <meta description> to return as a "live snippet". 5s timeout, fail-soft.
4) /forensic/scan — match name across civic.persons, return entity links,
forensic_findings hits, and a synthesised risk score.
5) /enrich/{kind}/{id}/apply — fetch best web source for entity and UPDATE the
row's web/email/telefon fields when missing.
POST /v2/enrich/{kind}/{eid}/apply
Body shapes:
None / {} → re-run preview, apply every proposed field
{"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced)
Performs UPDATE on the matching table, sets metadata.enriched_at and
metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
returns the after snapshot.
GET /v2/enrich/log?kind=&target_id=&limit=
Read recent enrichment-log entries.
POST /v2/forensic/scan
Search civic.persons by name, return entity links + findings + risk score.
Kinds: klub | savez | sportas
"""
import os, re, json, time, urllib.parse, urllib.request, html
from __future__ import annotations
import os, re, json, time, html, urllib.parse, urllib.request
from datetime import datetime, timezone
from typing import Any, Optional
import psycopg2, psycopg2.extras
from fastapi import APIRouter, HTTPException, Body
from fastapi import APIRouter, HTTPException, Header, Body
router = APIRouter()
# pgz-sport.service inherits PG_HOST=localhost:5432 from /opt/.env.rinet, which is
# wrong (local PG is disabled). Force the Server B DSN if env says localhost.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))
if _pgh in ('localhost', '127.0.0.1'):
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))

# SECURITY NOTE(review): the PG_PASS fallback below is a literal credential
# committed to the repository. It is kept only so existing deployments that
# rely on the default keep working; rotate it and require PG_PASS from the
# environment as soon as possible.
DB = dict(host=_pgh, port=_pgp,
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))

# User-Agent sent with every outbound scrape / API request.
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
TIMEOUT = 6  # seconds — fail-soft

# DeepSeek is optional: an empty key disables description synthesis.
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                              'https://api.deepseek.com/v1/chat/completions')
# ─── DB helpers ──────────────────────────────────────────────────────────
def _db():
    """Open a new psycopg2 connection from the module-level DB settings.

    The connection is switched to autocommit before it is returned.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
def _fetch_one(sql, p):
    """Run *sql* with parameters *p* and return the first row as a dict.

    Returns None when the query yields no row. The query is executed exactly
    once and the connection is always closed afterwards: psycopg2's
    ``with connection`` only ends the transaction, it does not close the
    connection, so relying on it alone leaks one connection per call.
    """
    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            row = cur.fetchone()
    finally:
        conn.close()
    return dict(row) if row else None
def _fetch_title(url, timeout=5):
if not url: return None
try:
if not url.startswith('http'):
return None
req = urllib.request.Request(url, headers={'User-Agent': UA})
with urllib.request.urlopen(req, timeout=timeout) as r:
data = r.read(40000).decode('utf-8','ignore')
title_m = re.search(r'<title[^>]*>([^<]+)</title>', data, re.I)
desc_m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', data, re.I)
og_desc_m = re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', data, re.I)
return {
'url': url,
'title': html.unescape(title_m.group(1).strip())[:300] if title_m else None,
'description': html.unescape((desc_m or og_desc_m).group(1).strip())[:500] if (desc_m or og_desc_m) else None,
'fetched_at': int(time.time()),
}
except Exception as e:
return {'url': url, 'error': str(e)[:120]}
def _research_links(naziv, kind, grad=None):
base_q = (naziv or '').strip()
if grad: q = base_q + ' ' + grad
else: q = base_q
qenc = urllib.parse.quote(q)
out = [
{'label':'Google', 'icon':'🔍', 'url':'https://www.google.com/search?q='+qenc},
{'label':'Wikipedia HR', 'icon':'📚', 'url':'https://hr.wikipedia.org/w/index.php?search='+qenc},
{'label':'sport-pgz.hr', 'icon':'🏅', 'url':'https://sport-pgz.hr/?s='+qenc},
]
if kind == 'klub':
out.append({'label':'Sportilus', 'icon':'', 'url':'https://www.sportilus.com/?s='+qenc})
out.append({'label':'Sudski registar', 'icon':'', 'url':'https://sudreg.pravosudje.hr/registar/oc/index.html'})
if kind == 'sportas':
out.append({'label':'HNS Semafor', 'icon':'', 'url':'https://semafor.hns.family/?s='+qenc})
out.append({'label':'transfermarkt', 'icon':'', 'url':'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query='+qenc})
if kind == 'savez':
out.append({'label':'sport-pgz.hr savezi', 'icon':'🏅', 'url':'https://sport-pgz.hr/savezi'})
# ─── HTTP helpers ────────────────────────────────────────────────────────
def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch *url* and return at most ~150 kB of its body as text, fail-soft.

    Returns None for an empty / non-HTTP(S) URL and for any network or
    protocol error (callers treat a missing page as "no evidence").

    Decoding: UTF-8 first. If the only decode error sits at the very end of a
    body that hit the read cap, the cap merely cut a multi-byte character in
    half, so the valid prefix is returned. Otherwise the body is decoded as
    latin-1, which cannot fail.
    """
    max_bytes = 150000
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(max_bytes)
    except Exception:
        # Deliberate best-effort boundary: scraping must never break a request.
        return None
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError as exc:
        truncated_tail = len(data) >= max_bytes and exc.start >= len(data) - 3
        if truncated_tail:
            # Everything before exc.start is valid UTF-8 (first error position).
            return data[:exc.start].decode('utf-8', 'ignore')
        return data.decode('latin-1')
def _strip_tags(s: str) -> str:
    """Reduce an HTML fragment to plain text with collapsed whitespace.

    <script> and <style> elements are dropped together with their content,
    remaining tags are replaced by a space, entities are unescaped, and runs
    of whitespace become a single space. Falsy input yields ''.
    """
    if not s:
        return ''
    text = s
    for block_pattern in (r'<script[^>]*>.*?</script>',
                          r'<style[^>]*>.*?</style>'):
        text = re.sub(block_pattern, ' ', text, flags=re.S | re.I)
    text = html.unescape(re.sub(r'<[^>]+>', ' ', text))
    return re.sub(r'\s+', ' ', text).strip()
def _meta_content(html_doc: str, attr: str, key: str) -> Optional[str]:
    """Return the content of the first <meta {attr}="{key}" content="..."> tag.

    Both attribute orders are accepted, and the value runs up to the *matching*
    closing quote, so an apostrophe inside a double-quoted value is kept.
    """
    key_part = rf'(?<![\w-]){attr}\s*=\s*(?P<kq>["\']){re.escape(key)}(?P=kq)'
    patterns = (
        # key attribute first, content second
        rf'<meta\s[^>]*?{key_part}[^>]*?(?<![\w-])content\s*=\s*(?P<cq>["\'])(?P<val>.*?)(?P=cq)',
        # content first: keep the value inside the tag so the lazy match
        # cannot wander into a following element
        rf'<meta\s[^>]*?(?<![\w-])content\s*=\s*(?P<cq>["\'])(?P<val>[^<>]*?)(?P=cq)[^>]*?{key_part}',
    )
    for pattern in patterns:
        m = re.search(pattern, html_doc, re.I | re.S)
        if m and m.group('val').strip():
            return m.group('val').strip()
    return None


def _extract_meta(html_doc: str, url: str) -> dict:
    """Extract <title> and a description from an HTML document.

    Returns {} for an empty document. Otherwise the result always carries
    'url' and 'fetched_at' (epoch seconds) and, when found, 'title' (max 300
    chars) and 'description' (max 600 chars). The description comes from
    <meta name="description">, falling back to <meta property="og:description">.
    """
    if not html_doc:
        return {}
    out = {'url': url, 'fetched_at': int(time.time())}
    m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
    if m:
        out['title'] = html.unescape(m.group(1).strip())[:300]
    desc = (_meta_content(html_doc, 'name', 'description')
            or _meta_content(html_doc, 'property', 'og:description'))
    if desc:
        out['description'] = html.unescape(desc)[:600]
    return out
@router.post("/enrich/{kind}/{eid}")
def enrich(kind: str, eid: int):
if kind not in ('klub','savez','sportas'):
raise HTTPException(400, "kind must be klub|savez|sportas")
def _fetch_title(url, timeout=5):
    """Fetch *url* and return its title/description metadata.

    Returns None when no URL was given, an error stub
    ({'url', 'error': 'fetch failed'}) when the fetch produced nothing, and
    the `_extract_meta` result otherwise.
    """
    body = _http_get(url, timeout=timeout)
    if body:
        return _extract_meta(body, url)
    if url:
        return {'url': url, 'error': 'fetch failed'}
    return None
# ─── Field extractors ───────────────────────────────────────────────────
RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
RE_PHONE = re.compile(r'(?:\+?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
RE_URL = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)
def _find_email(text: str) -> Optional[str]:
    """Return the first plausible e-mail address found in *text*, lower-cased.

    Addresses containing any known placeholder / tracking / asset marker are
    skipped. Returns None when nothing usable is found.
    """
    if not text:
        return None
    junk_markers = ('@example.', '@test.', '@email.', 'wixpress.com',
                    'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
    for hit in RE_EMAIL.finditer(text):
        address = hit.group(0).lower().rstrip('.,;:)')
        if not any(marker in address for marker in junk_markers):
            return address
    return None
def _find_phone(text: str) -> Optional[str]:
    """Return the first phone-like number in *text*, lightly normalised.

    A candidate must contain 8-13 digits. Whitespace runs are collapsed to a
    single space for every return path. Numbers written with a '+385' or
    '00385' country prefix are returned as '+385 <rest>', with any separator
    characters (space, '-', '/') directly after the prefix removed.
    Returns None when no candidate qualifies.
    """
    if not text:
        return None
    for m in RE_PHONE.finditer(text):
        cleaned = re.sub(r'\s+', ' ', m.group(0)).strip()
        digits = re.sub(r'\D', '', cleaned)
        if not 8 <= len(digits) <= 13:
            continue
        for prefix in ('+385', '00385'):
            if cleaned.startswith(prefix):
                # Strip all separator kinds at once so "+385 - 51" or
                # "+385/ 51" cannot leave a stray leading space behind.
                return '+385 ' + cleaned[len(prefix):].lstrip(' -/')
        return cleaned
    return None
def _find_official_web(text: str, hint: str = '') -> Optional[str]:
    """Pick the most likely official website URL mentioned in *text*.

    URLs whose host contains any blocked fragment (aggregators, social
    networks, registries, our own domains) are discarded. If *hint* is given,
    the first URL whose host (dots and dashes removed) contains the first 8
    alphanumeric characters of the lower-cased hint wins; otherwise the first
    surviving URL is returned. Returns None when nothing survives.
    """
    if not text:
        return None
    blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
               'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
               'sportilus.com', 'transfermarkt.com', 'wikidata.org',
               'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
               'rinet.one', 'pgz.hr')
    kept = []  # (url, host) pairs in document order
    for hit in RE_URL.finditer(text):
        url = hit.group(0).rstrip('.,;:)\'"')
        try:
            host = urllib.parse.urlparse(url).hostname or ''
        except Exception:
            continue
        if host and not any(fragment in host for fragment in blocked):
            kept.append((url, host))
    if not kept:
        return None
    slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8] if hint else ''
    if slug:
        for url, host in kept:
            if slug in host.replace('-', '').replace('.', ''):
                return url
    return kept[0][0]
# ─── External sources ────────────────────────────────────────────────────
def _wiki_summary(query: str) -> Optional[dict]:
    """Look up *query* as a page title on the Croatian Wikipedia summary API.

    Returns a dict with 'source', 'url', 'title', 'extract' and 'description',
    or None when the query is empty, the request fails, the response is not
    valid JSON, the page is a disambiguation page, or it has no extract.
    """
    if not query:
        return None
    page_title = urllib.parse.quote(query.replace(' ', '_'), safe='')
    raw = _http_get('https://hr.wikipedia.org/api/rest_v1/page/summary/' + page_title,
                    timeout=5)
    if not raw:
        return None
    try:
        data = json.loads(raw)
        if 'extract' not in data or data.get('type') == 'disambiguation':
            return None
        desktop = data.get('content_urls', {}).get('desktop', {})
        return {
            'source': 'wikipedia.hr',
            'url': desktop.get('page'),
            'title': data.get('title'),
            'extract': data.get('extract'),
            'description': data.get('description'),
        }
    except Exception:
        return None
def _sport_pgz_search(query: str) -> Optional[dict]:
    """Search sport-pgz.hr for *query* and describe the first hit.

    The first result link is taken from the search page (an <article> bookmark
    link, else any absolute sport-pgz.hr link with 6-180 chars of link text).
    If the hit page cannot be fetched, only source/url/title are returned;
    otherwise title, extract and up to 4000 chars of stripped page text.
    Returns None when the query is empty or no hit is found.
    """
    if not query:
        return None
    listing = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
    if not listing:
        return None
    link = re.search(
        r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
        listing, re.S | re.I)
    if not link:
        link = re.search(
            r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>',
            listing, re.I)
    if not link:
        return None
    hit = link.group(1)
    link_title = html.unescape(link.group(2).strip())
    body = _http_get(hit, timeout=6)
    if not body:
        return {'source': 'sport-pgz.hr', 'url': hit, 'title': link_title}
    text = _strip_tags(body)[:4000]
    meta = _extract_meta(body, hit)
    return {
        'source': 'sport-pgz.hr',
        'url': hit,
        'title': meta.get('title') or link_title,
        'extract': meta.get('description') or text[:500],
        'raw_text': text,
    }
def _fetch_primary_site(url: str) -> Optional[dict]:
    """Fetch the entity's own site and package it as an evidence source.

    Returns None when the page cannot be fetched. 'source' is the URL's host
    name (or the URL itself when no host can be parsed); 'raw_text' holds up
    to 8000 chars of tag-stripped text.
    """
    body = _http_get(url, timeout=6)
    if not body:
        return None
    plain = _strip_tags(body)
    meta = _extract_meta(body, url)
    host = urllib.parse.urlparse(url).hostname
    return {
        'source': host or url,
        'url': url,
        'title': meta.get('title'),
        'extract': meta.get('description') or plain[:500],
        'raw_text': plain[:8000],
    }
# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a short Croatian description of *naziv*, fail-soft.

    *evidence* is a list of text snippets; non-empty ones are joined and capped
    at 6000 chars. Returns None when no API key is configured, there is no
    usable evidence, the request fails, or the model returns empty text.
    """
    if not (DEEPSEEK_KEY and evidence):
        return None
    joined = "\n---\n".join(e for e in evidence if e)[:6000]
    if not joined.strip():
        return None
    prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
              f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
              f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{joined}")
    messages = [
        {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."},
        {"role": "user", "content": prompt},
    ]
    payload = {"model": "deepseek-chat", "messages": messages,
               "max_tokens": 280, "temperature": 0.3}
    headers = {'Authorization': 'Bearer ' + DEEPSEEK_KEY,
               'Content-Type': 'application/json',
               'User-Agent': UA}
    req = urllib.request.Request(DEEPSEEK_URL,
                                 data=json.dumps(payload).encode('utf-8'),
                                 headers=headers, method='POST')
    try:
        with urllib.request.urlopen(req, timeout=20) as r:
            reply = json.loads(r.read().decode('utf-8'))
        first_choice = reply.get('choices', [{}])[0]
        text = first_choice.get('message', {}).get('content', '').strip()
        return text or None
    except Exception:
        return None
# ─── Row loaders & display name ─────────────────────────────────────────
def _load_row(kind: str, eid: int) -> dict:
    """Load the row to enrich for *kind* ('klub' | 'savez' | 'sportas').

    Selects the columns the enrichment pipeline reads, including `metadata`.

    Raises:
        HTTPException(400): unknown *kind*.
        HTTPException(404): no row with id *eid*.
    """
    if kind == 'klub':
        row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                   web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
                   sjediste, godina_osnutka, savez_id, scrape_url, source_url,
                   metadata
            FROM pgz_sport.klubovi WHERE id=%s""", (eid,))
    elif kind == 'savez':
        row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
                   adresa, godina_osnutka, source_url, metadata
            FROM pgz_sport.savezi WHERE id=%s""", (eid,))
    elif kind == 'sportas':
        row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url, scrape_url,
                   slika_url, source_url, hns_igrac_id, biografija, metadata
            FROM pgz_sport.clanovi WHERE id=%s""", (eid,))
    else:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    if not row:
        raise HTTPException(404, kind + " not found")
    return row
# Live web snippet from primary URL
primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url') or row.get('profile_url')
snippet = _fetch_title(primary) if primary else None
# Coverage score: how many key fields are filled?
def _display_name(kind: str, row: dict) -> str:
    """Return the human-readable name for a row.

    For 'sportas' this is "ime prezime" (missing parts treated as empty,
    result stripped); for every other kind it is `naziv`, or '' when unset.
    """
    if kind != 'sportas':
        return row.get('naziv', '') or ''
    first = row.get('ime') or ''
    last = row.get('prezime') or ''
    return f"{first} {last}".strip()
def _research_links(naziv, kind, grad=None):
    """Build curated manual-research links for an entity.

    The search query is *naziv* (stripped), followed by *grad* when given.
    Every kind gets Google, Wikipedia HR and sport-pgz.hr search links;
    'klub', 'sportas' and 'savez' each add their own extra links. Returns a
    list of {'label', 'icon', 'url'} dicts.
    """
    base_q = (naziv or '').strip()
    q = (base_q + ' ' + grad) if grad else base_q
    qenc = urllib.parse.quote(q)
    out = [
        {'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
    ]
    if kind == 'klub':
        out.append({'label': 'Sportilus', 'icon': '', 'url': 'https://www.sportilus.com/?s=' + qenc})
        # The court-register link is a fixed entry page, not a search URL.
        out.append({'label': 'Sudski registar', 'icon': '', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
    if kind == 'sportas':
        out.append({'label': 'HNS Semafor', 'icon': '', 'url': 'https://semafor.hns.family/?s=' + qenc})
        out.append({'label': 'transfermarkt', 'icon': '', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
    if kind == 'savez':
        out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
    return out
# ─── Proposal pipelines ─────────────────────────────────────────────────
def _name_tokens(naziv: str) -> list[str]:
    """Return the significant tokens of an entity name.

    The name is NFKD-normalised, reduced to ASCII and lower-cased, then split
    on non-alphanumerics; tokens shorter than 4 chars are dropped. Generic
    words (club/sport/region terms) are removed unless that would leave
    nothing, in which case the unfiltered tokens are returned.
    """
    import unicodedata
    generic = {'klub', 'udruga', 'sportski', 'sport', 'kosarkaski', 'kosarka', 'nogometni',
               'rukometni', 'savez', 'rijeka', 'primorsko', 'goranski', 'grad', 'grada', 'centar'}
    folded = unicodedata.normalize('NFKD', naziv or '')
    folded = folded.encode('ascii', 'ignore').decode('ascii').lower()
    long_tokens = [tok for tok in re.split(r'[^a-z0-9]+', folded) if len(tok) >= 4]
    specific = [tok for tok in long_tokens if tok not in generic]
    return specific if specific else long_tokens
def _is_relevant(source: dict, tokens: list[str]) -> bool:
    """Tell whether a scraped source mentions the entity.

    The source's title, extract and raw_text are concatenated, lower-cased and
    de-accented the same way as the name tokens; the source is relevant when
    any token occurs as a substring. With no tokens, everything is relevant.
    """
    if not tokens:
        return True
    import unicodedata
    parts = [source.get(field) or '' for field in ('title', 'extract', 'raw_text')]
    haystack = unicodedata.normalize('NFKD', ' '.join(parts).lower())
    haystack = haystack.encode('ascii', 'ignore').decode('ascii')
    for token in tokens:
        if token in haystack:
            return True
    return False
def _propose_for_klub(row: dict) -> dict:
    """Scrape sources for a club and propose values for its empty fields.

    Sources, in order: the club's primary URL (web / web_stranica / source_url
    / scrape_url), Wikipedia HR, sport-pgz.hr. web/email/telefon are proposed
    only from sources that mention the club name; opis_djelatnosti comes from
    DeepSeek when available, else from the first extract of at least 80 chars.
    Returns {'proposed': {...}, 'sources': [...]}.
    """
    naziv = row.get('naziv') or ''
    primary = (row.get('web') or row.get('web_stranica')
               or row.get('source_url') or row.get('scrape_url'))

    def _text_of(src: dict) -> str:
        return src.get('raw_text') or src.get('extract') or ''

    sources = []
    site = _fetch_primary_site(primary) if primary else None
    if site:
        sources.append(site)
    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
    listing = _sport_pgz_search(naziv)
    if listing:
        sources.append(listing)
    evidence = [_text_of(src) for src in sources]

    tokens = _name_tokens(naziv)
    relevant = [src for src in sources if _is_relevant(src, tokens)]
    relevant_blob = '\n\n'.join(_text_of(src) for src in relevant)

    proposed: dict[str, Any] = {}
    # web/email/telefon: ONLY from sources actually mentioning the entity
    contact_finders = (
        ('web', lambda: _find_official_web(relevant_blob, naziv)),
        ('email', lambda: _find_email(relevant_blob)),
        ('telefon', lambda: _find_phone(relevant_blob)),
    )
    for field, finder in contact_finders:
        if row.get(field):
            continue
        value = finder()
        if value:
            proposed[field] = value

    if not row.get('opis_djelatnosti'):
        descr_evidence = [_text_of(src) for src in relevant] or evidence
        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
        if not descr:
            # No LLM text: fall back to the first reasonably long extract.
            for src in (relevant or sources):
                if src.get('extract') and len(src['extract']) >= 80:
                    descr = src['extract']
                    break
        if descr:
            proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}
def _propose_for_savez(row: dict) -> dict:
    """Scrape sources for a federation and propose its empty contact fields.

    Sources, in order: the primary URL (web / source_url), Wikipedia HR,
    sport-pgz.hr. web/email/telefon are proposed only when currently empty and
    only from sources that mention the federation's name.
    Returns {'proposed': {...}, 'sources': [...]}.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('source_url')

    sources = []
    pdoc = _fetch_primary_site(primary) if primary else None
    if pdoc:
        sources.append(pdoc)
    wiki = _wiki_summary(naziv)
    if wiki:
        sources.append(wiki)
    spz = _sport_pgz_search(naziv)
    if spz:
        sources.append(spz)

    tokens = _name_tokens(naziv)
    relevant = [s for s in sources if _is_relevant(s, tokens)]
    relevant_blob = '\n\n'.join((s.get('raw_text') or s.get('extract') or '') for s in relevant)

    proposed: dict[str, Any] = {}
    if not row.get('web'):
        u = _find_official_web(relevant_blob, naziv)
        if u:
            proposed['web'] = u
    if not row.get('email'):
        e = _find_email(relevant_blob)
        if e:
            proposed['email'] = e
    if not row.get('telefon'):
        t = _find_phone(relevant_blob)
        if t:
            proposed['telefon'] = t
    return {'proposed': proposed, 'sources': sources}
def _propose_for_sportas(row: dict) -> dict:
    """Propose a biography for an athlete from Wikipedia HR.

    Only `biografija` is ever proposed, and only when it is currently empty
    and a Wikipedia summary was found: DeepSeek text if available, else the
    raw Wikipedia extract. Returns {'proposed': {...}, 'sources': [...]}.
    """
    full_name = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    wiki = _wiki_summary(full_name)
    sources = [wiki] if wiki else []
    evidence = [wiki.get('extract') or ''] if wiki else []

    proposed: dict[str, Any] = {}
    if evidence and not row.get('biografija'):
        bio = _deepseek_describe(full_name, 'sportaš', evidence)
        if not bio and wiki:
            bio = wiki.get('extract')
        if bio:
            proposed['biografija'] = bio.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}
# ─── Endpoints ──────────────────────────────────────────────────────────
@router.post("/enrich/{kind}/{eid}")
def enrich_preview(kind: str, eid: int):
    """Preview enrichment for one entity without writing anything.

    Loads the row (400 on unknown kind, 404 on missing row via `_load_row`),
    runs the kind-specific proposal pipeline, and returns coverage statistics,
    research links, the scraped sources, and a current/proposed pair for the
    diff UI, plus the last-enrichment markers stored in `metadata`.
    """
    row = _load_row(kind, eid)
    if kind == 'klub':
        res = _propose_for_klub(row)
    elif kind == 'savez':
        res = _propose_for_savez(row)
    else:
        res = _propose_for_sportas(row)

    # Fields that count towards the coverage percentage, per kind.
    if kind == 'klub':
        keys = ['oib', 'sport', 'grad', 'predsjednik', 'tajnik', 'web', 'email', 'telefon',
                'sjediste', 'godina_osnutka', 'ciljevi', 'opis_djelatnosti']
    elif kind == 'savez':
        keys = ['oib', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon', 'web', 'adresa', 'godina_osnutka']
    else:
        keys = ['sport', 'profile_url', 'slika_url', 'hns_igrac_id', 'biografija']

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
               or row.get('scrape_url') or row.get('profile_url'))
    filled = sum(1 for k in keys if row.get(k))
    coverage = round(filled / len(keys) * 100)
    missing = [k for k in keys if not row.get(k)]

    proposed = res['proposed']
    current = {k: row.get(k) for k in proposed}
    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}

    return {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad),
        'sources': res['sources'],
        'current': current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
    }
# ── R3B P4 — FORENSIC SCAN ──────────────────────────────────────────
# Apply whitelist: kind -> (target table, set of column names _apply_to_db may
# write). Submitted fields that are not in the set are skipped by _apply_to_db.
_TABLE_MAP = {
    'klub': ('pgz_sport.klubovi',
             {'web','email','telefon','predsjednik','tajnik',
              'opis_djelatnosti','ciljevi','godina_osnutka','sjediste'}),
    'savez': ('pgz_sport.savezi',
              {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
    'sportas': ('pgz_sport.clanovi',
                {'biografija','profile_url','slika_url'}),
}
def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Persist enrichment *fields* for one entity and log the change.

    Only whitelisted columns (see `_TABLE_MAP`) that are currently empty in
    the row and have a non-blank new value are written; existing values are
    never overwritten. `metadata` is always updated with `enriched_at`,
    `enrichment_source` and a rolling `enrichment_history` (last 10 entries),
    and one row is inserted into `pgz_sport.enrichment_log`.

    The row lock, the UPDATE and the log INSERT run in a single explicit
    transaction (autocommit is switched off for this connection so that
    `SELECT ... FOR UPDATE` actually holds its lock), and the connection is
    always closed.

    Args:
        kind: 'klub' | 'savez' | 'sportas'.
        eid: primary key of the row.
        fields: column -> value candidates; non-dict input is treated as empty.
        sources: list of source dicts ('source', 'url'); may come from the
            request body, so non-dict items are ignored.
        user_email: recorded in history and log, may be None.

    Returns:
        {'applied': {column: value}, 'after': snapshot of key columns}.

    Raises:
        HTTPException(400) for an unknown kind, HTTPException(404) when the
        row does not exist.
    """
    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, allowed = _TABLE_MAP[kind]

    # `sources` may be client-supplied: keep only well-formed entries.
    src_dicts = [s for s in (sources or []) if isinstance(s, dict)]
    src_names = [str(s['source']) for s in src_dicts if s.get('source')]
    src_urls = [str(s['url']) for s in src_dicts if s.get('url')]
    candidate_fields = fields if isinstance(fields, dict) else {}

    conn = _db()
    try:
        conn.autocommit = False  # one transaction: lock + update + log
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # `table` and column names come from _TABLE_MAP, never from input.
            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
            before = cur.fetchone()
            if not before:
                raise HTTPException(404, kind + " not found")
            before = dict(before)

            sets, params, applied = [], [], {}
            for k, v in candidate_fields.items():
                if k not in allowed:
                    continue
                if v is None or str(v).strip() == '':
                    continue
                if before.get(k):
                    continue  # never overwrite existing
                sets.append(f"{k} = %s")
                params.append(v)
                applied[k] = v

            meta_in = before.get('metadata') or {}
            if not isinstance(meta_in, dict):
                meta_in = {}
            now_iso = datetime.now(timezone.utc).isoformat()
            meta_in['enriched_at'] = now_iso
            meta_in['enrichment_source'] = src_names
            history = meta_in.get('enrichment_history')
            if not isinstance(history, list):
                history = []
            history.append({
                'at': now_iso,
                'fields': list(applied),
                'sources': src_names,
                'urls': src_urls,
                'user': user_email,
            })
            meta_in['enrichment_history'] = history[-10:]

            sets.append("metadata = %s::jsonb")
            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
            params.append(eid)
            cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
            after = dict(cur.fetchone())

            logged_keys = list(applied) + ['metadata']
            cur.execute(
                """INSERT INTO pgz_sport.enrichment_log
                   (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
                (kind, eid,
                 ','.join(src_names)[:120] if src_names else None,
                 (src_dicts[0].get('url') if src_dicts else None),
                 list(applied) or None,
                 json.dumps({k: before.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 json.dumps({k: after.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 user_email))
    finally:
        conn.close()

    snap_keys = ('id', 'naziv', 'ime', 'prezime', 'web', 'email', 'telefon',
                 'opis_djelatnosti', 'biografija', 'metadata')
    return {'applied': applied,
            'after': {k: after.get(k) for k in snap_keys if k in after}}
@router.post("/enrich/{kind}/{eid}/apply")
def enrich_apply(kind: str, eid: int,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None)):
    """Apply enrichment to one entity and return the after-snapshot.

    Body shapes:
        None / {} (or an empty "fields") -> re-run the preview pipeline and
            apply every proposed field, using the freshly scraped sources.
        {"fields": {...}, "sources": [...]} -> apply only those fields; the
            whitelist and emptiness rules in `_apply_to_db` still apply.

    Raises HTTPException(400) when "fields" is not an object or "sources" is
    not a list, instead of failing later with a 500.

    NOTE(review): `x_user_email` is taken from a request header as-is and is
    only used for the audit trail; nothing in this handler authenticates it.
    """
    body = body or {}
    fields = body.get('fields')
    sources = body.get('sources')
    if fields is not None and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if sources is not None and not isinstance(sources, list):
        raise HTTPException(400, "sources must be a list")
    if not fields:
        row = _load_row(kind, eid)
        if kind == 'klub':
            res = _propose_for_klub(row)
        elif kind == 'savez':
            res = _propose_for_savez(row)
        else:
            res = _propose_for_sportas(row)
        fields = res['proposed']
        sources = res['sources']
    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)
    return {'kind': kind, 'id': eid, **out}
@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """Return recent enrichment-log entries, newest first.

    Optional filters: *kind* and *target_id* (an explicit 0 is honoured as a
    filter). *limit* defaults to 50 and is clamped to 1..200 so a negative
    value can never reach the SQL LIMIT clause. `created_at` is returned as an
    ISO-8601 string. Response: {'count': n, 'rows': [...]}.
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id is not None:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    params.append(max(1, min(int(limit or 50), 200)))

    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, params)
            rows = [dict(r) for r in cur.fetchall()]
    finally:
        # psycopg2's `with connection` would not close it; do so explicitly.
        conn.close()

    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}
# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete suggestions for the Mreža search inputs.
    type ∈ {person, club, company, ''} — empty means all.
    Queries shorter than 2 characters return no results; limit is clamped to
    1..50 per query and the combined list is cut to 2*limit entries.
    Returns: {query, results: [{id, label, type, sub}]}
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))
    pattern = '%' + q + '%'
    results = []

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:

        def rows_for(sql):
            # Every suggestion query takes the same (ILIKE pattern, limit) pair.
            cur.execute(sql, (pattern, limit))
            return cur.fetchall()

        if type in ('', 'club'):
            for r in rows_for("""
                SELECT id, naziv AS label, sport, grad
                FROM pgz_sport.klubovi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """):
                results.append({'id': 'klub:' + str(r['id']), 'label': r['label'], 'type': 'club',
                                'sub': (r.get('sport') or '') + ' · ' + (r.get('grad') or '')})
            for r in rows_for("""
                SELECT id, naziv AS label, sport
                FROM pgz_sport.savezi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """):
                results.append({'id': 'savez:' + str(r['id']), 'label': r['label'], 'type': 'savez',
                                'sub': r.get('sport') or 'savez'})

        if type in ('', 'person'):
            for r in rows_for("""
                SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
                FROM pgz_sport.clanovi c
                LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
                ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
                LIMIT %s
            """):
                club = r.get('klub_naziv')
                club_suffix = ' · ' + club if club else ''
                results.append({'id': 'sportas:' + str(r['id']),
                                'label': (r.get('ime') or '') + ' ' + (r.get('prezime') or ''),
                                'type': 'person',
                                'sub': (r.get('sport') or 'sportaš') + club_suffix})
            for r in rows_for("""
                SELECT id, name AS label, function, oib, county
                FROM civic.persons
                WHERE name ILIKE %s
                ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """):
                results.append({'id': 'civic_person:' + str(r['id']),
                                'label': r['label'], 'type': 'person',
                                'sub': (r.get('function') or 'civic') + ' · ' + (r.get('county') or '')})

        if type in ('', 'company'):
            for r in rows_for("""
                SELECT id, name AS label, oib, city, entity_type
                FROM civic.entities
                WHERE name ILIKE %s
                ORDER BY length(name) LIMIT %s
            """):
                results.append({'id': 'civic_entity:' + str(r['id']),
                                'label': r['label'], 'type': 'company',
                                'sub': (r.get('entity_type') or 'tvrtka') + ' · ' + (r.get('city') or '')})

    return {'query': q, 'results': results[:limit * 2]}
# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
def _finding_name_candidates(finding: dict) -> list:
    """Collect person-name candidates from a forensic finding, in priority order.

    Names are read from `entities_involved` (dict or list shape); if none are
    found there, the first capitalised "Ime Prezime" pair in the title is used.
    Blank entries and duplicates are dropped, so each name is looked up once.
    """
    found = []
    ei = finding.get('entities_involved')
    if isinstance(ei, dict):
        for k in ('person', 'name', 'osoba', 'PEP', 'pep'):
            if ei.get(k):
                found.append(str(ei[k]))
        # Also try persons: [...] list
        for p in (ei.get('persons') or ei.get('osobe') or []):
            if isinstance(p, dict) and p.get('name'):
                found.append(str(p['name']))
            elif isinstance(p, str):
                found.append(p)
    elif isinstance(ei, list):
        for it in ei:
            if isinstance(it, dict):
                for k in ('name', 'person', 'label'):
                    if it.get(k):
                        found.append(str(it[k]))
                        break
            elif isinstance(it, str):
                found.append(it)
    if not found and finding.get('title'):
        # Heuristic: extract first capitalised "Ime Prezime" pair
        m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})',
                      finding['title'])
        if m:
            found.append(m.group(0))
    unique = []
    for name in found:
        name = name.strip()
        if name and name not in unique:
            unique.append(name)
    return unique


@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Look up the forensic finding, derive the PEP person name from
    entities_involved or title, hit Wikipedia HR for a summary (first 3
    candidates at most), and persist the result into
    civic.forensic_findings.raw_data.enrichment; ai_analysis is filled with
    the Wikipedia extract only when it is currently NULL.

    The finding is read first and the connection released before the
    Wikipedia requests are made, so no DB connection is held during network
    I/O. Raises HTTPException(404) when the finding does not exist.
    """
    f = _fetch_one("""
        SELECT id, finding_type, severity, title, description, entities_involved,
               raw_data, ai_analysis
        FROM civic.forensic_findings WHERE id=%s
    """, (finding_id,))
    if not f:
        raise HTTPException(404, "finding not found")

    candidates = _finding_name_candidates(f)
    wiki = None
    used_query = None
    for q in candidates[:3]:
        wiki = _wiki_summary(q)
        if wiki:
            used_query = q
            break

    # Build enrichment payload
    enrichment = {
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'enriched_at': datetime.now(timezone.utc).isoformat(),
    }
    # Persist into raw_data.enrichment, preserving any non-dict legacy payload
    raw = f.get('raw_data')
    if raw is None:
        raw = {}
    if not isinstance(raw, dict):
        raw = {'_legacy': raw}
    raw['enrichment'] = enrichment

    conn = _db()  # autocommit connection: the UPDATE is committed on execute
    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE civic.forensic_findings
                SET raw_data = %s::jsonb,
                    ai_analysis = COALESCE(ai_analysis, %s)
                WHERE id = %s
            """, (json.dumps(raw, default=str, ensure_ascii=False),
                  (wiki or {}).get('extract'),
                  finding_id))
    finally:
        conn.close()

    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }
# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
_SCAN_HIGH_SEVERITIES = ('CRITICAL', 'HIGH')


def _scan_like_contains(term):
    """Return an ILIKE pattern that matches *term* literally anywhere in a value.

    Backslash, ``%`` and ``_`` are escaped with a backslash (PostgreSQL's
    default LIKE escape character) so caller-supplied text cannot act as a
    wildcard.
    """
    escaped = term.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    return '%' + escaped + '%'


def _scan_risk_score(person):
    """Compute the heuristic 0-100 risk score for one scanned person dict.

    +30 when ``function`` is set, +15 when ``party`` is set, +5 per entity
    link (capped at 40), +10 per finding (capped at 40), +20 per
    CRITICAL/HIGH finding; the total is capped at 100.
    """
    links = person.get('links') or []
    findings = person.get('findings') or []
    score = 0
    if (person.get('function') or '').strip():
        score += 30
    if (person.get('party') or '').strip():
        score += 15
    score += min(40, len(links) * 5)
    score += min(40, len(findings) * 10)
    score += 20 * sum(1 for f in findings if f.get('severity') in _SCAN_HIGH_SEVERITIES)
    return min(100, score)


@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """
    Search civic.persons by name. For each match, gather linked entities
    (via the person's OIB), related forensic findings, and synthesise a
    per-person risk score plus an overall score (the per-person maximum).

    Body: {"name": "Velimir Liverić"}

    Returns a dict with keys: query, matched_persons, overall_risk_score,
    total_links, total_findings, critical_findings, persons, scanned_at.

    Raises HTTPException(400) when ``name`` is missing, not a string, or
    shorter than 3 characters after stripping.
    """
    raw_name = req.get('name')
    # A non-string name (e.g. a JSON number) is rejected like a too-short one
    # instead of crashing on .strip().
    if not isinstance(raw_name, str) or len(raw_name.strip()) < 3:
        raise HTTPException(400, "name must be at least 3 chars")
    name = raw_name.strip()

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, name, function, party, county, city, oib, trust_tier
            FROM civic.persons
            WHERE upper(name) ILIKE upper(%s)
            ORDER BY oib NULLS LAST, id
            LIMIT 25
        """, (_scan_like_contains(name),))
        persons = [dict(r) for r in cur.fetchall()]

        for p in persons:
            p['links'] = []
            p['findings'] = []
            if p.get('oib'):
                # Entity links are keyed by the person's OIB.
                cur.execute("""
                    SELECT pel.entity_id, pel.roles, e.name AS entity_name,
                           e.oib AS entity_oib, e.entity_type, e.city, e.risk_score
                    FROM civic.person_entity_links pel
                    LEFT JOIN civic.entities e ON e.id = pel.entity_id
                    WHERE pel.person_oib = %s
                    LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]
                # Findings whose entities_involved JSON text contains this OIB.
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE entities_involved::text ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC
                    LIMIT 30
                """, (_scan_like_contains(p['oib']),))
                p['findings'] = [dict(r) for r in cur.fetchall()]
            # Fallback for persons with no findings so far (including those
            # without an OIB): match the name in the finding title/description.
            if not p['findings']:
                by_name = _scan_like_contains(p['name'])
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE title ILIKE %s OR description ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC
                    LIMIT 30
                """, (by_name, by_name))
                p['findings'] = [dict(r) for r in cur.fetchall()]

    # Aggregate totals and score each person exactly once.
    total_links = total_findings = crit_findings = 0
    for p in persons:
        findings = p.get('findings') or []
        total_links += len(p.get('links') or [])
        total_findings += len(findings)
        crit_findings += sum(1 for f in findings if f.get('severity') in _SCAN_HIGH_SEVERITIES)
        p['risk_score'] = _scan_risk_score(p)

    overall = max((p.get('risk_score', 0) for p in persons), default=0)
    return {
        'query': name,
        'matched_persons': len(persons),
        'overall_risk_score': overall,
        'total_links': total_links,
        'total_findings': total_findings,
        'critical_findings': crit_findings,
        'persons': persons,
        'scanned_at': int(time.time()),
    }
# ── R3B P6 — ENRICH /apply (write enriched fields back to DB) ───────
# kind -> (table, columns this endpoint is allowed to write). Table and column
# names interpolated into SQL below come only from this constant mapping.
_APPLY_TARGETS = {
    'klub': ('pgz_sport.klubovi', ('web', 'email', 'telefon')),
    'savez': ('pgz_sport.savezi', ('web', 'email', 'telefon')),
    'sportas': ('pgz_sport.clanovi', ('biografija', 'profile_url')),
}
_APPLY_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.[a-z]{2,8}', re.I)
_APPLY_PHONE_RE = re.compile(r'\+?385[\s\-]?\d[\d\s\-/]{6,}')


def _apply_scrape_contacts(url):
    """Best-effort scrape of the first e-mail / +385 phone number from *url*.

    Returns ``(derived, error)``: ``derived`` maps 'email'/'telefon' to the
    values found (possibly empty); ``error`` is None on success or a short
    description of why nothing was fetched. Never raises.
    """
    # urlopen also understands file:/ftp: URLs; only fetch real web pages.
    if not url.lower().startswith(('http://', 'https://')):
        return {}, 'unsupported URL scheme'
    try:
        snippet = _fetch_title(url, timeout=6)
        if not (snippet and snippet.get('url')):
            return {}, None
        request = urllib.request.Request(url, headers={'User-Agent': UA})
        with urllib.request.urlopen(request, timeout=6) as resp:
            page = resp.read(80000).decode('utf-8', 'ignore')
    except Exception as exc:
        # Scraping is optional: report the failure to the caller rather than
        # failing the whole request or hiding it.
        return {}, f"{type(exc).__name__}: {exc}"
    derived = {}
    email = _APPLY_EMAIL_RE.search(page)
    if email:
        derived['email'] = email.group(0)
    phone = _APPLY_PHONE_RE.search(page)
    if phone:
        derived['telefon'] = re.sub(r'\s+', ' ', phone.group(0).strip())
    return derived, None


@router.post("/enrich/{kind}/{eid}/apply")
def enrich_apply(kind: str, eid: int, req: dict = Body(default={})):
    """
    Apply enrichment to DB. Body may contain {fields: {web, email, telefon}}
    to override the auto-derived suggestions; otherwise we apply derived ones.
    Only updates fields that are currently NULL or empty in DB (additive only).

    Returns a dict with: kind, id, proposed, applied, skipped_existing
    (whitelisted columns that already hold a value), skipped_unsupported
    (proposed keys not writable for this kind or absent from the row),
    fetch_error (None or why the live scrape yielded nothing), applied_at.

    Raises HTTPException(400) for an unknown kind and HTTPException(404)
    when no row with the given id exists.
    """
    target = _APPLY_TARGETS.get(kind)
    if target is None:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, cols = target

    body_fields = req.get('fields') if isinstance(req, dict) else None
    if not isinstance(body_fields, dict):
        body_fields = {}

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
        row = cur.fetchone()
        if not row:
            raise HTTPException(404, kind + " not found")
        row = dict(row)

        # Try a live fetch from the primary URL to glean email/phone.
        primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
                   or row.get('scrape_url') or row.get('profile_url'))
        derived, fetch_error = {}, None
        if primary:
            derived, fetch_error = _apply_scrape_contacts(str(primary).strip())

        # Merge: whitelisted, non-empty scalar body fields override derived ones.
        proposed = dict(derived)
        for key, value in body_fields.items():
            if key not in cols or isinstance(value, bool):
                continue
            if not isinstance(value, (str, int, float)):
                continue
            text = str(value).strip()
            if text:
                proposed[key] = text

        # Only apply where the column exists on the row and is currently empty.
        applied = {}
        skipped_existing = []
        skipped_unsupported = []
        for key, value in proposed.items():
            if key not in cols or key not in row:
                skipped_unsupported.append(key)
            elif row[key] is None or row[key] == '':
                applied[key] = value
            else:
                skipped_existing.append(key)

        if applied:
            assignments = ', '.join(f"{col}=%s" for col in applied)
            cur.execute(f"UPDATE {table} SET {assignments} WHERE id=%s",
                        [*applied.values(), eid])
            c.commit()

    return {
        'kind': kind, 'id': eid,
        'proposed': proposed,
        'applied': applied,
        'skipped_existing': skipped_existing,
        'skipped_unsupported': skipped_unsupported,
        'fetch_error': fetch_error,
        'applied_at': int(time.time()),
    }