ca92717039
- enrich_apply now imports audit_seal_router.audit_log and writes a sys_audit
row after every successful UPDATE: action='enrich.apply', target_type=kind,
target_id=eid, payload={applied: {...}, sources: [...]}, user from headers.
- Other modules (cc2 users, cc4 invoices/putni_nalozi, cc5 clanarine/lijecnicki/
obrasci) can call the same helper:
from audit_seal_router import audit_log
audit_log(action='users.update', target_type='users', target_id=u['id'],
payload={'changed':[...]}, user_email=actor)
- Verified: real apply on klub 4528 produced sys_audit id 102.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1003 lines
44 KiB
Python
1003 lines
44 KiB
Python
"""
|
|
enrich_router.py — v3 enrichment + forensic scan
|
|
Author: dradulic@outlook.com / damir@rinet.one
|
|
Date: 2026-05-04 (R2) → 2026-05-05 (R3 CC6 v3)
|
|
|
|
POST /v2/enrich/{kind}/{eid}
|
|
Inspect the row, scrape the web (Wikipedia HR, sport-pgz.hr search,
|
|
primary club URL if any), regex-extract candidate fields (web/email/
|
|
telefon), optionally synthesise descriptions via DeepSeek, and return
|
|
a *preview* shape with `proposed` updates the operator can apply.
|
|
|
|
POST /v2/enrich/{kind}/{eid}/apply
|
|
Body shapes:
|
|
None / {} → re-run preview, apply every proposed field
|
|
{"fields": {...}} → apply ONLY those (whitelist + emptiness still enforced)
|
|
Performs UPDATE on the matching table, sets metadata.enriched_at and
|
|
metadata.enrichment_source, writes a row to pgz_sport.enrichment_log,
|
|
returns the after snapshot.
|
|
|
|
GET /v2/enrich/log?kind=&target_id=&limit=
|
|
Read recent enrichment-log entries.
|
|
|
|
POST /v2/forensic/scan
|
|
Search civic.persons by name, return entity links + findings + risk score.
|
|
|
|
Kinds: klub | savez | sportas
|
|
"""
|
|
from __future__ import annotations
|
|
import os, re, json, time, html, urllib.parse, urllib.request
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Optional
|
|
|
|
import psycopg2, psycopg2.extras
|
|
from fastapi import APIRouter, HTTPException, Header, Body
|
|
|
|
# FastAPI router that the endpoints in this module register on.
router = APIRouter()

# Postgres endpoint: PG_HOST / PG_PORT, defaulting to 10.10.0.2:6432.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))
# A loopback PG_HOST is not used as-is: host and port are re-read from
# DB_HOST / DB_PORT (same defaults) instead.
if _pgh in ('localhost', '127.0.0.1'):
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))
# Keyword arguments passed to psycopg2.connect() by _db().
# NOTE(review): the PG_PASS fallback below is a credential committed to
# source control — it should be rotated and supplied via the environment only.
DB = dict(host=_pgh, port=_pgp,
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'))
|
|
|
|
# User-Agent header value sent with outbound HTTP requests made by this module.
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
# Default timeout argument of _http_get.
TIMEOUT = 6 # seconds — fail-soft
|
|
|
|
# Optional JS-aware fallback (Playwright). Lazy-loaded, never required.
|
|
# Put /opt/pgz-sport first on sys.path so the `enrichment` package imported
# below can be resolved from there.
import sys as _sys
_sys.path.insert(0, '/opt/pgz-sport')
try:
    from enrichment import playwright_scraper as _pw_scraper
    # The scraper module reports on its own whether Playwright is usable.
    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
    # Any failure while importing disables the fallback; callers check
    # `_HAS_PW and _pw_scraper is not None` before using it.
    _pw_scraper = None
    _HAS_PW = False
|
|
|
|
# API key for DeepSeek; an empty value makes _deepseek_describe return None.
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
# Chat-completions endpoint that _deepseek_describe POSTs to (overridable).
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                              'https://api.deepseek.com/v1/chat/completions')
|
|
|
|
|
|
# ─── DB helpers ──────────────────────────────────────────────────────────
|
|
def _db():
    """Open a new psycopg2 connection from the ``DB`` settings with autocommit on."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
|
|
|
def _fetch_one(sql, p):
    """Run ``sql`` with parameters ``p`` and return the first row as a dict.

    Returns None when the query yields no row.  The connection is closed
    explicitly: psycopg2's ``with connection`` only ends the transaction, it
    does not close the connection, so without the ``finally`` every call
    would leave one open until garbage collection.
    """
    conn = _db()
    try:
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            r = cur.fetchone()
    finally:
        conn.close()
    return dict(r) if r else None
|
|
|
|
|
|
# ─── HTTP helpers ────────────────────────────────────────────────────────
|
|
def _http_get(url: str, timeout: int = TIMEOUT) -> Optional[str]:
    """Fetch ``url`` and return at most the first 150000 bytes as text.

    Fail-soft: returns None for an empty or non-http(s) URL and for any
    error raised while requesting or reading.

    The body is decoded as UTF-8.  Because the read is capped, the last
    character may be cut in half; in that case only the truncated tail is
    dropped.  Any other decoding failure falls back to latin-1, as before.
    """
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': UA, 'Accept-Language': 'hr,en;q=0.8'})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read(150000)
        try:
            return data.decode('utf-8')
        except UnicodeDecodeError as exc:
            # A multi-byte UTF-8 character is at most 4 bytes, so an error that
            # starts within the last 3 bytes means the cap split a character.
            # Everything before exc.start has already been validated as UTF-8.
            if exc.start >= len(data) - 3:
                return data[:exc.start].decode('utf-8')
            return data.decode('latin-1', 'ignore')
    except Exception:
        # Network boundary: any failure simply means "no content".
        return None
|
|
|
|
|
|
def _strip_tags(s: str) -> str:
|
|
if not s: return ''
|
|
s = re.sub(r'<script[^>]*>.*?</script>', ' ', s, flags=re.S | re.I)
|
|
s = re.sub(r'<style[^>]*>.*?</style>', ' ', s, flags=re.S | re.I)
|
|
s = re.sub(r'<[^>]+>', ' ', s)
|
|
s = html.unescape(s)
|
|
s = re.sub(r'\s+', ' ', s).strip()
|
|
return s
|
|
|
|
|
|
def _extract_meta(html_doc: str, url: str) -> dict:
|
|
if not html_doc: return {}
|
|
out = {'url': url, 'fetched_at': int(time.time())}
|
|
m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
|
|
if m: out['title'] = html.unescape(m.group(1).strip())[:300]
|
|
m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
|
|
if not m:
|
|
m = re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
|
|
if m: out['description'] = html.unescape(m.group(1).strip())[:600]
|
|
return out
|
|
|
|
|
|
def _fetch_title(url, timeout=5):
    """Fetch ``url`` and return its title/description metadata.

    When the fetch fails the result is an error stub for a non-empty URL
    and None for an empty one.
    """
    page = _http_get(url, timeout=timeout)
    if page:
        return _extract_meta(page, url)
    if url:
        return {'url': url, 'error': 'fetch failed'}
    return None
|
|
|
|
|
|
# ─── Field extractors ───────────────────────────────────────────────────
|
|
# E-mail address, case-insensitive; the domain must end in a dot plus >= 2 letters.
RE_EMAIL = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}', re.I)
# Phone number: a "385" prefix (optional "+", optional separators) or a
# leading "0", then a digit, 6-12 digits/whitespace/dashes/slashes, and a
# closing digit.  _find_phone filters the matches further by digit count.
RE_PHONE = re.compile(r'(?:\+?385[\s\-/]*|0)\d[\d\s\-/]{6,12}\d')
# http(s) URL running up to whitespace, a quote, an angle bracket, ')' or ']'.
RE_URL = re.compile(r'https?://[^\s"\'<>)\]]+', re.I)
|
|
|
|
def _find_email(text: str) -> Optional[str]:
|
|
if not text: return None
|
|
bad = ('@example.', '@test.', '@email.', 'wixpress.com',
|
|
'sentry.io', 'jquery.com', 'googleapis', '@2x.', 'noreply@')
|
|
seen = set()
|
|
for m in RE_EMAIL.finditer(text):
|
|
e = m.group(0).lower().rstrip('.,;:)')
|
|
if any(b in e for b in bad): continue
|
|
if e in seen: continue
|
|
seen.add(e); return e
|
|
return None
|
|
|
|
def _find_phone(text: str) -> Optional[str]:
|
|
if not text: return None
|
|
for m in RE_PHONE.finditer(text):
|
|
raw = m.group(0).strip()
|
|
digits = re.sub(r'\D', '', raw)
|
|
if not (8 <= len(digits) <= 13): continue
|
|
cleaned = re.sub(r'\s+', ' ', raw).strip()
|
|
if raw.startswith('+385'): return '+385 ' + raw[4:].lstrip().lstrip('-/')
|
|
if raw.startswith('00385'): return '+385 ' + raw[5:].lstrip().lstrip('-/')
|
|
return cleaned
|
|
return None
|
|
|
|
def _find_official_web(text: str, hint: str = '') -> Optional[str]:
|
|
if not text: return None
|
|
blocked = ('wikipedia.org', 'sport-pgz.hr', 'google.com', 'facebook.com',
|
|
'instagram.com', 'youtube.com', 'twitter.com', 'wikimedia',
|
|
'sportilus.com', 'transfermarkt.com', 'wikidata.org',
|
|
'sudreg.pravosudje.hr', 'gov.hr', 'apis.google.com',
|
|
'rinet.one', 'pgz.hr')
|
|
candidates: list[str] = []
|
|
for m in RE_URL.finditer(text):
|
|
u = m.group(0).rstrip('.,;:)\'"')
|
|
try:
|
|
host = urllib.parse.urlparse(u).hostname or ''
|
|
except Exception:
|
|
continue
|
|
if not host or any(b in host for b in blocked): continue
|
|
candidates.append(u)
|
|
if not candidates: return None
|
|
if hint:
|
|
slug = re.sub(r'[^a-z0-9]', '', hint.lower())[:8]
|
|
for u in candidates:
|
|
host = urllib.parse.urlparse(u).hostname or ''
|
|
if slug and slug in host.replace('-', '').replace('.', ''):
|
|
return u
|
|
return candidates[0]
|
|
|
|
|
|
# ─── External sources ────────────────────────────────────────────────────
|
|
def _wiki_summary(query: str) -> Optional[dict]:
|
|
if not query: return None
|
|
title = urllib.parse.quote(query.replace(' ', '_'), safe='')
|
|
body = _http_get(f'https://hr.wikipedia.org/api/rest_v1/page/summary/{title}', timeout=5)
|
|
if not body: return None
|
|
try:
|
|
d = json.loads(body)
|
|
if d.get('type') == 'disambiguation' or 'extract' not in d: return None
|
|
return {
|
|
'source': 'wikipedia.hr',
|
|
'url': d.get('content_urls', {}).get('desktop', {}).get('page'),
|
|
'title': d.get('title'),
|
|
'extract': d.get('extract'),
|
|
'description': d.get('description'),
|
|
}
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _sport_pgz_search(query: str) -> Optional[dict]:
|
|
if not query: return None
|
|
page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
|
|
if not page:
|
|
# Plain HTTP failed → try JS-rendered fallback if available.
|
|
if _HAS_PW and _pw_scraper is not None:
|
|
return _pw_scraper.scrape_sport_pgz_klub(query)
|
|
return None
|
|
m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
|
|
page, re.S | re.I)
|
|
if not m:
|
|
m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
|
|
if not m:
|
|
# Search page rendered but yielded nothing parseable — try JS fallback.
|
|
if _HAS_PW and _pw_scraper is not None:
|
|
return _pw_scraper.scrape_sport_pgz_klub(query)
|
|
return None
|
|
hit = m.group(1)
|
|
body = _http_get(hit, timeout=6)
|
|
if not body:
|
|
return {'source': 'sport-pgz.hr', 'url': hit, 'title': html.unescape(m.group(2).strip())}
|
|
text = _strip_tags(body)[:4000]
|
|
meta = _extract_meta(body, hit)
|
|
return {
|
|
'source': 'sport-pgz.hr',
|
|
'url': hit,
|
|
'title': meta.get('title') or html.unescape(m.group(2).strip()),
|
|
'extract': meta.get('description') or text[:500],
|
|
'raw_text': text,
|
|
}
|
|
|
|
|
|
def _fetch_primary_site(url: str) -> Optional[dict]:
    """Fetch an entity's own website and wrap it as a source dict.

    ``source`` is the URL's hostname (or the URL itself if it has none),
    ``extract`` is the meta description or the first 500 chars of plain
    text, and ``raw_text`` holds up to 8000 chars of plain text.  Returns
    None when the page cannot be fetched.
    """
    page = _http_get(url, timeout=6)
    if not page:
        return None
    meta = _extract_meta(page, url)
    plain = _strip_tags(page)
    host = urllib.parse.urlparse(url).hostname
    return {
        'source': host or url,
        'url': url,
        'title': meta.get('title'),
        'extract': meta.get('description') or plain[:500],
        'raw_text': plain[:8000],
    }
|
|
|
|
|
|
# ─── DeepSeek (optional, fail-soft) ─────────────────────────────────────
|
|
def _deepseek_describe(naziv: str, kind: str, evidence: list[str]) -> Optional[str]:
    """Ask DeepSeek for a short Croatian description built from ``evidence``.

    Returns None when no API key is configured, when the evidence is empty
    or blank, when the request fails, or when the reply has no text.  The
    non-empty evidence snippets are joined and capped at 6000 characters.
    """
    if not DEEPSEEK_KEY or not evidence:
        return None
    corpus = "\n---\n".join(snippet for snippet in evidence if snippet)[:6000]
    if not corpus.strip():
        return None

    user_prompt = (f"Iz dolje navedenih izvora napiši profesionalni opis za "
                   f"{kind} '{naziv}' na hrvatskom jeziku. 3-5 rečenica. "
                   f"Bez uvoda 'Evo opisa', samo tekst.\n\nIZVORI:\n{corpus}")
    messages = [
        {"role": "system", "content": "Pišeš sažete činjenične opise sportskih organizacija na hrvatskom."},
        {"role": "user", "content": user_prompt},
    ]
    request_body = {
        "model": "deepseek-chat",
        "messages": messages,
        "max_tokens": 280, "temperature": 0.3,
    }
    headers = {
        'Authorization': 'Bearer ' + DEEPSEEK_KEY,
        'Content-Type': 'application/json',
        'User-Agent': UA,
    }
    req = urllib.request.Request(
        DEEPSEEK_URL, data=json.dumps(request_body).encode('utf-8'),
        headers=headers, method='POST')
    try:
        with urllib.request.urlopen(req, timeout=20) as resp:
            reply = json.loads(resp.read().decode('utf-8'))
        first_choice = reply.get('choices', [{}])[0]
        answer = first_choice.get('message', {}).get('content', '').strip()
        return answer or None
    except Exception:
        # Optional feature: any failure means "no synthesised description".
        return None
|
|
|
|
|
|
# ─── Row loaders & display name ─────────────────────────────────────────
|
|
def _load_row(kind: str, eid: int) -> dict:
    """Load the row for ``kind`` ('klub', 'savez' or 'sportas') with id ``eid``.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when no row matches.
    """
    queries = {
        'klub': """SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                   web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
                   sjediste, godina_osnutka, savez_id, scrape_url, source_url,
                   metadata
                   FROM pgz_sport.klubovi WHERE id=%s""",
        'savez': """SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
                    adresa, godina_osnutka, source_url, metadata
                    FROM pgz_sport.savezi WHERE id=%s""",
        'sportas': """SELECT id, ime, prezime, sport, klub_id, profile_url,
                      slika_url, source_url, source, source_id,
                      hns_igrac_id, biografija,
                      datum_rodenja, mjesto_rodenja, broj_dresa,
                      visina_cm, tezina_kg, dominantna_noga, oib,
                      vanjski_id, metadata
                      FROM pgz_sport.clanovi WHERE id=%s""",
    }
    sql = queries.get(kind)
    if sql is None:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    row = _fetch_one(sql, (eid,))
    if not row:
        raise HTTPException(404, kind + " not found")
    return row
|
|
|
|
|
|
def _display_name(kind: str, row: dict) -> str:
|
|
if kind == 'sportas':
|
|
return ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
|
|
return row.get('naziv', '') or ''
|
|
|
|
|
|
def _research_links(naziv, kind, grad=None):
|
|
base_q = (naziv or '').strip()
|
|
q = (base_q + ' ' + grad) if grad else base_q
|
|
qenc = urllib.parse.quote(q)
|
|
out = [
|
|
{'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
|
|
{'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
|
|
{'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
|
|
]
|
|
if kind == 'klub':
|
|
out.append({'label': 'Sportilus', 'icon': '⬡', 'url': 'https://www.sportilus.com/?s=' + qenc})
|
|
out.append({'label': 'Sudski registar', 'icon': '⚖', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
|
|
if kind == 'sportas':
|
|
out.append({'label': 'HNS Semafor', 'icon': '⚽', 'url': 'https://semafor.hns.family/?s=' + qenc})
|
|
out.append({'label': 'transfermarkt','icon': '⚽', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
|
|
if kind == 'savez':
|
|
out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
|
|
return out
|
|
|
|
|
|
# ─── Proposal pipelines ─────────────────────────────────────────────────
|
|
def _name_tokens(naziv: str) -> list[str]:
|
|
"""Significant tokens from entity name (≥4 chars, deaccented)."""
|
|
import unicodedata
|
|
s = unicodedata.normalize('NFKD', naziv or '').encode('ascii', 'ignore').decode('ascii').lower()
|
|
toks = [t for t in re.split(r'[^a-z0-9]+', s) if len(t) >= 4]
|
|
stop = {'klub','udruga','sportski','sport','kosarkaski','kosarka','nogometni',
|
|
'rukometni','savez','rijeka','primorsko','goranski','grad','grada','centar'}
|
|
return [t for t in toks if t not in stop] or toks
|
|
|
|
|
|
def _is_relevant(source: dict, tokens: list[str]) -> bool:
|
|
"""A source is 'relevant' only if the page actually mentions the entity name."""
|
|
if not tokens: return True
|
|
import unicodedata
|
|
blob = (source.get('title') or '') + ' ' + (source.get('extract') or '') + ' ' + (source.get('raw_text') or '')
|
|
blob = unicodedata.normalize('NFKD', blob.lower()).encode('ascii', 'ignore').decode('ascii')
|
|
return any(t in blob for t in tokens)
|
|
|
|
|
|
def _propose_for_klub(row: dict) -> dict:
    """Collect web sources for a club and propose values for its empty fields.

    Sources are fetched in this order: the club's own URL (first non-empty
    of web, web_stranica, source_url, scrape_url), Croatian Wikipedia, and
    the sport-pgz.hr search.  ``web``, ``email`` and ``telefon`` are taken
    only from sources that mention the club name.  ``opis_djelatnosti``
    comes from DeepSeek, or else from the first extract of at least 80
    characters.  Only fields that are empty in ``row`` are proposed.

    Returns:
        ``{'proposed': {...}, 'sources': [...]}``; ``sources`` lists every
        fetched source, relevant or not.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')

    def _body(src):
        # Best text a source offers: full page text first, short extract second.
        return src.get('raw_text') or src.get('extract') or ''

    fetched = (
        _fetch_primary_site(primary) if primary else None,
        _wiki_summary(naziv),
        _sport_pgz_search(naziv),
    )
    sources = [src for src in fetched if src]
    evidence = [_body(src) for src in sources]

    tokens = _name_tokens(naziv)
    relevant = [src for src in sources if _is_relevant(src, tokens)]
    relevant_blob = '\n\n'.join(_body(src) for src in relevant)

    proposed: dict[str, Any] = {}
    # web/email/telefon: ONLY from sources that actually mention the entity
    contact_finders = (
        ('web', lambda blob: _find_official_web(blob, naziv)),
        ('email', _find_email),
        ('telefon', _find_phone),
    )
    for column, finder in contact_finders:
        if row.get(column):
            continue
        value = finder(relevant_blob)
        if value:
            proposed[column] = value

    if not row.get('opis_djelatnosti'):
        descr_evidence = [_body(src) for src in relevant] or evidence
        descr = _deepseek_describe(naziv, 'sportski klub', descr_evidence)
        if not descr:
            for src in (relevant or sources):
                if src.get('extract') and len(src['extract']) >= 80:
                    descr = src['extract']
                    break
        if descr:
            proposed['opis_djelatnosti'] = descr.strip()[:2000]
    return {'proposed': proposed, 'sources': sources}
|
|
|
|
|
|
def _propose_for_savez(row: dict) -> dict:
    """Collect web sources for a federation and propose its contact fields.

    Sources are fetched in this order: the federation's own URL (web or
    source_url), Croatian Wikipedia, and the sport-pgz.hr search.  ``web``,
    ``email`` and ``telefon`` are proposed only when empty in ``row`` and
    only from sources that mention the federation name.

    Returns:
        ``{'proposed': {...}, 'sources': [...]}`` with every fetched source.
    """
    naziv = row.get('naziv') or ''
    primary = row.get('web') or row.get('source_url')

    fetched = (
        _fetch_primary_site(primary) if primary else None,
        _wiki_summary(naziv),
        _sport_pgz_search(naziv),
    )
    sources = [doc for doc in fetched if doc]

    tokens = _name_tokens(naziv)
    relevant_blob = '\n\n'.join(
        (doc.get('raw_text') or doc.get('extract') or '')
        for doc in sources if _is_relevant(doc, tokens))

    proposed: dict[str, Any] = {}
    contact_finders = (
        ('web', lambda blob: _find_official_web(blob, naziv)),
        ('email', _find_email),
        ('telefon', _find_phone),
    )
    for column, finder in contact_finders:
        if row.get(column):
            continue
        value = finder(relevant_blob)
        if value:
            proposed[column] = value
    return {'proposed': proposed, 'sources': sources}
|
|
|
|
|
|
# ─── HNS Semafor parsing ────────────────────────────────────────────────
|
|
# Base URL of the HNS Semafor site; player page URLs are built on top of it.
_HNS_BASE = 'https://semafor.hns.family'
|
|
|
|
def _slugify(name: str) -> str:
|
|
import unicodedata
|
|
s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower()
|
|
return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
|
|
|
|
def _hns_url_from_row(row: dict) -> Optional[str]:
|
|
"""Try to build a semafor.hns.family /igraci/ URL for this row."""
|
|
# 1) Already-set columns
|
|
for k in ('profile_url', 'source_url'):
|
|
u = row.get(k)
|
|
if u and 'semafor.hns.family/igraci/' in (u or ''):
|
|
return u
|
|
# 2) hns_igrac_id column
|
|
pid = row.get('hns_igrac_id')
|
|
if pid:
|
|
slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
|
|
return f'{_HNS_BASE}/igraci/{int(pid)}/{slug}/'
|
|
# 3) vanjski_id JSONB → hns_comet
|
|
vid = row.get('vanjski_id') or {}
|
|
if isinstance(vid, dict):
|
|
comet = vid.get('hns_comet') or vid.get('hns_pid')
|
|
slug = vid.get('hns_slug') or _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
|
|
if comet:
|
|
try:
|
|
return f'{_HNS_BASE}/igraci/{int(comet)}/{slug}/'
|
|
except Exception:
|
|
pass
|
|
# 4) source='hns_semafor' + source_id
|
|
if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
|
|
try:
|
|
slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
|
|
return f'{_HNS_BASE}/igraci/{int(row["source_id"])}/{slug}/'
|
|
except Exception:
|
|
pass
|
|
return None
|
|
|
|
|
|
def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
|
|
"""Extract structured fields from a semafor.hns.family player page."""
|
|
if not html_doc: return None
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except Exception:
|
|
return _parse_hns_player_regex(html_doc, url)
|
|
soup = BeautifulSoup(html_doc, 'html.parser')
|
|
out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
|
|
|
|
# hns_igrac_id from URL
|
|
m = re.search(r'/igraci/(\d+)/', url)
|
|
if m: out['hns_igrac_id'] = int(m.group(1))
|
|
|
|
title = soup.find('title')
|
|
if title: out['title'] = title.get_text(strip=True)[:300]
|
|
|
|
# Photo
|
|
photo = soup.find('div', class_='photo')
|
|
if photo:
|
|
img = photo.find('img')
|
|
if img and img.get('src'):
|
|
src = img['src']
|
|
if not src.startswith('http'):
|
|
src = urllib.parse.urljoin(url, src)
|
|
out['slika_url'] = src
|
|
|
|
# Player number (jersey)
|
|
pn = soup.find('div', class_='playerName')
|
|
if pn:
|
|
h3 = pn.find('h3')
|
|
if h3:
|
|
t = h3.get_text(strip=True)
|
|
if t.isdigit():
|
|
out['broj_dresa'] = int(t)
|
|
|
|
# Datum rodjenja
|
|
li = soup.find('li', class_='dob')
|
|
if li:
|
|
h4 = li.find('h4')
|
|
if h4:
|
|
t = h4.get_text(' ', strip=True)
|
|
mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
|
|
if mm:
|
|
from datetime import date as _date
|
|
try:
|
|
out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat()
|
|
except Exception:
|
|
pass
|
|
|
|
# Mjesto rodjenja
|
|
li = soup.find('li', class_='pob')
|
|
if li:
|
|
h4 = li.find('h4')
|
|
if h4:
|
|
out['mjesto_rodenja'] = h4.get_text(strip=True)
|
|
|
|
# Trenutni klub (info only — we don't reassign klub_id from here)
|
|
klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
|
|
if klub_link:
|
|
h4 = klub_link.find('h4')
|
|
if h4:
|
|
out['trenutni_klub'] = h4.get_text(strip=True)
|
|
m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '')
|
|
if m: out['hns_klub_id'] = int(m.group(1))
|
|
|
|
# Description (meta)
|
|
meta_d = soup.find('meta', attrs={'name': 'description'})
|
|
if meta_d and meta_d.get('content'):
|
|
out['description'] = meta_d['content'][:600]
|
|
|
|
# Make a clean text blob for relevance / DeepSeek
|
|
text = soup.get_text(' ', strip=True)
|
|
out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000]
|
|
out['extract'] = (out.get('description')
|
|
or (out['raw_text'][:500] if out.get('raw_text') else None))
|
|
return out
|
|
|
|
|
|
def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
|
|
"""BS4-free fallback parser."""
|
|
out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
|
|
m = re.search(r'/igraci/(\d+)/', url)
|
|
if m: out['hns_igrac_id'] = int(m.group(1))
|
|
m = re.search(r'<div class="photo"><img src="([^"]+)"', html_doc)
|
|
if m:
|
|
src = m.group(1)
|
|
if not src.startswith('http'): src = urllib.parse.urljoin(url, src)
|
|
out['slika_url'] = src
|
|
m = re.search(r'<li class="dob">.*?<h4>(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S)
|
|
if m:
|
|
from datetime import date as _date
|
|
try:
|
|
out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
|
|
except Exception:
|
|
pass
|
|
m = re.search(r'<li class="pob"><i></i><h4>([^<]+)</h4>', html_doc)
|
|
if m: out['mjesto_rodenja'] = m.group(1).strip()
|
|
m = re.search(r'<div class="playerName"><h3>(\d+)</h3>', html_doc)
|
|
if m: out['broj_dresa'] = int(m.group(1))
|
|
m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html_doc)
|
|
if m: out['description'] = m.group(1)[:600]
|
|
return out
|
|
|
|
|
|
def _hns_fetch_player(url: str) -> Optional[dict]:
    """Download an HNS Semafor player page and parse it.

    If the first plain GET (8 s timeout) fails and the Playwright scraper is
    available, the page is rendered once; when the render reports more than
    2000 characters of HTML, the plain GET is retried with a 15 s timeout.
    The render result itself is not parsed.  Returns None when no HTML could
    be obtained.
    """
    body = _http_get(url, timeout=8)
    if not body and _HAS_PW and _pw_scraper is not None:
        rendered = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
        # Only html_len is read from the render result; the markup needed
        # for parsing is fetched again over plain HTTP.
        if rendered and rendered.get('html_len', 0) > 2000:
            body = _http_get(url, timeout=15)
    if not body:
        return None
    return _parse_hns_player(body, url)
|
|
|
|
|
|
def _propose_for_sportas(row: dict) -> dict:
    """Collect sources for an athlete and propose values for empty fields.

    The HNS Semafor player page (when a URL can be derived from the row)
    supplies profile_url, source_url, slika_url, hns_igrac_id,
    datum_rodenja, mjesto_rodenja and broj_dresa.  When ``biografija`` is
    empty, Croatian Wikipedia is queried as well, and the biography comes
    from DeepSeek or else from the first source extract of at least 80
    characters.  Only fields that are empty in ``row`` are proposed.

    Returns:
        ``{'proposed': {...}, 'sources': [...]}``.
    """
    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    sources, evidence = [], []
    proposed: dict[str, Any] = {}

    # 1) HNS Semafor page for this athlete (column / vanjski_id / source_id)
    hns_url = _hns_url_from_row(row)
    hns_doc: Optional[dict] = _hns_fetch_player(hns_url) if hns_url else None
    if hns_doc:
        sources.append(hns_doc)
        evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')

        # Field-level proposals (only when the DB column is empty)
        page_url = hns_doc.get('url')
        copy_plan = (
            ('profile_url', page_url),
            ('source_url', page_url),
            ('slika_url', hns_doc.get('slika_url')),
            ('hns_igrac_id', hns_doc.get('hns_igrac_id')),
            ('datum_rodenja', hns_doc.get('datum_rodenja')),
            ('mjesto_rodenja', hns_doc.get('mjesto_rodenja')),
            ('broj_dresa', hns_doc.get('broj_dresa')),
        )
        for column, value in copy_plan:
            if value and not row.get(column):
                proposed[column] = value

    # 2) Biography: Wikipedia HR as extra evidence, DeepSeek synthesis first,
    #    first sufficiently long extract as the fallback.
    if not row.get('biografija'):
        wiki = _wiki_summary(naziv)
        if wiki:
            sources.append(wiki)
            evidence.append(wiki.get('extract') or '')

        descr = _deepseek_describe(naziv, 'sportaš', evidence) if evidence else None
        if not descr:
            descr = next(
                (src.get('extract') for src in sources
                 if src.get('extract') and len(src.get('extract')) >= 80),
                descr)
        if descr:
            proposed['biografija'] = descr.strip()[:2000]

    return {'proposed': proposed, 'sources': sources}
|
|
|
|
|
|
# ─── Endpoints ──────────────────────────────────────────────────────────
|
|
@router.post("/enrich/{kind}/{eid}")
def enrich_preview(kind: str, eid: int):
    """Preview an enrichment: load the row, gather sources, propose updates.

    Nothing is written.  The response reports field coverage, the missing
    fields, a live title snippet of the primary URL, manual research links,
    the sources consulted, and ``current`` versus ``proposed`` values.
    """
    row = _load_row(kind, eid)  # raises 400 / 404 for a bad kind or id

    proposers = {'klub': _propose_for_klub, 'savez': _propose_for_savez}
    res = proposers.get(kind, _propose_for_sportas)(row)

    # Columns counted towards the coverage figure, per kind.
    coverage_keys = {
        'klub': ['oib', 'sport', 'grad', 'predsjednik', 'tajnik', 'web', 'email', 'telefon',
                 'sjediste', 'godina_osnutka', 'ciljevi', 'opis_djelatnosti'],
        'savez': ['oib', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon', 'web',
                  'adresa', 'godina_osnutka'],
    }
    sportas_keys = ['sport', 'profile_url', 'slika_url', 'hns_igrac_id', 'biografija',
                    'datum_rodenja', 'mjesto_rodenja', 'broj_dresa', 'visina_cm', 'tezina_kg',
                    'dominantna_noga', 'oib']
    keys = coverage_keys.get(kind, sportas_keys)

    naziv = _display_name(kind, row)
    grad = row.get('grad') if kind == 'klub' else None
    primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
               or row.get('scrape_url') or row.get('profile_url'))

    missing = [k for k in keys if not row.get(k)]
    filled = len(keys) - len(missing)
    coverage = round(filled / len(keys) * 100)

    proposed = res['proposed']
    current = {k: row.get(k) for k in proposed}
    meta = row.get('metadata') or {}
    if not isinstance(meta, dict):
        meta = {}

    return {
        'kind': kind, 'id': eid, 'naziv': naziv,
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
        'research_links': _research_links(naziv, kind, grad),
        'sources': res['sources'],
        'current': current,
        'proposed': proposed,
        'last_enriched_at': meta.get('enriched_at'),
        'last_enrichment_source': meta.get('enrichment_source'),
        'enriched_at': int(time.time()),
        'apply_url': f'/sport/api/v2/enrich/{kind}/{eid}/apply',
    }
|
|
|
|
|
|
# kind → (table name, set of column names _apply_to_db is allowed to write).
# Keys in an apply request that are not in the set are silently ignored.
# NOTE(review): 'adresa' is whitelisted for 'klub' but is not among the columns
# _load_row selects for clubs — confirm the column exists on pgz_sport.klubovi.
_TABLE_MAP = {
    'klub': ('pgz_sport.klubovi',
             {'web','email','telefon','predsjednik','tajnik',
              'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
    'savez': ('pgz_sport.savezi',
              {'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
    'sportas': ('pgz_sport.clanovi',
                {'biografija','profile_url','source_url','slika_url','hns_igrac_id',
                 'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
                 'tezina_kg','dominantna_noga','oib'}),
}
|
|
|
|
|
|
def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: Optional[str]):
    """Write enrichment ``fields`` to the row and record what was done.

    Only whitelisted columns (see ``_TABLE_MAP``) with a non-blank value are
    written, and only where the current column value is empty — existing
    data is never overwritten.  ``metadata`` always receives ``enriched_at``,
    ``enrichment_source`` and a new ``enrichment_history`` entry (last 10
    kept), and one row is inserted into ``pgz_sport.enrichment_log``.

    The row lock, UPDATE and log INSERT run in one explicit transaction so
    that ``FOR UPDATE`` holds until the write, and the connection is always
    closed.

    Args:
        kind: 'klub', 'savez' or 'sportas'.
        eid: primary key of the target row.
        fields: column → value candidates.
        sources: source dicts (``source`` / ``url`` keys); entries that are
            not dicts are ignored.
        user_email: operator recorded in the history and log, may be None.

    Returns:
        ``{'applied': {...}, 'after': {...}}`` with the columns written and
        a small snapshot of the updated row.

    Raises:
        HTTPException: 400 for an unknown kind, 404 when the row is missing.
    """
    if kind not in _TABLE_MAP:
        raise HTTPException(400, "kind must be klub|savez|sportas")
    table, allowed = _TABLE_MAP[kind]

    # `sources` may come straight from a request body, so tolerate junk entries.
    src_list = [s for s in (sources or []) if isinstance(s, dict)]
    source_names = [str(s['source']) for s in src_list if s.get('source')]
    source_urls = [s['url'] for s in src_list if s.get('url')]

    conn = _db()
    try:
        # _db() hands out autocommit connections; turn that off so the
        # SELECT ... FOR UPDATE lock spans the UPDATE and the log INSERT.
        conn.autocommit = False
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT * FROM {table} WHERE id=%s FOR UPDATE", (eid,))
            before = cur.fetchone()
            if not before:
                raise HTTPException(404, kind + " not found")
            before = dict(before)

            sets, params, applied = [], [], {}
            for k, v in (fields or {}).items():
                if k not in allowed:
                    continue  # column names are interpolated, so whitelist strictly
                if v is None or str(v).strip() == '':
                    continue
                if before.get(k):
                    continue  # never overwrite existing
                sets.append(f"{k} = %s")
                params.append(v)
                applied[k] = v

            meta_in = before.get('metadata') or {}
            if not isinstance(meta_in, dict):
                meta_in = {}
            now_iso = datetime.now(timezone.utc).isoformat()
            meta_in['enriched_at'] = now_iso
            meta_in['enrichment_source'] = source_names
            history = meta_in.get('enrichment_history')
            if not isinstance(history, list):
                history = []
            history.append({
                'at': now_iso,
                'fields': list(applied),
                'sources': source_names,
                'urls': source_urls,
                'user': user_email,
            })
            meta_in['enrichment_history'] = history[-10:]
            sets.append("metadata = %s::jsonb")
            params.append(json.dumps(meta_in, ensure_ascii=False, default=str))

            params.append(eid)
            cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
            after = dict(cur.fetchone())

            logged_keys = list(applied) + ['metadata']
            cur.execute(
                """INSERT INTO pgz_sport.enrichment_log
                   (kind, target_id, source, url, fields_set, before_jsonb, after_jsonb, user_email)
                   VALUES (%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s)""",
                (kind, eid,
                 ','.join(source_names)[:120] if source_names else None,
                 (src_list[0].get('url') if src_list else None),
                 list(applied) or None,
                 json.dumps({k: before.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 json.dumps({k: after.get(k) for k in logged_keys},
                            ensure_ascii=False, default=str),
                 user_email))
    finally:
        # `with conn` commits or rolls back but does not close the connection.
        conn.close()

    snap_keys = ('id', 'naziv', 'ime', 'prezime', 'web', 'email', 'telefon',
                 'opis_djelatnosti', 'biografija', 'metadata')
    return {'applied': applied,
            'after': {k: after.get(k) for k in snap_keys if k in after}}
|
|
|
|
|
|
@router.post("/enrich/{kind}/{eid}/apply")
def enrich_apply(kind: str, eid: int,
                 body: dict = Body(default=None),
                 x_user_email: Optional[str] = Header(default=None),
                 x_user_id: Optional[int] = Header(default=None)):
    """Apply enrichment to a row and record the change in the audit log.

    With no body, or an empty ``fields``, the preview pipeline is re-run and
    every proposed field is applied.  With ``{"fields": {...}}`` only those
    fields are applied; the column whitelist and the never-overwrite rule in
    ``_apply_to_db`` still hold.  ``sources`` from the body is passed through.

    A ``sys_audit`` entry is written when at least one field was applied.
    Auditing is best-effort: a failure there is logged but does not fail the
    request, because the data change has already been committed.

    Raises:
        HTTPException: 400 when ``fields`` is not an object or ``sources``
            is not a list; 400/404 from the row lookup.
    """
    body = body or {}
    fields = body.get('fields')
    sources = body.get('sources')
    # Reject malformed shapes up front instead of failing deep inside the write.
    if fields is not None and not isinstance(fields, dict):
        raise HTTPException(400, "fields must be an object")
    if sources is not None and not isinstance(sources, list):
        raise HTTPException(400, "sources must be a list")

    if not fields:
        row = _load_row(kind, eid)
        if kind == 'klub':
            res = _propose_for_klub(row)
        elif kind == 'savez':
            res = _propose_for_savez(row)
        else:
            res = _propose_for_sportas(row)
        fields = res['proposed']
        sources = res['sources']

    out = _apply_to_db(kind, eid, fields or {}, sources or [], x_user_email)

    # R4-A3: write to pgz_sport.sys_audit so the audit page sees enrichment events
    if out.get('applied'):
        try:
            from audit_seal_router import audit_log as _audit_log
            _audit_log(
                action='enrich.apply',
                target_type=kind,
                target_id=eid,
                payload={'applied': out.get('applied'),
                         'sources': [s.get('url') for s in (sources or []) if isinstance(s, dict)]},
                user_id=x_user_id,
                user_email=x_user_email,
            )
        except Exception:
            # Keep the request successful, but leave a trace of the failure.
            import logging
            logging.getLogger(__name__).exception(
                "enrich.apply: sys_audit write failed for %s %s", kind, eid)
    return {'kind': kind, 'id': eid, **out}
|
|
|
|
|
|
@router.get("/enrich/log")
def enrich_log(kind: Optional[str] = None, target_id: Optional[int] = None, limit: int = 50):
    """Return recent ``pgz_sport.enrichment_log`` entries, newest first.

    Args:
        kind: optional filter on the ``kind`` column.
        target_id: optional filter on the ``target_id`` column.
        limit: maximum number of rows, clamped to 1..200 (0 means 50).  The
            lower bound keeps a negative value from reaching SQL as
            ``LIMIT -n``, which PostgreSQL rejects.

    Returns:
        ``{'count': n, 'rows': [...]}`` with ``created_at`` as ISO strings.
    """
    where, params = [], []
    if kind:
        where.append("kind=%s")
        params.append(kind)
    if target_id:
        where.append("target_id=%s")
        params.append(target_id)
    sql = ("SELECT id, kind, target_id, source, url, fields_set, user_email, created_at "
           "FROM pgz_sport.enrichment_log "
           + ("WHERE " + " AND ".join(where) + " " if where else "")
           + "ORDER BY id DESC LIMIT %s")
    params.append(max(1, min(int(limit or 50), 200)))

    conn = _db()
    try:
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, params)
            rows = [dict(r) for r in cur.fetchall()]
    finally:
        # `with conn` ends the transaction but does not close the connection.
        conn.close()

    for r in rows:
        if r.get('created_at'):
            r['created_at'] = r['created_at'].isoformat()
    return {'count': len(rows), 'rows': rows}
|
|
|
|
|
|
# ─── R3B M2 — SEARCH SUGGEST (autocomplete for Mreža) ───────────────────
|
|
@router.get("/search/suggest")
def search_suggest(q: str = '', type: str = '', limit: int = 10):
    """
    Autocomplete suggestions for the Mreža search inputs.

    q      search text; fewer than 2 characters (after trimming) yields no
           results. The text is matched literally as a case-insensitive
           substring — `%`, `_` and `\\` in it are not treated as wildcards.
    type ∈ {person, club, company, ''} — empty means all.
           'club' searches klubovi and savezi, 'person' searches clanovi and
           civic.persons, 'company' searches civic.entities.
           (The name shadows the builtin but is the public query parameter.)
    limit  per-query row limit, clamped to 1..50; the combined result list is
           cut to at most 2 * limit entries.

    Returns: {query, results: [{id, label, type, sub}]}
    """
    q = (q or '').strip()
    if len(q) < 2:
        return {'query': q, 'results': []}
    limit = max(1, min(50, int(limit)))

    # Escape LIKE metacharacters (backslash is PostgreSQL's default escape
    # character) so that input such as "%%" cannot match every row.
    literal = q.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    pattern = '%' + literal + '%'

    out = []
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        if type in ('', 'club'):
            cur.execute("""
                SELECT id, naziv AS label, sport, grad
                FROM pgz_sport.klubovi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'klub:' + str(r['id']), 'label': r['label'], 'type': 'club',
                            'sub': (r.get('sport') or '') + ' · ' + (r.get('grad') or '')})

            cur.execute("""
                SELECT id, naziv AS label, sport
                FROM pgz_sport.savezi
                WHERE naziv ILIKE %s AND aktivan=TRUE
                ORDER BY length(naziv), naziv LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'savez:' + str(r['id']), 'label': r['label'], 'type': 'savez',
                            'sub': r.get('sport') or 'savez'})

        if type in ('', 'person'):
            cur.execute("""
                SELECT c.id, c.ime, c.prezime, c.sport, k.naziv AS klub_naziv
                FROM pgz_sport.clanovi c
                LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                WHERE (COALESCE(c.ime,'') || ' ' || COALESCE(c.prezime,'')) ILIKE %s
                ORDER BY length(COALESCE(c.ime,'')||COALESCE(c.prezime,'')), c.prezime
                LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                # Append the club name only when the member is linked to one.
                klub_suffix = (' · ' + r['klub_naziv']) if r.get('klub_naziv') else ''
                out.append({'id': 'sportas:' + str(r['id']),
                            'label': (r.get('ime') or '') + ' ' + (r.get('prezime') or ''),
                            'type': 'person',
                            'sub': (r.get('sport') or 'sportaš') + klub_suffix})

            cur.execute("""
                SELECT id, name AS label, function, oib, county
                FROM civic.persons
                WHERE name ILIKE %s
                ORDER BY oib NULLS LAST, length(name) LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'civic_person:' + str(r['id']),
                            'label': r['label'], 'type': 'person',
                            'sub': (r.get('function') or 'civic') + ' · ' + (r.get('county') or '')})

        if type in ('', 'company'):
            cur.execute("""
                SELECT id, name AS label, oib, city, entity_type
                FROM civic.entities
                WHERE name ILIKE %s
                ORDER BY length(name) LIMIT %s
            """, (pattern, limit))
            for r in cur.fetchall():
                out.append({'id': 'civic_entity:' + str(r['id']),
                            'label': r['label'], 'type': 'company',
                            'sub': (r.get('entity_type') or 'tvrtka') + ' · ' + (r.get('city') or '')})

    return {'query': q, 'results': out[:limit * 2]}
|
|
|
|
|
|
# ─── R3B M3 — FORENSIC ENRICH (Wikipedia scrape + persist) ──────────────
|
|
def _finding_name_candidates(finding: dict) -> list:
    """
    Collect person-name candidates for a forensic finding, in priority order.

    Names are read from `entities_involved` (dict or list form); only when
    that yields nothing is the first capitalised "Ime Prezime" pair in the
    title used. Every candidate is converted to str, trimmed, and dropped if
    empty or already seen, so duplicates cannot use up the caller's small
    lookup budget.
    """
    names: list = []
    seen = set()

    def add(value: Any) -> None:
        text = str(value).strip()
        if text and text not in seen:
            seen.add(text)
            names.append(text)

    involved = finding.get('entities_involved')
    if isinstance(involved, dict):
        for key in ('person', 'name', 'osoba', 'PEP', 'pep'):
            if involved.get(key):
                add(involved[key])
        # Also try persons: [...] list. A bare string or dict is treated as a
        # single entry rather than being iterated character by character.
        people = involved.get('persons') or involved.get('osobe') or []
        if not isinstance(people, (list, tuple)):
            people = [people]
        for person in people:
            if isinstance(person, dict):
                if person.get('name'):
                    add(person['name'])
            elif isinstance(person, str):
                add(person)
    elif isinstance(involved, list):
        for item in involved:
            if isinstance(item, dict):
                # Only the first populated key of each entry is used.
                for key in ('name', 'person', 'label'):
                    if item.get(key):
                        add(item[key])
                        break
            elif isinstance(item, str):
                add(item)

    if not names and finding.get('title'):
        # Heuristic: extract first capitalised "Ime Prezime" pair
        m = re.search(r'\b([A-ZŠĐČĆŽ][a-zšđčćž]{2,})\s+([A-ZŠĐČĆŽ][a-zšđčćž]{2,})',
                      str(finding['title']))
        if m:
            add(m.group(0))
    return names


@router.post("/forensic/findings/{finding_id}/enrich")
def enrich_forensic(finding_id: int):
    """
    Enrich one forensic finding with an external summary and persist it.

    Steps:
      1. Load the finding from civic.forensic_findings (404 if missing).
      2. Derive person-name candidates from entities_involved, falling back
         to the title.
      3. Call `_wiki_summary` for up to the first three candidates and keep
         the first truthy result.
      4. Store {queried, used_query, wiki, enriched_at} under
         raw_data['enrichment'] (a non-dict raw_data is preserved under
         '_legacy'), and fill ai_analysis with the summary's 'extract' only
         when ai_analysis is currently NULL.

    Returns {finding_id, queried, used_query, wiki, persisted}.
    """
    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, finding_type, severity, title, description, entities_involved,
                   raw_data, ai_analysis
            FROM civic.forensic_findings WHERE id=%s
        """, (finding_id,))
        f = cur.fetchone()
        if not f:
            raise HTTPException(404, "finding not found")
        f = dict(f)

        # Derive person name candidates
        candidates = _finding_name_candidates(f)

        wiki = None
        used_query = None
        for q in candidates[:3]:
            wiki = _wiki_summary(q)
            if wiki:
                used_query = q
                break

        # Build enrichment payload
        enrichment = {
            'queried': candidates[:5],
            'used_query': used_query,
            'wiki': wiki,
            'enriched_at': datetime.now(timezone.utc).isoformat(),
        }

        # Persist into raw_data.enrichment
        raw = f.get('raw_data')
        if raw is None:
            raw = {}
        if not isinstance(raw, dict):
            raw = {'_legacy': raw}
        raw['enrichment'] = enrichment

        cur.execute("""
            UPDATE civic.forensic_findings
               SET raw_data = %s::jsonb,
                   ai_analysis = COALESCE(ai_analysis, %s)
             WHERE id = %s
        """, (json.dumps(raw, default=str, ensure_ascii=False),
              # NOTE(review): assumes _wiki_summary returns a dict (or a falsy value).
              (wiki or {}).get('extract'),
              finding_id))
        c.commit()

    return {
        'finding_id': finding_id,
        'queried': candidates[:5],
        'used_query': used_query,
        'wiki': wiki,
        'persisted': True,
    }
|
|
|
|
|
|
# ─── R3B P4 — FORENSIC SCAN (kept from prior version) ───────────────────
|
|
@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """
    Search civic.persons by name and attach entity links, findings and a score.

    Request body: {"name": "<at least 3 characters>"}. A missing, non-string
    or too-short name is rejected with HTTP 400. The name is matched literally
    as a case-insensitive substring (LIKE wildcards in it are escaped); at
    most 25 persons are returned.

    For each matched person:
      links     rows from civic.person_entity_links for the person's OIB
                (only when an OIB is present, up to 50)
      findings  forensic findings whose entities_involved text contains the
                OIB; if there are none, findings whose title or description
                contains the person's name (up to 30 either way)
      risk_score = min(100, 30 if function + 15 if party
                            + min(40, 5 * links) + min(40, 10 * findings)
                            + 20 per CRITICAL/HIGH finding)

    Returns {query, matched_persons, overall_risk_score, total_links,
    total_findings, critical_findings, persons, scanned_at}.
    """
    raw_name = req.get('name')
    # A non-string value (e.g. a number) is treated like a missing name so the
    # caller gets the documented 400 instead of an unhandled AttributeError.
    name = raw_name.strip() if isinstance(raw_name, str) else ''
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")

    def _contains(text: Any) -> str:
        """Build a %...% pattern with LIKE metacharacters escaped (backslash escape)."""
        escaped = str(text).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        return '%' + escaped + '%'

    with _db() as c, c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT id, name, function, party, county, city, oib, trust_tier
            FROM civic.persons
            WHERE upper(name) ILIKE upper(%s)
            ORDER BY oib NULLS LAST, id LIMIT 25
        """, (_contains(name),))
        persons = [dict(r) for r in cur.fetchall()]

        for p in persons:
            p['links'] = []
            p['findings'] = []
            if p.get('oib'):
                cur.execute("""
                    SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                           e.entity_type, e.city, e.risk_score
                    FROM civic.person_entity_links pel
                    LEFT JOIN civic.entities e ON e.id = pel.entity_id
                    WHERE pel.person_oib = %s LIMIT 50
                """, (p['oib'],))
                p['links'] = [dict(r) for r in cur.fetchall()]

                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE entities_involved::text ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (_contains(p['oib']),))
                p['findings'] = [dict(r) for r in cur.fetchall()]

            if not p['findings']:
                # Fallback for persons without an OIB or without OIB matches:
                # look for the name in the finding's free text.
                name_pattern = _contains(p['name'])
                cur.execute("""
                    SELECT id, finding_type, severity, title, severity_score, created_at
                    FROM civic.forensic_findings
                    WHERE title ILIKE %s OR description ILIKE %s
                    ORDER BY severity_score DESC, created_at DESC LIMIT 30
                """, (name_pattern, name_pattern))
                p['findings'] = [dict(r) for r in cur.fetchall()]

    severe_levels = ('CRITICAL', 'HIGH')
    total_links = total_findings = crit_findings = 0
    for p in persons:
        links = p.get('links') or []
        findings = p.get('findings') or []
        severe = sum(1 for f in findings if f.get('severity') in severe_levels)

        total_links += len(links)
        total_findings += len(findings)
        crit_findings += severe

        score = 0
        if (p.get('function') or '').strip():
            score += 30
        if (p.get('party') or '').strip():
            score += 15
        score += min(40, len(links) * 5)
        score += min(40, len(findings) * 10)
        score += 20 * severe
        p['risk_score'] = min(100, score)

    overall = max((p.get('risk_score', 0) for p in persons), default=0)
    return {'query': name, 'matched_persons': len(persons),
            'overall_risk_score': overall, 'total_links': total_links,
            'total_findings': total_findings, 'critical_findings': crit_findings,
            'persons': persons, 'scanned_at': int(time.time())}
|