492c8fdd87
- auth/auth_v2.py: JWT login/refresh/logout/me + bcrypt + tenant_id/role/tier claims - auth/admin_users.py: /api/admin/users CRUD + invite/role/suspend + bulk CSV - auth/gdpr.py: cookie consent + Art.20 export + Art.17 erasure + admin queue - auth/seed_demo.py: 3 demo tenants + 4 users (damir@pgz.hr / PGZ2026!) - Removed legacy /api/auth/login + /api/auth/me from pgz_sport_api.py - Wired auth/admin/gdpr routers into FastAPI 5/5 live curl tests pass: damir@pgz.hr login → JWT with tenant_id=1, role=pgz_admin, tier=0
311 lines
13 KiB
Python
311 lines
13 KiB
Python
"""
|
|
enrich_router.py — Round-2/3B enrichment + forensic-scan endpoints
Author: dradulic@outlook.com   Date: 2026-05-04 (R2), 2026-05-05 (R3B)

Surfaces "Obogati podatke" ("Enrich data") buttons for klubovi, savezi, sportasi,
plus the Forenzika "Pokreni novu analizu" ("Start new analysis") scan endpoint
that searches civic.*.

Strategy:
  1) Read what's already in DB and surface fields the frontend may not have shown.
  2) Build curated research URLs (Google, Wikipedia HR, Sportilus, sport-pgz.hr,
     HNS Semafor) so the operator can verify or expand by hand.
  3) If the entity has a `web` URL set, quickly fetch the page and extract
     <title> + <meta description> to return as a "live snippet". 5s timeout, fail-soft.
  4) /forensic/scan — match name across civic.persons, return entity links,
     forensic_findings hits, and a synthesised risk score.
  5) /enrich/{kind}/{id}/apply — fetch best web source for entity and UPDATE the
     row's web/email/telefon fields (biografija/profile_url for sportasi) when missing.
|
|
"""
|
|
import os, re, json, time, urllib.parse, urllib.request, html
|
|
import psycopg2, psycopg2.extras
|
|
from fastapi import APIRouter, HTTPException, Body
|
|
|
|
# Router that this module's endpoints register on via the @router.post decorators.
router = APIRouter()
|
|
|
|
# Postgres host/port, read from the environment with 10.10.0.2:6432 as fallback.
_pgh = os.environ.get('PG_HOST', '10.10.0.2')
_pgp = int(os.environ.get('PG_PORT', '6432'))

# pgz-sport.service inherits PG_HOST=localhost:5432 from /opt/.env.rinet, which is
# wrong (local PG is disabled). When the env says localhost, switch to the
# DB_HOST/DB_PORT pair instead (Server B DSN by default).
if _pgh == 'localhost' or _pgh == '127.0.0.1':
    _pgh = os.environ.get('DB_HOST', '10.10.0.2')
    _pgp = int(os.environ.get('DB_PORT', '6432'))

# Keyword arguments that _db() passes to psycopg2.connect().
# NOTE(review): the PG_PASS fallback is a real-looking credential committed to
# source — consider rotating it and requiring the environment variable.
DB = {
    'host': _pgh,
    'port': _pgp,
    'dbname': os.environ.get('PG_DB', 'rinet_v3'),
    'user': os.environ.get('PG_USER', 'rinet'),
    'password': os.environ.get('PG_PASS', 'R1net2026!SecureDB#v7'),
}

# User-Agent header sent with outbound page fetches.
UA = 'pgz-sport-enrich/2.0'
|
|
|
|
def _db():
    """Open a new psycopg2 connection from ``DB`` with autocommit switched on."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn
|
|
|
|
def _fetch_one(sql, p):
    """Run ``sql`` with parameters ``p`` and return the first row as a dict.

    Args:
        sql: SQL text using psycopg2 ``%s`` placeholders.
        p: Sequence of parameter values bound to the placeholders.

    Returns:
        A plain ``dict`` keyed by column name, or ``None`` when the query
        produced no row.
    """
    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, p)
            first = cur.fetchone()
            return dict(first) if first else None
    finally:
        # psycopg2's `with connection:` only ends the transaction; it never
        # closes the connection, so close it explicitly on every path.
        conn.close()
|
|
|
|
def _fetch_title(url, timeout=5):
    """Download ``url`` and extract its <title> and description meta tag.

    Args:
        url: Page address; falsy values and values not starting with 'http'
            yield ``None`` without any network access.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``None`` for a skipped URL; otherwise a dict with 'url', 'title',
        'description' and 'fetched_at' (title/description are ``None`` when no
        tag matched). Any exception is reported as ``{'url', 'error'}`` with
        the message cut to 120 characters instead of being raised.
    """
    if not url:
        return None
    try:
        if not url.startswith('http'):
            return None

        request = urllib.request.Request(url, headers={'User-Agent': UA})
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            # Only the first 40000 bytes are inspected.
            markup = resp.read(40000).decode('utf-8', 'ignore')

        title = None
        title_hit = re.search(r'<title[^>]*>([^<]+)</title>', markup, re.I)
        if title_hit:
            title = html.unescape(title_hit.group(1).strip())[:300]

        # The plain description tag wins; og:description is the fallback.
        desc_hit = (
            re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', markup, re.I)
            or re.search(r'<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']+)["\']', markup, re.I)
        )
        description = None
        if desc_hit:
            description = html.unescape(desc_hit.group(1).strip())[:500]

        return {
            'url': url,
            'title': title,
            'description': description,
            'fetched_at': int(time.time()),
        }
    except Exception as exc:
        return {'url': url, 'error': str(exc)[:120]}
|
|
|
|
def _research_links(naziv, kind, grad=None):
    """Build the list of manual-research links for an entity.

    Args:
        naziv: Entity display name; ``None`` is treated as an empty string.
        kind: 'klub', 'sportas' or 'savez'; any other value gets only the
            three generic links.
        grad: Optional city appended to the search term.

    Returns:
        A list of ``{'label', 'icon', 'url'}`` dicts: three generic search
        links followed by the kind-specific ones.
    """
    term = (naziv or '').strip()
    if grad:
        term = term + ' ' + grad
    encoded = urllib.parse.quote(term)

    def link(label, icon, url):
        return {'label': label, 'icon': icon, 'url': url}

    links = [
        link('Google', '🔍', 'https://www.google.com/search?q=' + encoded),
        link('Wikipedia HR', '📚', 'https://hr.wikipedia.org/w/index.php?search=' + encoded),
        link('sport-pgz.hr', '🏅', 'https://sport-pgz.hr/?s=' + encoded),
    ]

    per_kind = (
        ('klub', [
            link('Sportilus', '⬡', 'https://www.sportilus.com/?s=' + encoded),
            link('Sudski registar', '⚖', 'https://sudreg.pravosudje.hr/registar/oc/index.html'),
        ]),
        ('sportas', [
            link('HNS Semafor', '⚽', 'https://semafor.hns.family/?s=' + encoded),
            link('transfermarkt', '⚽', 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + encoded),
        ]),
        ('savez', [
            link('sport-pgz.hr savezi', '🏅', 'https://sport-pgz.hr/savezi'),
        ]),
    )
    for wanted, extras in per_kind:
        if kind == wanted:
            links.extend(extras)
    return links
|
|
|
|
@router.post("/enrich/{kind}/{eid}")
def enrich(kind: str, eid: int):
    """Report how complete an entity's record is and where to research it.

    Args:
        kind: 'klub', 'savez' or 'sportas'.
        eid: Primary key of the row in the matching pgz_sport table.

    Returns:
        A dict with the display name, a coverage percentage over a fixed list
        of key fields, the names of the empty fields, a live snippet fetched
        from the row's first available URL (or ``None``), research links and
        a timestamp.

    Raises:
        HTTPException: 400 for an unknown ``kind``, 404 when the row is missing.
    """
    if kind not in ('klub', 'savez', 'sportas'):
        raise HTTPException(400, "kind must be klub|savez|sportas")

    if kind == 'klub':
        row = _fetch_one("""SELECT id, naziv, oib, sport, grad, predsjednik, tajnik,
                                   web, web_stranica, email, telefon, ciljevi, opis_djelatnosti,
                                   sjediste, godina_osnutka, savez_id, scrape_url, source_url
                            FROM pgz_sport.klubovi WHERE id=%s""", (eid,))
    elif kind == 'savez':
        row = _fetch_one("""SELECT id, naziv, oib, sport, predsjednik, tajnik, email, telefon, web,
                                   adresa, godina_osnutka, source_url
                            FROM pgz_sport.savezi WHERE id=%s""", (eid,))
    else:  # sportas
        row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url, scrape_url,
                                   slika_url, source_url, hns_igrac_id, biografija
                            FROM pgz_sport.clanovi WHERE id=%s""", (eid,))
    if not row:
        raise HTTPException(404, kind+" not found")

    # Build display name
    if kind == 'sportas':
        # A NULL column comes back as a present key holding None, so a
        # .get(key, '') default does not apply; `or ''` avoids None + str.
        naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
        grad = None
    else:
        naziv = row.get('naziv', '')
        grad = row.get('grad') if kind == 'klub' else None

    # Live web snippet from the first URL-like column that is set
    primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
               or row.get('scrape_url') or row.get('profile_url'))
    snippet = _fetch_title(primary) if primary else None

    # Coverage score: how many key fields are filled?
    if kind == 'klub':
        keys = ['oib', 'sport', 'grad', 'predsjednik', 'tajnik', 'web', 'email',
                'telefon', 'sjediste', 'godina_osnutka', 'ciljevi']
    elif kind == 'savez':
        keys = ['oib', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon', 'web',
                'adresa', 'godina_osnutka']
    else:
        keys = ['sport', 'profile_url', 'slika_url', 'hns_igrac_id', 'biografija']

    # Suggested missing fields; everything else counts as filled
    missing = [k for k in keys if not row.get(k)]
    filled = len(keys) - len(missing)
    coverage = round(filled/len(keys)*100)

    return {
        'kind': kind,
        'id': eid,
        'naziv': naziv,
        'coverage': coverage,
        'filled_fields': filled,
        'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': snippet,
        'research_links': _research_links(naziv, kind, grad),
        'enriched_at': int(time.time()),
    }
|
|
|
|
|
|
# ── R3B P4 — FORENSIC SCAN ──────────────────────────────────────────
|
|
@router.post("/forensic/scan")
def forensic_scan(req: dict = Body(...)):
    """
    Search civic.persons by name. For each match, gather entities, person
    role, forensic_findings count, and synthesise a risk score.
    Body: {"name": "Velimir Liverić"}

    Returns:
        A dict with the query, the number of matched persons (at most 25),
        the highest per-person risk score, totals for links/findings/critical
        findings, the enriched person rows and a timestamp.

    Raises:
        HTTPException: 400 when ``name`` is missing, not a string, or shorter
            than 3 characters after stripping.
    """
    raw_name = req.get('name') if isinstance(req, dict) else None
    # A non-string name (number, list, ...) is rejected like a too-short one
    # instead of crashing on .strip().
    name = raw_name.strip() if isinstance(raw_name, str) else ''
    if len(name) < 3:
        raise HTTPException(400, "name must be at least 3 chars")

    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT id, name, function, party, county, city, oib, trust_tier
                FROM civic.persons
                WHERE upper(name) ILIKE upper(%s)
                ORDER BY oib NULLS LAST, id
                LIMIT 25
            """, ('%'+name+'%',))
            persons = [dict(r) for r in cur.fetchall()]

            # For each person collect entity links via OIB
            for p in persons:
                p['links'] = []
                p['findings'] = []
                if p.get('oib'):
                    cur.execute("""
                        SELECT pel.entity_id, pel.roles, e.name AS entity_name, e.oib AS entity_oib,
                               e.entity_type, e.city, e.risk_score
                        FROM civic.person_entity_links pel
                        LEFT JOIN civic.entities e ON e.id = pel.entity_id
                        WHERE pel.person_oib = %s
                        LIMIT 50
                    """, (p['oib'],))
                    p['links'] = [dict(r) for r in cur.fetchall()]
                    # Forensic findings JSONB containing this OIB
                    cur.execute("""
                        SELECT id, finding_type, severity, title, severity_score, created_at
                        FROM civic.forensic_findings
                        WHERE entities_involved::text ILIKE %s
                        ORDER BY severity_score DESC, created_at DESC
                        LIMIT 30
                    """, ('%'+p['oib']+'%',))
                    p['findings'] = [dict(r) for r in cur.fetchall()]
                # Also search forensic_findings by name when the OIB search
                # found nothing (or the person has no OIB)
                if not p['findings']:
                    cur.execute("""
                        SELECT id, finding_type, severity, title, severity_score, created_at
                        FROM civic.forensic_findings
                        WHERE title ILIKE %s OR description ILIKE %s
                        ORDER BY severity_score DESC, created_at DESC
                        LIMIT 30
                    """, ('%'+p['name']+'%', '%'+p['name']+'%'))
                    p['findings'] = [dict(r) for r in cur.fetchall()]
    finally:
        # psycopg2's `with connection:` does not close the connection, so it
        # is closed explicitly here on success and on error.
        conn.close()

    # Synthesise risk score per person and overall
    total_links = 0
    total_findings = 0
    crit_findings = 0
    for p in persons:
        links = p.get('links') or []
        findings = p.get('findings') or []
        crit = sum(1 for f in findings if f.get('severity') in ('CRITICAL', 'HIGH'))
        total_links += len(links)
        total_findings += len(findings)
        crit_findings += crit

        # per-person risk: +30 if function set (PEP-like), +15 if party set,
        # +5 per link (max 40), +10 per finding (max 40), +20 per
        # CRITICAL/HIGH finding; the total is capped at 100.
        score = 0
        if (p.get('function') or '').strip():
            score += 30
        if (p.get('party') or '').strip():
            score += 15
        score += min(40, len(links)*5)
        score += min(40, len(findings)*10)
        score += 20*crit
        p['risk_score'] = min(100, score)

    overall = max((p.get('risk_score', 0) for p in persons), default=0)

    return {
        'query': name,
        'matched_persons': len(persons),
        'overall_risk_score': overall,
        'total_links': total_links,
        'total_findings': total_findings,
        'critical_findings': crit_findings,
        'persons': persons,
        'scanned_at': int(time.time()),
    }
|
|
|
|
|
|
# ── R3B P6 — ENRICH /apply (write enriched fields back to DB) ───────
|
|
@router.post("/enrich/{kind}/{eid}/apply")
def enrich_apply(kind: str, eid: int, req: dict = Body(default={})):
    """
    Apply enrichment to DB. Body may contain {fields: {web, email, telefon}}
    to override the auto-derived suggestions; otherwise we apply derived ones.
    Only updates fields that are currently NULL or empty in DB (additive only).

    Args:
        kind: 'klub', 'savez' or 'sportas'.
        eid: Primary key of the row to enrich.
        req: Optional JSON body; only its 'fields' object is read.

    Returns:
        A dict with 'proposed' (derived values overlaid with body values),
        'applied' (what was written), 'skipped_existing' (proposed keys that
        were not written) and a timestamp.

    Raises:
        HTTPException: 400 for an unknown ``kind`` or a non-object 'fields',
            404 when the row is missing.
    """
    if kind not in ('klub', 'savez', 'sportas'):
        raise HTTPException(400, "kind must be klub|savez|sportas")
    body_fields = (req.get('fields') if isinstance(req, dict) else {}) or {}
    if not isinstance(body_fields, dict):
        # A list or string here would otherwise crash on .items() below.
        raise HTTPException(400, "fields must be an object")

    # table and cols come from this fixed whitelist, never from the request,
    # which is what makes interpolating them into SQL below safe.
    if kind == 'klub':
        table = 'pgz_sport.klubovi'
        cols = ['web', 'email', 'telefon']
    elif kind == 'savez':
        table = 'pgz_sport.savezi'
        cols = ['web', 'email', 'telefon']
    else:
        table = 'pgz_sport.clanovi'
        cols = ['biografija', 'profile_url']

    # File extensions that mark an "email-shaped" match as an asset name,
    # e.g. the retina image "logo@2x.png".
    asset_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')

    conn = _db()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
            row = cur.fetchone()
            if not row:
                raise HTTPException(404, kind+" not found")
            row = dict(row)

            # Try a live fetch from primary URL to glean email/phone
            primary = (row.get('web') or row.get('web_stranica') or row.get('source_url')
                       or row.get('scrape_url') or row.get('profile_url'))
            derived = {}
            # Single download: only http(s)-looking URLs are fetched.
            if isinstance(primary, str) and primary.startswith('http'):
                try:
                    req2 = urllib.request.Request(primary, headers={'User-Agent': UA})
                    with urllib.request.urlopen(req2, timeout=6) as r:
                        page = r.read(80000).decode('utf-8', 'ignore')
                    for em in re.finditer(r'[\w\.-]+@[\w\.-]+\.[a-z]{2,8}', page, re.I):
                        candidate = em.group(0)
                        # Writes are additive-only, so a bogus value would
                        # block the real address later; skip asset names.
                        if not candidate.lower().endswith(asset_suffixes):
                            derived['email'] = candidate
                            break
                    tel = re.search(r'\+?385[\s\-]?\d[\d\s\-/]{6,}', page)
                    if tel:
                        derived['telefon'] = re.sub(r'\s+', ' ', tel.group(0).strip())
                except Exception:
                    # Deliberately best-effort: an unreachable or malformed
                    # page must not fail the request; body fields still apply.
                    pass

            # Merge: body fields override derived
            proposed = dict(derived)
            for k, v in body_fields.items():
                if k in cols and v:
                    proposed[k] = v

            # Only apply where DB currently empty
            applied = {
                k: v for k, v in proposed.items()
                if k in cols and (row.get(k) is None or row.get(k) == '')
            }

            if applied:
                sets = ', '.join(f"{k}=%s" for k in applied)
                params = list(applied.values()) + [eid]
                cur.execute(f"UPDATE {table} SET {sets} WHERE id=%s", params)
                conn.commit()
    finally:
        # psycopg2's `with connection:` does not close the connection, so it
        # is closed explicitly here, including on the 404 path.
        conn.close()

    return {
        'kind': kind, 'id': eid,
        'proposed': proposed,
        'applied': applied,
        'skipped_existing': [k for k in proposed if k not in applied],
        'applied_at': int(time.time()),
    }
|