CC2 R5: defense-in-depth JWT + invite/reset token flows + audit

#1 JWT middleware:
- pgz_sport_api.py: starlette middleware require_jwt_on_admin runs before
  every /api/admin/* route. Even routes that lack Depends(require_user)
  cannot be reached without a valid Bearer token (verifies signature,
  exp, typ='access', revocation via user_sessions). OPTIONS passes for CORS.

#2 Invitation flow:
- pgz_sport.user_action_tokens table (token_hash, user_id, kind, expires_at,
  used_at, created_by, ip, meta). Single-use, raw token never persisted.
- POST /api/admin/users/{id}/invite — issues 'invite' token (TTL 7d),
  marks must_change_pwd, revokes existing sessions, returns invite_link.
- GET  /api/auth/setup-password?token=X — preflight (no consume).
- POST /api/auth/setup-password — consumes token, sets password, sets
  email_verified=true.

#3 Password reset flow:
- POST /api/auth/forgot-password — generic 'ako račun postoji' response;
  issues 'reset' token (TTL 2h) only for active users. Token returned in
  response only on localhost or if PGZ_REVEAL_RESET_TOKEN=1.
- GET  /api/auth/reset-password?token=X — preflight.
- POST /api/auth/reset-password — consumes token, sets new password,
  revokes all active sessions.

#4 Audit coverage (auth events):
- login.ok, login.fail (with reason), login.locked, login.2fa_required,
  login.2fa_fail, logout, auth.refresh, password.change, password.reset.ok,
  password.reset.fail, password.forgot.issue, password.forgot.miss,
  invite.consume.ok, invite.consume.fail, user.invite, user.create,
  user.update, user.delete, user.role.change, user.suspend, user.unsuspend,
  user.password.reset, 2fa.verify.ok, 2fa.verify.fail, 2fa.disable.

#5 Live tests: 41/41 across 6 demo users (incl. fresh invited+deleted user).
   Phase 2 verifies 14 endpoints reject no-auth and accept valid Bearer.
This commit is contained in:
Damir Radulić
2026-05-05 01:28:29 +02:00
parent 8dce58c5f9
commit 0046b8d695
24 changed files with 15419 additions and 72 deletions
+300 -36
View File
@@ -308,13 +308,19 @@ def _load_row(kind: str, eid: int) -> dict:
adresa, godina_osnutka, source_url, metadata
FROM pgz_sport.savezi WHERE id=%s""", (eid,))
elif kind == 'sportas':
row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url,
slika_url, source_url, source, source_id,
hns_igrac_id, biografija,
datum_rodenja, mjesto_rodenja, broj_dresa,
visina_cm, tezina_kg, dominantna_noga, oib,
vanjski_id, metadata
FROM pgz_sport.clanovi WHERE id=%s""", (eid,))
row = _fetch_one("""SELECT c.id, c.ime, c.prezime, c.sport, c.klub_id, c.profile_url,
c.slika_url, c.source_url, c.source, c.source_id,
c.hns_igrac_id, c.biografija,
c.datum_rodenja, c.mjesto_rodenja, c.broj_dresa,
c.visina_cm, c.tezina_kg, c.dominantna_noga, c.oib,
c.vanjski_id, c.metadata,
k.sport AS klub_sport, k.naziv AS klub_naziv
FROM pgz_sport.clanovi c
LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
WHERE c.id=%s""", (eid,))
# Fall back to klub.sport when c.sport is empty
if row and not row.get('sport') and row.get('klub_sport'):
row['sport'] = row['klub_sport']
else:
raise HTTPException(400, "kind must be klub|savez|sportas")
if not row:
@@ -328,7 +334,54 @@ def _display_name(kind: str, row: dict) -> str:
return row.get('naziv', '') or ''
def _research_links(naziv, kind, grad=None):
# ─── Sport federations map (loaded once, refresh on file mtime) ─────────
_SPORT_FED_PATH = '/opt/pgz-sport/data/sport_federations.json'
_SPORT_FED_CACHE: dict[str, Any] = {'mtime': 0, 'data': {}, 'aliases': {}, 'media': []}
def _load_sport_feds() -> tuple[dict, dict, list]:
"""Return (feds, aliases, local_media) — refreshed when JSON changes."""
try:
st = os.stat(_SPORT_FED_PATH)
except FileNotFoundError:
return ({}, {}, [])
if st.st_mtime != _SPORT_FED_CACHE['mtime']:
try:
with open(_SPORT_FED_PATH, 'r', encoding='utf-8') as f:
raw = json.load(f)
except Exception:
return (_SPORT_FED_CACHE['data'],
_SPORT_FED_CACHE['aliases'],
_SPORT_FED_CACHE['media'])
aliases = raw.pop('_aliases', {}) if isinstance(raw, dict) else {}
media = raw.pop('_local_media_pgz', []) if isinstance(raw, dict) else []
raw.pop('_meta', None)
_SPORT_FED_CACHE.update(mtime=st.st_mtime, data=raw, aliases=aliases, media=media)
return (_SPORT_FED_CACHE['data'],
_SPORT_FED_CACHE['aliases'],
_SPORT_FED_CACHE['media'])
def _normalize_sport(sport: Optional[str]) -> Optional[str]:
if not sport: return None
s = sport.strip().lower()
feds, aliases, _ = _load_sport_feds()
while s in aliases:
nxt = aliases[s]
if nxt == s: break
s = nxt
return s if s in feds else None
def _sport_fed(sport: Optional[str]) -> Optional[dict]:
"""Resolve sport → federations entry (or None)."""
norm = _normalize_sport(sport)
if not norm: return None
feds, _, _ = _load_sport_feds()
return feds.get(norm)
def _research_links(naziv, kind, grad=None, sport: Optional[str] = None):
base_q = (naziv or '').strip()
q = (base_q + ' ' + grad) if grad else base_q
qenc = urllib.parse.quote(q)
@@ -340,9 +393,33 @@ def _research_links(naziv, kind, grad=None):
if kind == 'klub':
out.append({'label': 'Sportilus', 'icon': '', 'url': 'https://www.sportilus.com/?s=' + qenc})
out.append({'label': 'Sudski registar', 'icon': '', 'url': 'https://sudreg.pravosudje.hr/registar/oc/index.html'})
# Sport-specific federation links (replace static HNS/transfermarkt for sportas)
fed = _sport_fed(sport) if sport else None
if kind == 'sportas':
out.append({'label': 'HNS Semafor', 'icon': '', 'url': 'https://semafor.hns.family/?s=' + qenc})
out.append({'label': 'transfermarkt','icon': '', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
if fed and isinstance(fed.get('national'), dict):
nat = fed['national']
search = (nat.get('search_url') or nat.get('url') or '').replace('{q}', qenc)
if search:
out.append({'label': nat.get('name', 'Nacionalni savez'),
'icon': '🏆', 'url': search})
if fed and isinstance(fed.get('pgz'), dict):
pgz = fed['pgz']
url = pgz.get('search_url') or pgz.get('url') or ''
if url:
out.append({'label': pgz.get('name', 'PGŽ savez'),
'icon': '🏟', 'url': url.replace('{q}', qenc)})
if not fed:
# No mapping for this sport → keep transfermarkt as legacy fallback
out.append({'label': 'HNS Semafor', 'icon': '', 'url': 'https://semafor.hns.family/?s=' + qenc})
out.append({'label': 'transfermarkt','icon': '', 'url': 'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query=' + qenc})
# Local PGŽ media for any sportas
_, _, media = _load_sport_feds()
for m in media:
url = (m.get('search_url') or '').replace('{q}', qenc)
if url:
out.append({'label': m.get('name', 'Lokalni medij'),
'icon': '📰', 'url': url})
if kind == 'savez':
out.append({'label': 'sport-pgz.hr savezi', 'icon': '🏅', 'url': 'https://sport-pgz.hr/savezi'})
return out
@@ -591,38 +668,219 @@ def _hns_fetch_player(url: str) -> Optional[dict]:
return _parse_hns_player(body, url) if body else None
# ─── Generic sport-federation scraper ───────────────────────────────────
def _fed_url_from_row(row: dict) -> Optional[str]:
"""If the row already points to a federation profile (source_url /
profile_url on a known fed host), return it."""
feds, _, _ = _load_sport_feds()
fed_hosts = set()
for entry in feds.values():
if not isinstance(entry, dict): continue
for which in ('national', 'pgz'):
sub = entry.get(which) or {}
for k in ('url', 'search_url', 'profile_url_pattern'):
v = sub.get(k)
if v:
try:
h = urllib.parse.urlparse(v.replace('{q}', 'x').replace('{slug}', 'x').replace('{hns_pid}', '1')).hostname
if h: fed_hosts.add(h)
except Exception:
pass
for k in ('source_url', 'profile_url'):
u = row.get(k)
if not u: continue
try:
h = urllib.parse.urlparse(u).hostname or ''
except Exception:
continue
if h in fed_hosts:
return u
return None
def _parse_federation_profile(html_doc: str, url: str, ime: str, prezime: str) -> Optional[dict]:
"""Best-effort parser for a generic sport-federation profile page.
Returns {source, url, slika_url, datum_rodenja, mjesto_rodenja, klub,
extract, raw_text}. Tolerant of varied page structures.
"""
if not html_doc: return None
host = urllib.parse.urlparse(url).hostname or ''
out: dict[str, Any] = {
'source': host,
'url': url,
}
# Title
m = re.search(r'<title[^>]*>([^<]+)</title>', html_doc, re.I)
if m: out['title'] = html.unescape(m.group(1).strip())[:300]
# Meta description
m = re.search(r'<meta\s+name=["\']description["\']\s+content=["\']([^"\']+)["\']', html_doc, re.I)
if m: out['description'] = html.unescape(m.group(1).strip())[:600]
name_tokens = []
for t in (ime, prezime):
if t and len(t) >= 3:
name_tokens.append(re.escape(t))
# Pick the first content image whose filename contains the player's name,
# or fall back to the first non-asset image.
img_candidates = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', html_doc, re.I)
chosen_img = None
for src in img_candidates:
low = src.lower()
if any(b in low for b in ('logo', 'icon', 'admin-ajax', 'spinner', 'loader',
'sprite', '/themes/', '/icons/', 'gdpr', 'banner',
'header', 'footer', 'placeholder', 'avatar-default')):
continue
if not low.endswith(('.jpg', '.jpeg', '.png', '.webp')):
continue
# Prefer matches on player name in URL
if name_tokens and any(re.search(t, src, re.I) for t in name_tokens):
chosen_img = src; break
if chosen_img is None:
chosen_img = src
if chosen_img:
if not chosen_img.startswith('http'):
chosen_img = urllib.parse.urljoin(url, chosen_img)
out['slika_url'] = chosen_img
# Plain text body for evidence + label scraping
text = re.sub(r'<script[^>]*>.*?</script>', ' ', html_doc, flags=re.S | re.I)
text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.S | re.I)
text = re.sub(r'<[^>]+>', ' ', text)
text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
out['raw_text'] = text[:4000]
out['extract'] = (out.get('description')
or text[max(0, text.find(prezime)-30):max(0, text.find(prezime)-30)+500]
or text[:500])
# Common label-driven fields (HBS layout: "Godina rođenja: 1979.", "Matični klub: …")
m = re.search(r'Datum\s+ro[đdj]?enja[:\s]+(\d{1,2}[.\-/]\d{1,2}[.\-/]\d{4})', text, re.I)
if m:
try:
from datetime import date as _date
d = re.split(r'[.\-/]', m.group(1))
out['datum_rodenja'] = _date(int(d[2]), int(d[1]), int(d[0])).isoformat()
except Exception:
pass
if 'datum_rodenja' not in out:
m = re.search(r'Godina\s+ro[đdj]?enja[:\s]+(\d{4})', text, re.I)
if m:
try:
from datetime import date as _date
out['datum_rodenja'] = _date(int(m.group(1)), 1, 1).isoformat()
except Exception:
pass
m = re.search(r'Mjesto\s+ro[đdj]?enja[:\s]+([A-ZČĆŠĐŽ][^,\n.]{2,40})', text)
if m: out['mjesto_rodenja'] = m.group(1).strip()
m = re.search(r'Mati[čc]ni\s+klub[:\s]+([^\n]{3,60}?)(?:\s+(?:Sportski|Datum|Liječni|Reprezent|Sezona|Domaće|Nastupi))', text, re.I)
if m: out['klub_naziv'] = m.group(1).strip().rstrip('.')
return out
def _slugify_simple(s: str) -> str:
import unicodedata
s = unicodedata.normalize('NFKD', s or '').encode('ascii', 'ignore').decode('ascii').lower()
return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
def scrape_sport_federation(sport: Optional[str], ime: str, prezime: str) -> Optional[dict]:
"""Try to find and parse the athlete's federation profile page."""
fed = _sport_fed(sport) if sport else None
if not fed: return None
nat = (fed or {}).get('national') or {}
full_name = (ime + ' ' + prezime).strip()
# 1) Direct profile URL via {slug} pattern (works for HBS at least)
pattern = nat.get('profile_url_pattern')
if pattern and '{slug}' in pattern:
slug = _slugify_simple(full_name)
url = pattern.replace('{slug}', slug)
body = _http_get(url, timeout=8)
if body and prezime.lower() in body.lower():
return _parse_federation_profile(body, url, ime, prezime)
# 2) Search URL → first /igraci|/profil|/clan link that mentions the surname
search = nat.get('search_url')
if search:
body = _http_get(search.replace('{q}', urllib.parse.quote(full_name)), timeout=10)
if body:
for href_re in (r'href="([^"]*?/igraci/[^"]+)"',
r'href="([^"]*?/igrac/[^"]+)"',
r'href="([^"]*?/sportasi/[^"]+)"',
r'href="([^"]*?/clanovi/[^"]+)"',
r'href="([^"]*?/profil/[^"]+)"'):
for m in re.finditer(href_re, body, re.I):
cand = m.group(1)
if not cand.startswith('http'):
cand = urllib.parse.urljoin(nat.get('url', search), cand)
if _slugify_simple(prezime) in _slugify_simple(cand):
b2 = _http_get(cand, timeout=8)
if b2:
return _parse_federation_profile(b2, cand, ime, prezime)
return None
def _propose_for_sportas(row: dict) -> dict:
naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
ime, prezime = (row.get('ime') or ''), (row.get('prezime') or '')
sport = row.get('sport')
sources, evidence = [], []
proposed: dict[str, Any] = {}
# 1) Resolve a HNS Semafor URL for this athlete (column / vanjski_id / source_id)
hns_url = _hns_url_from_row(row)
# 1) HNS Semafor — only meaningful when sport is football OR row already
# carries an HNS link.
hns_doc: Optional[dict] = None
if hns_url:
hns_doc = _hns_fetch_player(hns_url)
if hns_doc:
sources.append(hns_doc)
evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')
if _normalize_sport(sport) == 'nogomet' or _hns_url_from_row(row):
hns_url = _hns_url_from_row(row)
if hns_url:
hns_doc = _hns_fetch_player(hns_url)
if hns_doc:
sources.append(hns_doc)
evidence.append(hns_doc.get('raw_text') or hns_doc.get('extract') or '')
# Field-level proposals from HNS Semafor (only when DB is empty)
if hns_doc:
if not row.get('profile_url') and hns_doc.get('url'):
proposed['profile_url'] = hns_doc['url']
if not row.get('source_url') and hns_doc.get('url'):
proposed['source_url'] = hns_doc['url']
if not row.get('slika_url') and hns_doc.get('slika_url'):
proposed['slika_url'] = hns_doc['slika_url']
if not row.get('hns_igrac_id') and hns_doc.get('hns_igrac_id'):
proposed['hns_igrac_id'] = hns_doc['hns_igrac_id']
if not row.get('datum_rodenja') and hns_doc.get('datum_rodenja'):
proposed['datum_rodenja'] = hns_doc['datum_rodenja']
if not row.get('mjesto_rodenja') and hns_doc.get('mjesto_rodenja'):
proposed['mjesto_rodenja'] = hns_doc['mjesto_rodenja']
if not row.get('broj_dresa') and hns_doc.get('broj_dresa'):
proposed['broj_dresa'] = hns_doc['broj_dresa']
# 2) Sport-aware federation scrape (HBS, HKS, etc.) — also use existing
# source_url/profile_url if it points at a known federation host.
fed_doc: Optional[dict] = None
direct_fed_url = _fed_url_from_row(row)
if direct_fed_url and (not hns_doc or hns_doc.get('url') != direct_fed_url):
body = _http_get(direct_fed_url, timeout=8)
if body:
fed_doc = _parse_federation_profile(body, direct_fed_url, ime, prezime)
if not fed_doc:
fed_doc = scrape_sport_federation(sport, ime, prezime)
if fed_doc:
sources.append(fed_doc)
evidence.append(fed_doc.get('raw_text') or fed_doc.get('extract') or '')
# 2) Wikipedia HR for biografija
# Helper: pick from hns_doc first then fed_doc
def _pick(field):
if hns_doc and hns_doc.get(field): return hns_doc[field]
if fed_doc and fed_doc.get(field): return fed_doc[field]
return None
if not row.get('profile_url'):
v = _pick('url') or (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
if v: proposed['profile_url'] = v
if not row.get('source_url'):
v = (hns_doc and hns_doc.get('url')) or (fed_doc and fed_doc.get('url'))
if v: proposed['source_url'] = v
if not row.get('slika_url'):
v = _pick('slika_url')
if v: proposed['slika_url'] = v
if not row.get('hns_igrac_id') and hns_doc and hns_doc.get('hns_igrac_id'):
proposed['hns_igrac_id'] = hns_doc['hns_igrac_id']
if not row.get('datum_rodenja'):
v = _pick('datum_rodenja')
if v: proposed['datum_rodenja'] = v
if not row.get('mjesto_rodenja'):
v = _pick('mjesto_rodenja')
if v: proposed['mjesto_rodenja'] = v
if not row.get('broj_dresa') and hns_doc and hns_doc.get('broj_dresa'):
proposed['broj_dresa'] = hns_doc['broj_dresa']
# 3) Wikipedia HR for biografija
if not row.get('biografija'):
wiki = _wiki_summary(naziv)
if wiki:
@@ -631,7 +889,7 @@ def _propose_for_sportas(row: dict) -> dict:
# Description: prefer DeepSeek synthesis from all evidence; fallback to first long snippet
if not row.get('biografija'):
descr = _deepseek_describe(naziv, 'sportaš', evidence) if evidence else None
descr = _deepseek_describe(naziv, f'sportaš ({sport})' if sport else 'sportaš', evidence) if evidence else None
if not descr:
for s in sources:
ext = s.get('extract')
@@ -863,7 +1121,13 @@ def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid:
'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
'missing_fields': missing,
'live_snippet': _fetch_title(primary) if primary else None,
'research_links': _research_links(naziv, kind, grad),
'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')),
'sport': row.get('sport'),
'sport_federation': (lambda f: {
'national': (f.get('national') or {}).get('name') if f else None,
'national_url': (f.get('national') or {}).get('url') if f else None,
'pgz': (f.get('pgz') or {}).get('name') if f else None,
})(_sport_fed(row.get('sport'))),
'sources': res['sources'],
'current': current,
'proposed': proposed,