R7+: 5x P0 demo fixes — HNS direct link, avatar cache, logo home, klub→sportaši, smarter enrichment

1) HNS direct link u research_links: za sportaša s profile_url/source_url
   (npr. https://semafor.hns.family/igraci/X/...) generira [DIRECT] link na vrhu liste,
   umjesto generic Google search. _research_links sada prima row dict.

2) Avatar cache buster: applyMeToHeader dodaje ?t=Date.now() na sve avatar img tagove.
   Avatar upload handler dodatno persistira novi avatar_url u localStorage.pgz_user
   tako da preživi page refresh + cross-page navigacije.

3) Logo home link: <div class='logo'> → <a href='/' class='logo'> u app.html i sport2.html.
   Klik na PGŽ SPORT logo vodi na public portal.

4) Klub → Sportaši drill-down: u klub Info tabu dodan button
   '👥 Vidi sportaše ovog kluba (N)' koji prebacuje na k-clan tab.
   Plus '🌐 Službena stranica' link kad klub ima web.

5) Smarter klub enrichment:
   - URL validacija (skip placeholder strings poput 'godisnjak_zspgz_2025')
   - Domain candidate guesser (slug → 16 candidate URLs s common HR TLD-ovima i sport prefix-ima)
   - Parallel GET probe (8 threads, 10s budget) — first 200 + name token match wins
   - Subpage scrape (/kontakt, /uprava, /o-nama, /o-klubu, /predsjednik, /klub, /contact, /about) za richer evidence
   - HNK Orijent (id 3766) test: pogađa https://www.orijent.hr/, predlaže web+email+telefon+opis

E2E verified:
- 9/9 sidebar URL-ova → 200
- /users/me/gdpr-export → 200 (28KB JSON)
- /users/me/request-deletion → 200 (DB row pgz_sport.gdpr_erasure_requests)
- /enrich/klub/3766 → 4 proposed fields (web, email, telefon, opis)
- HNS sportaš research_links: HNS profil DIRECT link na vrhu

Backend: routers/enrich_router.py
Frontend: static/app.html, static/sport2.html
Backups: _backups/sprint_1777940670/

Tag: R7-demo-ready
This commit is contained in:
2026-05-05 02:24:30 +02:00
parent 67372d6c58
commit c38f15a566
6 changed files with 6715 additions and 8 deletions
+137 -4
View File
@@ -381,11 +381,27 @@ def _sport_fed(sport: Optional[str]) -> Optional[dict]:
return feds.get(norm)
def _research_links(naziv, kind, grad=None, sport: Optional[str] = None):
def _research_links(naziv, kind, grad=None, sport: Optional[str] = None, row: Optional[dict] = None):
base_q = (naziv or '').strip()
q = (base_q + ' ' + grad) if grad else base_q
qenc = urllib.parse.quote(q)
out = [
out = []
# Prefer DIRECT profile/source link if entity already has one (e.g. HNS Semafor)
if row:
direct = row.get('profile_url') or row.get('source_url') or row.get('scrape_url') or row.get('web') or row.get('web_stranica')
if direct and isinstance(direct, str) and direct.startswith(('http://','https://')):
try:
host = urllib.parse.urlparse(direct).hostname or ''
except Exception:
host = ''
label = 'Vanjski profil'
icon = '🔗'
if 'hns' in host: label, icon = 'HNS profil', ''
elif 'transfermarkt' in host: label, icon = 'Transfermarkt', ''
elif 'wikipedia' in host: label, icon = 'Wikipedia', '📚'
elif host.endswith('.hr') or host.endswith('.com'): label, icon = 'Službena stranica', '🌐'
out.append({'label': label, 'icon': icon, 'url': direct, 'is_direct': True})
out += [
{'label': 'Google', 'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
{'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
{'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
@@ -445,11 +461,128 @@ def _is_relevant(source: dict, tokens: list[str]) -> bool:
return any(t in blob for t in tokens)
# ─── Klub domain guesser (HR slug → candidate URLs → GET probe) ────────
import re as _re_klg
def _slugify_klub(naziv: str) -> str:
if not naziv: return ""
s = naziv.lower()
repl = (("č","c"),("ć","c"),("ž","z"),("š","s"),("đ","d"),
('"',''),("'",""),("(",""),(")",""),(",",""),(".",""),
("/",""),("\\",""))
for a,b in repl: s = s.replace(a,b)
s = _re_klg.sub(r"[^a-z0-9]+", "-", s).strip("-")
return s
def _klub_domain_candidates(naziv: str) -> list[str]:
"""Generate ranked candidate URLs from club name."""
if not naziv: return []
s = _slugify_klub(naziv)
# Strip common prefixes for cleaner domains
base = s
for pref in ("hnk-","nk-","rk-","kk-","ok-","bk-","gk-","tk-","ak-","hbk-"):
if base.startswith(pref):
base = base[len(pref):]; break
# also try short prefix-ed variants
short = base.split("-")[0] if base else ""
candidates = []
sports_prefixes = ["nk-","hnk-","rk-","kk-","bk-","ok-","ak-","tk-"]
# full slug with original prefix
for tld in (".hr",".com",".eu",".info"):
candidates.append(f"https://{s}{tld}")
candidates.append(f"https://www.{s}{tld}")
# base-only
for tld in (".hr",".com"):
candidates.append(f"https://{base}{tld}")
candidates.append(f"https://www.{base}{tld}")
# try sport prefixes if name doesn't already have one
if not any(s.startswith(p) for p in sports_prefixes):
for sp in sports_prefixes[:5]:
for tld in (".hr",".com"):
candidates.append(f"https://{sp}{base}{tld}")
# dedup, preserve order
seen, out = set(), []
for c in candidates:
if c not in seen:
seen.add(c); out.append(c)
return out[:20]
def _probe_klub_url(url: str, naziv_tokens: list, timeout: int = 5) -> Optional[dict]:
"""HEAD/GET probe; return doc with raw_text if URL is alive AND mentions club tokens."""
try:
import requests
r = requests.get(url, timeout=timeout, allow_redirects=True,
headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
if r.status_code != 200: return None
if len(r.text) < 200: return None
text = r.text.lower()
# Must mention at least one distinctive token from name
toks = [t.lower() for t in (naziv_tokens or []) if len(t) > 2]
if toks and not any(t in text for t in toks):
return None
return {"source": "domain_probe", "url": r.url, "raw_text": r.text[:50000]}
except Exception:
return None
def _guess_klub_domains(naziv: str, tokens: list) -> Optional[dict]:
"""Parallel probe candidates (5 workers, 4s timeout each); first hit wins."""
from concurrent.futures import ThreadPoolExecutor, as_completed
candidates = _klub_domain_candidates(naziv)
if not candidates: return None
with ThreadPoolExecutor(max_workers=8) as ex:
futs = {ex.submit(_probe_klub_url, url, tokens, 4): url for url in candidates[:16]}
for fut in as_completed(futs, timeout=10):
try:
doc = fut.result()
if doc:
# Cancel remaining (best effort)
for f in futs:
if not f.done(): f.cancel()
return doc
except Exception:
continue
return None
def _scrape_klub_subpages(base_url: str, tokens: list) -> str:
"""Fetch /kontakt /uprava /o-nama /o-klubu and concat texts."""
if not base_url: return ""
import requests
base = base_url.rstrip("/")
paths = ["/kontakt","/uprava","/o-nama","/o-klubu","/predsjednik","/klub","/contact","/about"]
accum = []
for path in paths:
try:
r = requests.get(base + path, timeout=4, allow_redirects=True,
headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
if r.status_code == 200 and len(r.text) > 200:
accum.append(r.text[:30000])
except Exception:
pass
return "\n\n".join(accum)
def _propose_for_klub(row: dict) -> dict:
naziv = row.get('naziv') or ''
primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
# Only consider HTTP(S) URLs as valid primary sources — skip placeholder strings like 'godisnjak_2025'
raw_primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
primary = raw_primary if (raw_primary and isinstance(raw_primary, str) and raw_primary.startswith(('http://','https://'))) else None
sources, evidence = [], []
tokens_pre = _name_tokens(naziv)
pdoc = _fetch_primary_site(primary) if primary else None
if not pdoc:
# No valid web in DB — try to guess domain from club name
pdoc = _guess_klub_domains(naziv, tokens_pre)
if pdoc:
# Also fetch subpages for richer evidence
sub = _scrape_klub_subpages(pdoc.get('url',''), tokens_pre)
if sub:
pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
elif pdoc:
# Have primary site — also fetch its subpages
sub = _scrape_klub_subpages(pdoc.get('url') or primary, tokens_pre)
if sub:
pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '')
wiki = _wiki_summary(naziv)
if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
@@ -1121,7 +1254,7 @@ def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid:
'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
'missing_fields': missing,
'live_snippet': _fetch_title(primary) if primary else None,
'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')),
'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row),
'sport': row.get('sport'),
'sport_federation': (lambda f: {
'national': (f.get('national') or {}).get('name') if f else None,