R7+: 5x P0 demo fixes — HNS direct link, avatar cache, logo home, klub→sportaši, smarter enrichment

1) HNS direct link u research_links: za sportaš s profile_url/source_url (npr. https://semafor.hns.family/igraci/X/...) generira [⭐DIRECT] link na vrhu liste, umjesto generic Google search. _research_links sada prima row dict. 2) Avatar cache buster: applyMeToHeader dodaje ?t=Date.now() na sve avatar img tagove. Avatar upload handler dodatno persistira novi avatar_url u localStorage.pgz_user tako da preživi page refresh + cross-page navigacije. 3) Logo home link: <div class='logo'> → <a href='/' class='logo'> u app.html i sport2.html. Klik na PGŽ SPORT logo vodi na public portal. 4) Klub → Sportaši drill-down: u klub Info tabu dodan button '👥 Vidi sportaše ovog kluba (N)' koji prebacuje na k-clan tab. Plus '🌐 Službena stranica' link kad klub ima web. 5) Smarter klub enrichment: - URL validacija (skip placeholder strings poput 'godisnjak_zspgz_2025') - Domain candidate guesser (slug → 16 candidate URLs s common HR TLD-ovima i sport prefix-ima) - Parallel GET probe (8 threads, 10s budget) — first 200 + name token match wins - Subpage scrape (/kontakt, /uprava, /o-nama, /o-klubu, /predsjednik) za richer evidence - HNK Orijent (id 3766) test: pogađa https://www.orijent.hr/, predlaže web+email+telefon+opis E2E verified: - 9/9 sidebar URL-ova → 200 - /users/me/gdpr-export → 200 (28KB JSON) - /users/me/request-deletion → 200 (DB row pgz_sport.gdpr_erasure_requests) - /enrich/klub/3766 → 4 proposed fields (web, email, telefon, opis) - HNS sportaš research_links: ⭐ HNS profil DIRECT link na vrhu Backend: routers/enrich_router.py Frontend: static/app.html, static/sport2.html Backups: _backups/sprint_1777940670/ Tag: R7-demo-ready
2026-05-05 02:24:30 +02:00
parent 67372d6c58
commit c38f15a566
6 changed files with 6715 additions and 8 deletions
@@ -381,11 +381,27 @@ def _sport_fed(sport: Optional[str]) -> Optional[dict]:
    return feds.get(norm)


-def _research_links(naziv, kind, grad=None, sport: Optional[str] = None):
+def _research_links(naziv, kind, grad=None, sport: Optional[str] = None, row: Optional[dict] = None):
    base_q = (naziv or '').strip()
    q = (base_q + ' ' + grad) if grad else base_q
    qenc = urllib.parse.quote(q)
-    out = [
+    out = []
+    # Prefer DIRECT profile/source link if entity already has one (e.g. HNS Semafor)
+    if row:
+        direct = row.get('profile_url') or row.get('source_url') or row.get('scrape_url') or row.get('web') or row.get('web_stranica')
+        if direct and isinstance(direct, str) and direct.startswith(('http://','https://')):
+            try:
+                host = urllib.parse.urlparse(direct).hostname or ''
+            except Exception:
+                host = ''
+            label = 'Vanjski profil'
+            icon = '🔗'
+            if 'hns' in host: label, icon = 'HNS profil', '⚽'
+            elif 'transfermarkt' in host: label, icon = 'Transfermarkt', '⚽'
+            elif 'wikipedia' in host: label, icon = 'Wikipedia', '📚'
+            elif host.endswith('.hr') or host.endswith('.com'): label, icon = 'Službena stranica', '🌐'
+            out.append({'label': label, 'icon': icon, 'url': direct, 'is_direct': True})
+    out += [
        {'label': 'Google',       'icon': '🔍', 'url': 'https://www.google.com/search?q=' + qenc},
        {'label': 'Wikipedia HR', 'icon': '📚', 'url': 'https://hr.wikipedia.org/w/index.php?search=' + qenc},
        {'label': 'sport-pgz.hr', 'icon': '🏅', 'url': 'https://sport-pgz.hr/?s=' + qenc},
@@ -445,11 +461,128 @@ def _is_relevant(source: dict, tokens: list[str]) -> bool:
    return any(t in blob for t in tokens)


+
+# ─── Klub domain guesser (HR slug → candidate URLs → GET probe) ────────
+import re as _re_klg
+
+def _slugify_klub(naziv: str) -> str:
+    """Turn a club name into a lowercase ASCII slug with hyphen separators.
+
+    The name is lowercased, the Croatian letters č/ć/ž/š/đ are folded to
+    c/c/z/s/d, and quotes, parentheses, commas, dots and slashes are dropped
+    outright. Every remaining run of characters outside ``a-z0-9`` collapses
+    into a single ``-``, and leading/trailing hyphens are trimmed.
+
+    Returns ``""`` for falsy input.
+    """
+    if not naziv:
+        return ""
+    # Single-pass table: first two arguments map letter → letter, the third
+    # lists characters that are deleted rather than turned into a separator.
+    folding = str.maketrans("čćžšđ", "cczsd", "\"'(),./\\")
+    folded = naziv.lower().translate(folding)
+    hyphenated = _re_klg.sub(r"[^a-z0-9]+", "-", folded)
+    return hyphenated.strip("-")
+
+def _klub_domain_candidates(naziv: str) -> list[str]:
+    """Build an ordered, de-duplicated list of candidate site URLs for a club.
+
+    The name is slugified via ``_slugify_klub``; a leading club-type prefix
+    (``hnk-``, ``nk-``, ``rk-``, ...) is stripped to obtain a bare ``base``.
+    Candidates are emitted in this order:
+
+    1. the full slug on ``.hr/.com/.eu/.info``, each with and without ``www.``;
+    2. the bare base on ``.hr/.com``, each with and without ``www.``;
+    3. only when the name carried no club-type prefix: the base with each of
+       ``nk-/hnk-/rk-/kk-/bk-`` prepended, on ``.hr/.com``.
+
+    Returns at most 20 URLs, or ``[]`` when the name yields an empty slug.
+    """
+    slug = _slugify_klub(naziv) if naziv else ""
+    if not slug:
+        # A name with no ASCII letters/digits would otherwise produce hosts
+        # such as "https://.hr".
+        return []
+    base = slug
+    for pref in ("hnk-", "nk-", "rk-", "kk-", "ok-", "bk-", "gk-", "tk-", "ak-", "hbk-"):
+        if slug.startswith(pref):
+            base = slug[len(pref):]
+            break
+    # Derive "already has a prefix" from the strip itself, so every prefix
+    # that is stripped (including gk-/hbk-) also suppresses step 3.
+    has_prefix = base != slug
+    candidates = []
+    for tld in (".hr", ".com", ".eu", ".info"):
+        candidates.append(f"https://{slug}{tld}")
+        candidates.append(f"https://www.{slug}{tld}")
+    for tld in (".hr", ".com"):
+        candidates.append(f"https://{base}{tld}")
+        candidates.append(f"https://www.{base}{tld}")
+    if not has_prefix:
+        for sp in ("nk-", "hnk-", "rk-", "kk-", "bk-"):
+            for tld in (".hr", ".com"):
+                candidates.append(f"https://{sp}{base}{tld}")
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    return list(dict.fromkeys(candidates))[:20]
+
+def _probe_klub_url(url: str, naziv_tokens: list, timeout: int = 5) -> Optional[dict]:
+    """GET *url* and accept it only if it is alive and mentions the club.
+
+    A candidate is accepted when the request (redirects followed) ends in
+    HTTP 200, the decoded body is at least 200 characters long, and — if any
+    name token longer than 2 characters was supplied — at least one such
+    token occurs in the lowercased body.
+
+    Args:
+        url: Candidate URL to fetch.
+        naziv_tokens: Club-name tokens; may be empty or ``None``, in which
+            case the token check is skipped.
+        timeout: Value passed to ``requests.get`` as ``timeout``.
+
+    Returns:
+        ``{"source": "domain_probe", "url": <final URL>, "raw_text": <first
+        50000 chars of body>}`` on success, otherwise ``None``. Any exception
+        (including a missing ``requests`` package) also yields ``None``.
+    """
+    try:
+        import requests
+        r = requests.get(url, timeout=timeout, allow_redirects=True,
+                         headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
+        if r.status_code != 200:
+            return None
+        # Response.text decodes the body on every access; decode it once.
+        body = r.text
+        if len(body) < 200:
+            return None
+        haystack = body.lower()
+        # Must mention at least one distinctive token from the name.
+        toks = [t.lower() for t in (naziv_tokens or []) if len(t) > 2]
+        if toks and not any(t in haystack for t in toks):
+            return None
+        return {"source": "domain_probe", "url": r.url, "raw_text": body[:50000]}
+    except Exception:
+        # Deliberately best-effort: a failed probe just means "not this URL".
+        return None
+
+def _guess_klub_domains(naziv: str, tokens: list) -> Optional[dict]:
+    """Probe guessed club domains concurrently and return the first hit.
+
+    Up to 16 URLs from ``_klub_domain_candidates(naziv)`` are submitted to
+    ``_probe_klub_url`` on a pool of 8 worker threads, each probe called with
+    a timeout argument of 4. Results are consumed in completion order, so the
+    first probe to *finish* with a truthy document wins, regardless of its
+    position in the candidate list. Waiting for results is capped at 10
+    seconds overall.
+
+    Args:
+        naziv: Club name used to generate candidate URLs.
+        tokens: Name tokens forwarded to each probe for the relevance check.
+
+    Returns:
+        The winning probe document, or ``None`` when there are no candidates,
+        no probe succeeds, or the 10-second budget runs out.
+    """
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    from concurrent.futures import TimeoutError as _FuturesTimeout
+    candidates = _klub_domain_candidates(naziv)
+    if not candidates:
+        return None
+    # No `with` block: its implicit shutdown(wait=True) would hold the caller
+    # until every in-flight probe finishes, defeating both "first hit wins"
+    # and the overall budget.
+    ex = ThreadPoolExecutor(max_workers=8)
+    futs = []
+    try:
+        futs = [ex.submit(_probe_klub_url, url, tokens, 4) for url in candidates[:16]]
+        for fut in as_completed(futs, timeout=10):
+            try:
+                doc = fut.result()
+            except Exception:
+                continue
+            if doc:
+                return doc
+    except _FuturesTimeout:
+        # as_completed raises from the iterator itself once the budget is
+        # exhausted; treat that as "nothing found" rather than an error.
+        return None
+    finally:
+        # Best effort: only not-yet-started probes can actually be cancelled;
+        # running ones finish in the background within their own timeout.
+        for f in futs:
+            f.cancel()
+        ex.shutdown(wait=False)
+    return None
+
+def _scrape_klub_subpages(base_url: str, tokens: list) -> str:
+    """Fetch well-known "about/contact" subpages of a club site and join them.
+
+    Requests, one after another, ``/kontakt``, ``/uprava``, ``/o-nama``,
+    ``/o-klubu``, ``/predsjednik``, ``/klub``, ``/contact`` and ``/about``
+    appended to *base_url* (trailing slashes removed). A page contributes its
+    first 30000 characters when it returns HTTP 200 with a body longer than
+    200 characters; pages that fail for any reason are skipped.
+
+    Args:
+        base_url: Site root to extend; falsy values short-circuit to ``""``.
+        tokens: Currently unused; kept so existing callers keep working.
+
+    Returns:
+        The collected page texts joined by blank lines, or ``""`` if none
+        qualified.
+    """
+    if not base_url:
+        return ""
+    import requests
+    base = base_url.rstrip("/")
+    paths = ["/kontakt","/uprava","/o-nama","/o-klubu","/predsjednik","/klub","/contact","/about"]
+    accum = []
+    for path in paths:
+        try:
+            r = requests.get(base + path, timeout=4, allow_redirects=True,
+                             headers={"User-Agent":"Mozilla/5.0 RinetEnrichBot/1.0"})
+            if r.status_code != 200:
+                continue
+            # Response.text decodes the body on every access; decode it once.
+            body = r.text
+            if len(body) > 200:
+                accum.append(body[:30000])
+        except Exception:
+            # Best-effort enrichment: a missing or broken subpage is expected.
+            continue
+    return "\n\n".join(accum)
+
+
 def _propose_for_klub(row: dict) -> dict:
    naziv = row.get('naziv') or ''
-    primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
+    # Only consider HTTP(S) URLs as valid primary sources — skip placeholder strings like 'godisnjak_2025'
+    raw_primary = row.get('web') or row.get('web_stranica') or row.get('source_url') or row.get('scrape_url')
+    primary = raw_primary if (raw_primary and isinstance(raw_primary, str) and raw_primary.startswith(('http://','https://'))) else None
    sources, evidence = [], []
+    tokens_pre = _name_tokens(naziv)
    pdoc = _fetch_primary_site(primary) if primary else None
+    if not pdoc:
+        # No valid web in DB — try to guess domain from club name
+        pdoc = _guess_klub_domains(naziv, tokens_pre)
+        if pdoc:
+            # Also fetch subpages for richer evidence
+            sub = _scrape_klub_subpages(pdoc.get('url',''), tokens_pre)
+            if sub:
+                pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
+    elif pdoc:
+        # Have primary site — also fetch its subpages
+        sub = _scrape_klub_subpages(pdoc.get('url') or primary, tokens_pre)
+        if sub:
+            pdoc['raw_text'] = (pdoc.get('raw_text','') + '\n\n' + sub)[:120000]
    if pdoc: sources.append(pdoc); evidence.append(pdoc.get('raw_text') or pdoc.get('extract') or '')
    wiki = _wiki_summary(naziv)
    if wiki: sources.append(wiki); evidence.append(wiki.get('extract') or '')
@@ -1121,7 +1254,7 @@ def enrich_preview(kind: str = _FPath(..., regex='^(klub|savez|sportas)$'), eid:
        'coverage': coverage, 'filled_fields': filled, 'total_fields': len(keys),
        'missing_fields': missing,
        'live_snippet': _fetch_title(primary) if primary else None,
-        'research_links': _research_links(naziv, kind, grad, sport=row.get('sport')),
+        'research_links': _research_links(naziv, kind, grad, sport=row.get('sport'), row=row),
        'sport': row.get('sport'),
        'sport_federation': (lambda f: {
            'national': (f.get('national') or {}).get('name') if f else None,