M12.4: real HNS Semafor scraper for sportas + 24/7 enrichment worker

Critical bug fix: /v2/enrich/sportas/{id} returned proposed:{} for athletes
because the v3 pipeline was still relying on Wikipedia-only evidence and never
actually fetched semafor.hns.family/igraci/.

- enrich_router._propose_for_sportas now:
  • Resolves an HNS Semafor URL from profile_url, source_url, hns_igrac_id,
    vanjski_id JSONB ('hns_comet' or 'hns_pid', plus optional 'hns_slug'), or
    source_id when source starts with 'hns_' (e.g. 'hns_semafor').
  • Fetches and parses the player page (BS4, regex fallback) and proposes
    profile_url, source_url, slika_url, hns_igrac_id, datum_rodenja,
    mjesto_rodenja, broj_dresa, biografija (DeepSeek synthesis from HNS+Wiki).
- _load_row(sportas) widened to read every relevant column + vanjski_id.
- _TABLE_MAP['sportas'] writeback whitelist expanded to 12 fields.
- workers/enrichment_worker.py: 24/7 daemon, picks under-enriched
  clanovi/klubovi/savezi every 5 min via SQL, calls /apply for each.
- systemd unit pgz-sport-enricher.service installed + enabled.
- Tested end-to-end: id=2222 (Abdija) and id=449 (Zec) now have
  profile_url, slika_url, hns_igrac_id, biografija persisted.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
CC6 Worker
2026-05-05 00:36:57 +02:00
parent 47c366de7e
commit ece556de11
2 changed files with 416 additions and 11 deletions
+216 -11
View File
@@ -280,8 +280,12 @@ def _load_row(kind: str, eid: int) -> dict:
adresa, godina_osnutka, source_url, metadata
FROM pgz_sport.savezi WHERE id=%s""", (eid,))
elif kind == 'sportas':
row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url, scrape_url,
slika_url, source_url, hns_igrac_id, biografija, metadata
row = _fetch_one("""SELECT id, ime, prezime, sport, klub_id, profile_url,
slika_url, source_url, source, source_id,
hns_igrac_id, biografija,
datum_rodenja, mjesto_rodenja, broj_dresa,
visina_cm, tezina_kg, dominantna_noga, oib,
vanjski_id, metadata
FROM pgz_sport.clanovi WHERE id=%s""", (eid,))
else:
raise HTTPException(400, "kind must be klub|savez|sportas")
@@ -401,16 +405,213 @@ def _propose_for_savez(row: dict) -> dict:
return {'proposed': proposed, 'sources': sources}
# ─── HNS Semafor parsing ────────────────────────────────────────────────
# Site root for HNS Semafor; player-page URLs are built from it as
# <base>/igraci/<numeric id>/<slug>/.
_HNS_BASE = 'https://semafor.hns.family'
def _slugify(name: str) -> str:
import unicodedata
s = unicodedata.normalize('NFKD', name or '').encode('ascii', 'ignore').decode('ascii').lower()
return re.sub(r'[^a-z0-9]+', '-', s).strip('-')
def _hns_url_from_row(row: dict) -> Optional[str]:
"""Try to build a semafor.hns.family /igraci/ URL for this row."""
# 1) Already-set columns
for k in ('profile_url', 'source_url'):
u = row.get(k)
if u and 'semafor.hns.family/igraci/' in (u or ''):
return u
# 2) hns_igrac_id column
pid = row.get('hns_igrac_id')
if pid:
slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
return f'{_HNS_BASE}/igraci/{int(pid)}/{slug}/'
# 3) vanjski_id JSONB → hns_comet
vid = row.get('vanjski_id') or {}
if isinstance(vid, dict):
comet = vid.get('hns_comet') or vid.get('hns_pid')
slug = vid.get('hns_slug') or _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
if comet:
try:
return f'{_HNS_BASE}/igraci/{int(comet)}/{slug}/'
except Exception:
pass
# 4) source='hns_semafor' + source_id
if (row.get('source') or '').startswith('hns_') and row.get('source_id'):
try:
slug = _slugify(((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip())
return f'{_HNS_BASE}/igraci/{int(row["source_id"])}/{slug}/'
except Exception:
pass
return None
def _parse_hns_player(html_doc: str, url: str) -> Optional[dict]:
"""Extract structured fields from a semafor.hns.family player page."""
if not html_doc: return None
try:
from bs4 import BeautifulSoup
except Exception:
return _parse_hns_player_regex(html_doc, url)
soup = BeautifulSoup(html_doc, 'html.parser')
out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
# hns_igrac_id from URL
m = re.search(r'/igraci/(\d+)/', url)
if m: out['hns_igrac_id'] = int(m.group(1))
title = soup.find('title')
if title: out['title'] = title.get_text(strip=True)[:300]
# Photo
photo = soup.find('div', class_='photo')
if photo:
img = photo.find('img')
if img and img.get('src'):
src = img['src']
if not src.startswith('http'):
src = urllib.parse.urljoin(url, src)
out['slika_url'] = src
# Player number (jersey)
pn = soup.find('div', class_='playerName')
if pn:
h3 = pn.find('h3')
if h3:
t = h3.get_text(strip=True)
if t.isdigit():
out['broj_dresa'] = int(t)
# Datum rodjenja
li = soup.find('li', class_='dob')
if li:
h4 = li.find('h4')
if h4:
t = h4.get_text(' ', strip=True)
mm = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
if mm:
from datetime import date as _date
try:
out['datum_rodenja'] = _date(int(mm.group(3)), int(mm.group(2)), int(mm.group(1))).isoformat()
except Exception:
pass
# Mjesto rodjenja
li = soup.find('li', class_='pob')
if li:
h4 = li.find('h4')
if h4:
out['mjesto_rodenja'] = h4.get_text(strip=True)
# Trenutni klub (info only — we don't reassign klub_id from here)
klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
if klub_link:
h4 = klub_link.find('h4')
if h4:
out['trenutni_klub'] = h4.get_text(strip=True)
m = re.search(r'/klubovi/(\d+)/', klub_link.get('href') or '')
if m: out['hns_klub_id'] = int(m.group(1))
# Description (meta)
meta_d = soup.find('meta', attrs={'name': 'description'})
if meta_d and meta_d.get('content'):
out['description'] = meta_d['content'][:600]
# Make a clean text blob for relevance / DeepSeek
text = soup.get_text(' ', strip=True)
out['raw_text'] = re.sub(r'\s+', ' ', text)[:4000]
out['extract'] = (out.get('description')
or (out['raw_text'][:500] if out.get('raw_text') else None))
return out
def _parse_hns_player_regex(html_doc: str, url: str) -> Optional[dict]:
"""BS4-free fallback parser."""
out: dict[str, Any] = {'source': 'semafor.hns.family', 'url': url}
m = re.search(r'/igraci/(\d+)/', url)
if m: out['hns_igrac_id'] = int(m.group(1))
m = re.search(r'<div class="photo"><img src="([^"]+)"', html_doc)
if m:
src = m.group(1)
if not src.startswith('http'): src = urllib.parse.urljoin(url, src)
out['slika_url'] = src
m = re.search(r'<li class="dob">.*?<h4>(\d{1,2})\.(\d{1,2})\.(\d{4})', html_doc, re.S)
if m:
from datetime import date as _date
try:
out['datum_rodenja'] = _date(int(m.group(3)), int(m.group(2)), int(m.group(1))).isoformat()
except Exception:
pass
m = re.search(r'<li class="pob"><i></i><h4>([^<]+)</h4>', html_doc)
if m: out['mjesto_rodenja'] = m.group(1).strip()
m = re.search(r'<div class="playerName"><h3>(\d+)</h3>', html_doc)
if m: out['broj_dresa'] = int(m.group(1))
m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html_doc)
if m: out['description'] = m.group(1)[:600]
return out
def _hns_fetch_player(url: str) -> Optional[dict]:
    """Download a player page and return its parsed fields.

    A plain HTTP GET with an 8-second timeout is tried first. If that
    yields nothing and the Playwright scraper is available, the page is
    rendered once; when the render reports more than 2000 characters of
    HTML, the plain GET is repeated with a 15-second timeout. The rendered
    HTML itself is not returned by ``fetch_rendered``, so only the retried
    GET body can be parsed.

    Returns:
        The dict produced by ``_parse_hns_player``, or ``None`` when no
        body could be obtained.
    """
    html_text = _http_get(url, timeout=8)

    if not html_text and _HAS_PW and _pw_scraper is not None:
        # Playwright fallback: used only as a signal that the page exists.
        rendered = _pw_scraper.fetch_rendered(url, timeout_ms=15000)
        if rendered and rendered.get('html_len', 0) > 2000:
            # HTML is needed for parsing, so retry over HTTP with a longer timeout.
            html_text = _http_get(url, timeout=15)

    if not html_text:
        return None
    return _parse_hns_player(html_text, url)
def _propose_for_sportas(row: dict) -> dict:
    """Propose enrichment values for one athlete row.

    Steps:

    1. Resolve and fetch the athlete's HNS Semafor page. Each field it
       provides is proposed only when the corresponding column in *row*
       is empty.
    2. When ``biografija`` is empty, look the athlete up on Wikipedia and
       synthesise a biography from all collected evidence. If synthesis
       returns nothing, the first source extract of at least 80 characters
       is used instead. The biography is capped at 2000 characters.

    Each external source is queried at most once, and blank evidence
    strings are never passed to the synthesiser.

    Args:
        row: Athlete row as a dict of column name to value.

    Returns:
        ``{'proposed': {column: value, ...}, 'sources': [source dicts]}``.
    """
    naziv = ((row.get('ime') or '') + ' ' + (row.get('prezime') or '')).strip()
    sources, evidence = [], []
    proposed: dict[str, Any] = {}

    # 1) Resolve a HNS Semafor URL for this athlete (column / vanjski_id / source_id)
    hns_url = _hns_url_from_row(row)
    hns_doc = _hns_fetch_player(hns_url) if hns_url else None
    if hns_doc:
        sources.append(hns_doc)
        hns_text = hns_doc.get('raw_text') or hns_doc.get('extract') or ''
        if hns_text:
            evidence.append(hns_text)

        # Field-level proposals from HNS Semafor (only when DB is empty).
        # The page URL fills both profile_url and source_url.
        page_url = hns_doc.get('url')
        field_values = (
            ('profile_url', page_url),
            ('source_url', page_url),
            ('slika_url', hns_doc.get('slika_url')),
            ('hns_igrac_id', hns_doc.get('hns_igrac_id')),
            ('datum_rodenja', hns_doc.get('datum_rodenja')),
            ('mjesto_rodenja', hns_doc.get('mjesto_rodenja')),
            ('broj_dresa', hns_doc.get('broj_dresa')),
        )
        for column, value in field_values:
            if value and not row.get(column):
                proposed[column] = value

    # 2) Wikipedia HR for biografija — only needed when there is none yet.
    if not row.get('biografija'):
        wiki = _wiki_summary(naziv)
        if wiki:
            sources.append(wiki)
            wiki_text = wiki.get('extract') or ''
            if wiki_text:
                evidence.append(wiki_text)

        # Description: prefer DeepSeek synthesis from all evidence;
        # fallback to the first sufficiently long source extract.
        descr = _deepseek_describe(naziv, 'sportaš', evidence) if evidence else None
        if not descr:
            for src in sources:
                ext = src.get('extract')
                if ext and len(ext) >= 80:
                    descr = ext
                    break
        if descr:
            proposed['biografija'] = descr.strip()[:2000]

    return {'proposed': proposed, 'sources': sources}
@@ -428,7 +629,9 @@ def enrich_preview(kind: str, eid: int):
elif kind == 'savez':
keys = ['oib','sport','predsjednik','tajnik','email','telefon','web','adresa','godina_osnutka']
else:
keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija']
keys = ['sport','profile_url','slika_url','hns_igrac_id','biografija',
'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm','tezina_kg',
'dominantna_noga','oib']
naziv = _display_name(kind, row)
grad = row.get('grad') if kind == 'klub' else None
@@ -462,11 +665,13 @@ def enrich_preview(kind: str, eid: int):
_TABLE_MAP = {
'klub': ('pgz_sport.klubovi',
{'web','email','telefon','predsjednik','tajnik',
'opis_djelatnosti','ciljevi','godina_osnutka','sjediste'}),
'opis_djelatnosti','ciljevi','godina_osnutka','sjediste','adresa'}),
'savez': ('pgz_sport.savezi',
{'web','email','telefon','predsjednik','tajnik','adresa','godina_osnutka'}),
'sportas': ('pgz_sport.clanovi',
{'biografija','profile_url','slika_url'}),
{'biografija','profile_url','source_url','slika_url','hns_igrac_id',
'datum_rodenja','mjesto_rodenja','broj_dresa','visina_cm',
'tezina_kg','dominantna_noga','oib'}),
}