M12.6 SF: sport-aware enrichment + federation map (HBS, HKS, HRS, HOS, HVS, HPS, HBS boćanje…)
- data/sport_federations.json: 24 Croatian sport federations + aliases +
PGŽ local media (Novi list, Glas Istre, Rijeka.danas).
- enrich_router._sport_fed/_normalize_sport/_load_sport_feds: cached
loader that picks up file changes via mtime.
- _research_links() is now sport-aware: when row.sport maps to a known
federation, the dynamic links list shows that federation (national + PGŽ
regional) plus the three PGŽ local-media search URLs in place of the static
HNS Semafor + Transfermarkt fallback.
- scrape_sport_federation(sport, ime, prezime): generic profile-page
scraper (slug pattern OR search-results crawl) → returns
{profile_url, slika_url, datum_rodenja, mjesto_rodenja, klub_naziv}.
- _propose_for_sportas() now routes through the federation scraper before
HNS Semafor; HNS path is gated to nogomet or rows already linked.
- _load_row(sportas) JOINs klubovi to fall back to klub.sport when
c.sport is empty.
- Tested on 1024 Marijan Alkić (boćanje): proposed profile_url +
datum_rodenja from hrvatski-bocarski-savez.hr; /apply persisted them.
- Tested on 3335 Toni Jelenković (košarka) and 3379 Niko Miknić
(plivanje): research_links surface HKS/KS PGŽ and HPS respectively.
Worker:
- _pick_sportas now selects on coverage<70 across ALL sports (sport
set OR known external linkage), not just hns_*.
- _SOURCE_WEIGHTS extended with 15 federation hosts at 0.88-0.92.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -95,30 +95,46 @@ _SPORTAS_KEYS = ('sport','profile_url','slika_url','hns_igrac_id','biografija',
|
||||
'datum_rodenja','mjesto_rodenja','broj_dresa')
|
||||
|
||||
|
||||
def _coverage_expr(table_keys: tuple[str, ...]) -> str:
|
||||
"""Postgres expression that returns 0..100 coverage % for the row."""
|
||||
def _coverage_expr(table_keys: tuple[str, ...], prefix: str = '') -> str:
|
||||
"""Postgres expression that returns 0..100 coverage % for the row.
|
||||
|
||||
`prefix` is e.g. 'c.' when the SQL uses a table alias.
|
||||
"""
|
||||
parts = []
|
||||
for k in table_keys:
|
||||
parts.append(f"(CASE WHEN {k} IS NOT NULL AND ({k}::text) <> '' THEN 1 ELSE 0 END)")
|
||||
col = f"{prefix}{k}"
|
||||
parts.append(f"(CASE WHEN {col} IS NOT NULL AND ({col}::text) <> '' THEN 1 ELSE 0 END)")
|
||||
total = len(table_keys)
|
||||
return f"((({' + '.join(parts)})::numeric * 100) / {total})"
|
||||
|
||||
|
||||
def _pick_sportas(limit: int = 50) -> list[int]:
|
||||
"""Athletes with coverage<COVERAGE_MAX, randomly ordered."""
|
||||
cov = _coverage_expr(_SPORTAS_KEYS)
|
||||
"""Athletes with coverage<COVERAGE_MAX, randomly ordered.
|
||||
|
||||
Selection is sport-agnostic now: the router decides which federation to
|
||||
query based on c.sport (or klubovi.sport via the JOIN). We require either
|
||||
sport to be set on the row OR a known external linkage so we don't burn
|
||||
cycles on rows the router can't enrich.
|
||||
"""
|
||||
cov = _coverage_expr(_SPORTAS_KEYS, prefix='c.')
|
||||
sql = f"""
|
||||
SELECT id FROM pgz_sport.clanovi
|
||||
WHERE aktivan = TRUE
|
||||
SELECT c.id
|
||||
FROM pgz_sport.clanovi c
|
||||
LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
|
||||
WHERE c.aktivan = TRUE
|
||||
AND {cov} < %s
|
||||
AND (
|
||||
source IN ('hns_semafor','hns_family','manual','godisnjak')
|
||||
OR jsonb_exists(vanjski_id, 'hns_comet')
|
||||
OR (source_url ILIKE '%%semafor.hns.family%%')
|
||||
OR profile_url ILIKE '%%semafor.hns.family%%'
|
||||
c.sport IS NOT NULL
|
||||
OR k.sport IS NOT NULL
|
||||
OR c.source IN ('hns_semafor','hns_family','manual','godisnjak','hbs_savez','hks_savez')
|
||||
OR jsonb_exists(c.vanjski_id, 'hns_comet')
|
||||
OR (c.source_url ILIKE '%%semafor.hns.family%%')
|
||||
OR (c.profile_url ILIKE '%%semafor.hns.family%%')
|
||||
OR (c.source_url ILIKE '%%hrvatski-bocarski-savez.hr%%')
|
||||
OR (c.profile_url ILIKE '%%hrvatski-bocarski-savez.hr%%')
|
||||
)
|
||||
AND ((metadata->>'enriched_at') IS NULL
|
||||
OR (metadata->>'enriched_at')::timestamptz < now() - interval '7 days')
|
||||
AND ((c.metadata->>'enriched_at') IS NULL
|
||||
OR (c.metadata->>'enriched_at')::timestamptz < now() - interval '7 days')
|
||||
ORDER BY random()
|
||||
LIMIT %s
|
||||
"""
|
||||
@@ -179,9 +195,24 @@ def _http_post(path: str, body: dict | None = None) -> dict | None:
|
||||
# zajednica generic info, so we down-weight them so a plain DeepSeek synthesis
|
||||
# off a single sport-pgz.hr source falls below the gate.
|
||||
_SOURCE_WEIGHTS = {
|
||||
'semafor.hns.family': 0.95,
|
||||
'wikipedia.hr': 0.80,
|
||||
'sport-pgz.hr': 0.55,
|
||||
'semafor.hns.family': 0.95,
|
||||
'hrvatski-bocarski-savez.hr': 0.92,
|
||||
'hns-cff.hr': 0.90,
|
||||
'hks-cbf.hr': 0.90,
|
||||
'hrs.hr': 0.90,
|
||||
'hos-cvf.hr': 0.90,
|
||||
'hvs.hr': 0.90,
|
||||
'hps.hr': 0.90,
|
||||
'atletika.hr': 0.90,
|
||||
'htsavez.hr': 0.90,
|
||||
'judo-savez.hr': 0.88,
|
||||
'karate.hr': 0.88,
|
||||
'veslacki-savez.hr': 0.88,
|
||||
'gimnastika.hr': 0.88,
|
||||
'stolni-tenis.hr': 0.88,
|
||||
'kuglanje.hr': 0.88,
|
||||
'wikipedia.hr': 0.80,
|
||||
'sport-pgz.hr': 0.55,
|
||||
}
|
||||
# Fields that are safe to auto-write even from low-confidence sources because
|
||||
# they come from the entity's own structured page (URLs, IDs).
|
||||
|
||||
Reference in New Issue
Block a user