From faf6beb536c977c0d674afb610d8bd9a5b6da9d2 Mon Sep 17 00:00:00 2001 From: CC6 Worker Date: Tue, 5 May 2026 01:30:16 +0200 Subject: [PATCH] =?UTF-8?q?M12.6=20SF:=20sport-aware=20enrichment=20+=20fe?= =?UTF-8?q?deration=20map=20(HBS,=20HKS,=20HRS,=20HOS,=20HVS,=20HPS,=20HBS?= =?UTF-8?q?=20bocanje=E2=80=A6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - data/sport_federations.json: 24 Croatian sport federations + aliases + PGŽ local media (Novi list, Glas Istre, Rijeka.danas). - enrich_router._sport_fed/_normalize_sport/_load_sport_feds: cached loader that picks up file changes via mtime. - _research_links() now sport-aware: when row.sport maps to a known fed, the dynamic links list shows that fed (national + PGŽ regional) plus the three PGŽ local-media search URLs in place of the static HNS Semafor + transfermarkt fallback. - scrape_sport_federation(sport, ime, prezime): generic profile-page scraper (slug pattern OR search-results crawl) → returns {profile_url, slika_url, datum_rodenja, mjesto_rodenja, klub_naziv}. - _propose_for_sportas() now routes through the federation scraper before HNS Semafor; HNS path is gated to nogomet or rows already linked. - _load_row(sportas) JOINs klubovi to fall back to klub.sport when c.sport is empty. - Tested on 1024 Marijan Alkić (boćanje): proposed profile_url + datum_rodenja from hrvatski-bocarski-savez.hr; /apply persisted them. - Tested on 3335 Toni Jelenković (košarka) and 3379 Niko Miknić (plivanje): research_links surface HKS/KS PGŽ and HPS respectively. Worker: - _pick_sportas now selects on coverage<70 across ALL sports (sport set OR known external linkage), not just hns_*. - _SOURCE_WEIGHTS extended with 15 federation hosts at 0.88-0.92. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- workers/enrichment_worker.py | 63 +++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/workers/enrichment_worker.py b/workers/enrichment_worker.py index 859bbc9..ce005bf 100644 --- a/workers/enrichment_worker.py +++ b/workers/enrichment_worker.py @@ -95,30 +95,46 @@ _SPORTAS_KEYS = ('sport','profile_url','slika_url','hns_igrac_id','biografija', 'datum_rodenja','mjesto_rodenja','broj_dresa') -def _coverage_expr(table_keys: tuple[str, ...]) -> str: - """Postgres expression that returns 0..100 coverage % for the row.""" +def _coverage_expr(table_keys: tuple[str, ...], prefix: str = '') -> str: + """Postgres expression that returns 0..100 coverage % for the row. + + `prefix` is e.g. 'c.' when the SQL uses a table alias. + """ parts = [] for k in table_keys: - parts.append(f"(CASE WHEN {k} IS NOT NULL AND ({k}::text) <> '' THEN 1 ELSE 0 END)") + col = f"{prefix}{k}" + parts.append(f"(CASE WHEN {col} IS NOT NULL AND ({col}::text) <> '' THEN 1 ELSE 0 END)") total = len(table_keys) return f"((({' + '.join(parts)})::numeric * 100) / {total})" def _pick_sportas(limit: int = 50) -> list[int]: - """Athletes with coverage>'enriched_at') IS NULL - OR (metadata->>'enriched_at')::timestamptz < now() - interval '7 days') + AND ((c.metadata->>'enriched_at') IS NULL + OR (c.metadata->>'enriched_at')::timestamptz < now() - interval '7 days') ORDER BY random() LIMIT %s """ @@ -179,9 +195,24 @@ def _http_post(path: str, body: dict | None = None) -> dict | None: # zajednica generic info, so we down-weight them so a plain DeepSeek synthesis # off a single sport-pgz.hr source falls below the gate. 
_SOURCE_WEIGHTS = { - 'semafor.hns.family': 0.95, - 'wikipedia.hr': 0.80, - 'sport-pgz.hr': 0.55, + 'semafor.hns.family': 0.95, + 'hrvatski-bocarski-savez.hr': 0.92, + 'hns-cff.hr': 0.90, + 'hks-cbf.hr': 0.90, + 'hrs.hr': 0.90, + 'hos-cvf.hr': 0.90, + 'hvs.hr': 0.90, + 'hps.hr': 0.90, + 'atletika.hr': 0.90, + 'htsavez.hr': 0.90, + 'judo-savez.hr': 0.88, + 'karate.hr': 0.88, + 'veslacki-savez.hr': 0.88, + 'gimnastika.hr': 0.88, + 'stolni-tenis.hr': 0.88, + 'kuglanje.hr': 0.88, + 'wikipedia.hr': 0.80, + 'sport-pgz.hr': 0.55, } # Fields that are safe to auto-write even from low-confidence sources because # they come from the entity's own structured page (URLs, IDs).