diff --git a/workers/enrichment_worker.py b/workers/enrichment_worker.py index 859bbc9..ce005bf 100644 --- a/workers/enrichment_worker.py +++ b/workers/enrichment_worker.py @@ -95,30 +95,46 @@ _SPORTAS_KEYS = ('sport','profile_url','slika_url','hns_igrac_id','biografija', 'datum_rodenja','mjesto_rodenja','broj_dresa') -def _coverage_expr(table_keys: tuple[str, ...]) -> str: - """Postgres expression that returns 0..100 coverage % for the row.""" +def _coverage_expr(table_keys: tuple[str, ...], prefix: str = '') -> str: + """Postgres expression that returns 0..100 coverage % for the row. + + `prefix` is e.g. 'c.' when the SQL uses a table alias. + """ parts = [] for k in table_keys: - parts.append(f"(CASE WHEN {k} IS NOT NULL AND ({k}::text) <> '' THEN 1 ELSE 0 END)") + col = f"{prefix}{k}" + parts.append(f"(CASE WHEN {col} IS NOT NULL AND ({col}::text) <> '' THEN 1 ELSE 0 END)") total = len(table_keys) return f"((({' + '.join(parts)})::numeric * 100) / {total})" def _pick_sportas(limit: int = 50) -> list[int]: - """Athletes with coverage>'enriched_at') IS NULL - OR (metadata->>'enriched_at')::timestamptz < now() - interval '7 days') + AND ((c.metadata->>'enriched_at') IS NULL + OR (c.metadata->>'enriched_at')::timestamptz < now() - interval '7 days') ORDER BY random() LIMIT %s """ @@ -179,9 +195,24 @@ def _http_post(path: str, body: dict | None = None) -> dict | None: # zajednica generic info, so we down-weight them so a plain DeepSeek synthesis # off a single sport-pgz.hr source falls below the gate. _SOURCE_WEIGHTS = { - 'semafor.hns.family': 0.95, - 'wikipedia.hr': 0.80, - 'sport-pgz.hr': 0.55, + 'semafor.hns.family': 0.95, + 'hrvatski-bocarski-savez.hr': 0.92, + 'hns-cff.hr': 0.90, + 'hks-cbf.hr': 0.90, + 'hrs.hr': 0.90, + 'hos-cvf.hr': 0.90, + 'hvs.hr': 0.90, + 'hps.hr': 0.90, + 'atletika.hr': 0.90, + 'htsavez.hr': 0.90, + 'judo-savez.hr': 0.88, + 'karate.hr': 0.88, + 'veslacki-savez.hr': 0.88, + 'gimnastika.hr': 0.88, + 'stolni-tenis.hr': 0.88, + 'kuglanje.hr': 0.88, + 'wikipedia.hr': 0.80, + 'sport-pgz.hr': 0.55, } # Fields that are safe to auto-write even from low-confidence sources because # they come from the entity's own structured page (URLs, IDs).