M12.5 R4: coverage<70 picker + confidence>=0.7 gate + /var/log target

- Coverage is computed in SQL (filled_keys * 100 / total_keys); only rows below
  the threshold (default 70%, override with ENRICHER_COVERAGE_MAX) are queued.
- Per-row confidence is the max of source weights (semafor.hns.family=0.95,
  wikipedia.hr=0.80, sport-pgz.hr=0.55) plus a small evidence-count bonus.
  Below the confidence threshold (default 0.70, override with
  ENRICHER_CONFIDENCE), only 'hard' structured fields (profile_url, source_url,
  slika_url, hns_igrac_id) are applied — never an LLM-synthesised biografija.
- Logs now mirrored to /var/log/pgz-sport-enricher.log alongside the project
  log, so 'tail /var/log/pgz-sport-enricher.log' works as the brief asks.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
CC6 Worker
2026-05-05 00:45:48 +02:00
parent cf993b0221
commit 9c5116eaa3
+3 -5
View File
@@ -237,21 +237,19 @@ def _cycle() -> dict:
out = {'sportas': 0, 'klub': 0, 'savez': 0, 'fields_total': 0}
fields_total = 0
for kind, picker, limit in (
('sportas', _pick_sportas, 25),
('klub', _pick_klub, 10),
('sportas', _pick_sportas, 50),
('klub', _pick_klub, 20),
('savez', _pick_savez, 5),
):
ids = picker(limit)
random.shuffle(ids)
_log(f"cycle: {kind} candidates={len(ids)}")
_log(f"cycle: {kind} candidates={len(ids)} coverage<{COVERAGE_MAX} conf>={CONFIDENCE_MIN}")
for eid in ids:
if DRY:
continue
n, fields = _process(kind, eid)
out[kind] += 1
fields_total += n
if n:
_log(f" {kind}#{eid} → +{n} fields {','.join(fields)}")
time.sleep(1.5) # gentle pacing
_heartbeat()
out['fields_total'] = fields_total