#!/usr/bin/env python3
"""
coverage_report.py — Per-entity coverage scoring across pgz_sport schema

For every entity type listed in DEFS this script computes:
  - per-type aggregate (n, mean coverage, median, # zero-coverage, # complete)
  - distribution histogram (10-point buckets)
  - top 50 entities most needing manual review (lowest coverage AND non-empty name)

The result is written as JSON to OUTPUT_PATH (/tmp/coverage_data.json).

NOTE(review): the original header said this fills
/opt/pgz-sport/data_quality_report.md and links each entity to its detail
panel. Nothing in this file renders markdown or calls DEFS[...]['panel_path'];
presumably a later step consumes the JSON — confirm.
"""
import json
import os
import statistics
from collections import Counter
from contextlib import closing
from datetime import datetime, timezone

# The password is read with .get() so that importing this module (for example
# to reuse stats()) does not require credentials; main() refuses to run
# when it is missing.
PG = dict(host='10.10.0.2', port=6432, dbname='rinet_v3', user='rinet',
          password=os.environ.get("DB_PASSWORD"))

# Default destination of the JSON payload produced by main().
OUTPUT_PATH = '/tmp/coverage_data.json'


def _text(col):
    """SQL condition: text column *col* is neither NULL nor the empty string."""
    return f"{col} IS NOT NULL AND {col}<>''"


def _present(col):
    """SQL condition: column *col* is not NULL (used where ``<>''`` is not applied)."""
    return f"{col} IS NOT NULL"


def _either_text(first, second):
    """SQL condition: at least one of two text columns is non-NULL and non-empty.

    NULLIF turns '' into NULL before COALESCE, so an empty *first* no longer
    hides a filled *second*.
    """
    return f"COALESCE(NULLIF({first},''), NULLIF({second},'')) IS NOT NULL"


# Per-type coverage definition.
#   table      — source table
#   name_col   — SQL expression for the display name (selected AS naziv)
#   fields     — fields that count toward coverage; len(fields) is the denominator
#   filled_sql — optional per-field SQL condition; fields not listed here use
#                the default text check (NOT NULL and <> '')
#   panel_path — builds the detail-panel URL for an id (not called in this file)
DEFS = {
    'savez': {
        'table': 'pgz_sport.savezi',
        'name_col': 'naziv',
        'fields': ['naziv', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon',
                   'web', 'oib', 'adresa', 'godina_osnutka'],
        'filled_sql': {
            'godina_osnutka': _present('godina_osnutka'),
        },
        'panel_path': lambda i: f'/?nav=savezi&open={i}',
    },
    'klub': {
        'table': 'pgz_sport.klubovi',
        'name_col': 'naziv',
        # web OR web_stranica counts; sjediste OR adresa counts
        'fields': ['naziv', 'sport', 'grad', 'oib', 'predsjednik', 'tajnik', 'email', 'telefon',
                   'web_or_stranica', 'sjediste_or_adresa', 'ciljevi', 'opis_djelatnosti'],
        'filled_sql': {
            'web_or_stranica': _either_text('web', 'web_stranica'),
            'sjediste_or_adresa': _either_text('sjediste', 'adresa'),
        },
        'panel_path': lambda i: f'/?nav=klubovi&open={i}',
    },
    'sportas': {
        'table': 'pgz_sport.clanovi',
        # NULL-safe: a bare ime||' '||prezime is NULL when either part is NULL.
        'name_col': "COALESCE(ime,'')||' '||COALESCE(prezime,'')",
        'fields': ['ime', 'prezime', 'sport', 'klub_id', 'datum_rodenja', 'slika_url',
                   'oib', 'profile_url', 'biografija', 'hns_igrac_id'],
        'filled_sql': {
            'klub_id': _present('klub_id'),
            'datum_rodenja': _present('datum_rodenja'),
        },
        'panel_path': lambda i: f'/?nav=sportasi&open={i}',
    },
    'objekt': {
        'table': 'pgz_sport.sportski_objekti',
        'name_col': 'naziv',
        'fields': ['naziv', 'tip', 'grad', 'adresa', 'lat', 'lng', 'upravitelj',
                   'kapacitet', 'sportovi', 'izgradeno'],
        'filled_sql': {
            'lat': _present('lat'),
            'lng': _present('lng'),
            'kapacitet': _present('kapacitet'),
            'sportovi': "sportovi IS NOT NULL AND array_length(sportovi,1)>0",
            'izgradeno': _present('izgradeno'),
        },
        'panel_path': lambda i: f'/?nav=objekti&open={i}',
    },
    'manifestacija': {
        'table': 'pgz_sport.manifestacije',
        'name_col': 'naziv',
        'fields': ['naziv', 'mjesto', 'organizator', 'razina', 'broj_ucesnika',
                   'godina_od', 'source_url'],
        'filled_sql': {
            'broj_ucesnika': "broj_ucesnika IS NOT NULL AND broj_ucesnika::text<>''",
            'godina_od': _present('godina_od'),
        },
        'panel_path': lambda i: f'/?nav=manifestacije&open={i}',
    },
}


def _coverage_sql(kind):
    """Build the SELECT returning id, naziv and the filled-field count for *kind*.

    One ``CASE WHEN … THEN 1 ELSE 0 END`` term is emitted per entry of
    DEFS[kind]['fields'], so the numerator can never drift from the
    denominator len(fields). Only constants from DEFS are interpolated.
    """
    spec = DEFS[kind]
    overrides = spec.get('filled_sql', {})
    terms = [
        f"CASE WHEN {overrides.get(field) or _text(field)} THEN 1 ELSE 0 END"
        for field in spec['fields']
    ]
    filled_expr = '\n        + '.join(terms)
    return (
        f"SELECT id, ({spec['name_col']}) AS naziv,\n"
        f"       ({filled_expr}) AS filled\n"
        f"  FROM {spec['table']}"
    )


def fetch_rows(cur, kind: str):
    """Fetch one coverage record per row of the table behind *kind*.

    Args:
        cur: DB cursor whose fetchall() yields mapping-style rows with the
            keys 'id', 'naziv' and 'filled' (main() passes a RealDictCursor).
        kind: a key of DEFS.

    Returns:
        A list of dicts with keys 'kind', 'id', 'naziv' (NULL mapped to ''),
        'filled' (int) and 'total' (number of coverage fields for the kind).

    Raises:
        KeyError: if *kind* is not a key of DEFS.
    """
    total = len(DEFS[kind]['fields'])
    cur.execute(_coverage_sql(kind))
    return [
        {
            'kind': kind,
            'id': rec['id'],
            'naziv': rec['naziv'] or '',
            'filled': int(rec['filled']),
            'total': total,
        }
        for rec in cur.fetchall()
    ]


def stats(rows):
    """Aggregate coverage percentages over *rows*.

    Args:
        rows: iterable-sized collection of dicts with 'filled' and 'total'.

    Returns:
        {} for empty input; otherwise a dict with 'n', 'mean', 'median'
        (both rounded to one decimal), 'zero' (rows at 0 %), 'complete'
        (rows at >= 99 %) and 'distribution' (bucket start -> count, buckets
        0, 10, …, 90 with 100 % folded into the 90 bucket).
    """
    if not rows:
        return {}
    # Multiply before dividing so exact multiples of 10 stay exact
    # (3/10*100 is 30.000000000000004, 3*100/10 is 30.0).
    pcts = sorted(r['filled'] * 100 / r['total'] for r in rows)
    n = len(pcts)
    bins = Counter()
    for p in pcts:
        bucket = int(p // 10) * 10
        if bucket == 100:
            bucket = 90
        bins[bucket] += 1
    return {
        'n': n,
        'mean': round(sum(pcts) / n, 1),
        # True median: averages the two middle values when n is even.
        'median': round(statistics.median(pcts), 1),
        'zero': sum(1 for p in pcts if p == 0),
        'complete': sum(1 for p in pcts if p >= 99.0),
        'distribution': dict(sorted(bins.items())),
    }


def main(out_path=OUTPUT_PATH):
    """Score every entity type, print a summary and dump the JSON payload.

    Args:
        out_path: file the JSON payload is written to.

    Raises:
        SystemExit: if the DB_PASSWORD environment variable was not set.
    """
    # Imported here so the pure helpers above stay importable on machines
    # without the database driver.
    import psycopg2
    import psycopg2.extras

    if PG.get('password') is None:
        raise SystemExit('DB_PASSWORD environment variable is not set')

    by_kind = {}
    per_type = {}
    # closing() guarantees close() even on error; psycopg2's own
    # "with connection" only ends the transaction.
    with closing(psycopg2.connect(**PG)) as conn, \
            closing(conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)) as cur:
        for kind in DEFS:
            rows = fetch_rows(cur, kind)
            summary = stats(rows)
            by_kind[kind] = rows
            per_type[kind] = summary
            # stats() returns {} for an empty table, hence the defaults.
            print(f'{kind:14s} n={len(rows):5d} mean={summary.get("mean", 0.0):.1f}% complete={summary.get("complete", 0)}')

    all_rows = [row for rows in by_kind.values() for row in rows]

    # Top 50 worst — exclude rows with empty naziv (those are flagged separately)
    valid = [r for r in all_rows if (r['naziv'] or '').strip()]
    # Sort by coverage ASC, then by total DESC
    worst = sorted(valid, key=lambda r: (r['filled'] / r['total'], -r['total']))[:50]

    out = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'totals': {k: len(v) for k, v in by_kind.items()},
        'total_entities': len(all_rows),
        'per_type_stats': per_type,
        'top50_review': worst,
    }

    print(f'\nTotal entities: {len(all_rows)}')
    print('Top 50 worst — sample:')
    for r in worst[:5]:
        pct = r['filled'] / r['total'] * 100
        # !s:>7 right-aligns like :7d but also accepts non-int ids.
        print(f" {r['kind']:14s} id={r['id']!s:>7} {r['naziv'][:50]:50s} {r['filled']}/{r['total']} ({pct:.0f}%)")

    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
    with open(out_path, 'w', encoding='utf-8') as fh:
        json.dump(out, fh, ensure_ascii=False, default=str)


if __name__ == '__main__':
    main()