feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
+197
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
coverage_report.py — Per-entity coverage scoring across pgz_sport schema
|
||||
|
||||
Collects the data for /opt/pgz-sport/data_quality_report.md (this script itself writes /tmp/coverage_data.json):
|
||||
- per-type aggregate (n, mean coverage, median, # zero-coverage, # complete)
|
||||
- distribution histogram
|
||||
- top 50 entities most needing manual review (lowest coverage AND non-empty name)
|
||||
- link to detail panel for each (so audit.html-style triage is one click away)
|
||||
"""
|
||||
import os, json
|
||||
from collections import Counter
|
||||
from datetime import datetime, timezone
|
||||
import psycopg2, psycopg2.extras
|
||||
|
||||
# Keyword arguments passed to psycopg2.connect(). The password is taken from
# the DB_PASSWORD environment variable; a missing variable raises KeyError
# as soon as this module is loaded.
PG = {
    'host': '10.10.0.2',
    'port': 6432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': os.environ["DB_PASSWORD"],
}
|
||||
|
||||
def _panel_link(nav):
    """Return a callable mapping an entity id to its detail-panel URL."""
    return lambda i: f'/?nav={nav}&open={i}'


# Coverage definition per entity type. Each entry names the backing table,
# the expression used as display name, the fields that count toward coverage
# (their number is the coverage denominator), and a detail-panel link builder.
DEFS = dict(
    savez=dict(
        table='pgz_sport.savezi',
        name_col='naziv',
        fields=('naziv sport predsjednik tajnik email telefon web oib '
                'adresa godina_osnutka').split(),
        panel_path=_panel_link('savezi'),
    ),
    klub=dict(
        table='pgz_sport.klubovi',
        name_col='naziv',
        # Two entries are pseudo-fields: web_or_stranica counts when either
        # web or web_stranica is present, sjediste_or_adresa when either
        # sjediste or adresa is present.
        fields=('naziv sport grad oib predsjednik tajnik email telefon '
                'web_or_stranica sjediste_or_adresa ciljevi '
                'opis_djelatnosti').split(),
        panel_path=_panel_link('klubovi'),
    ),
    sportas=dict(
        table='pgz_sport.clanovi',
        name_col="ime||' '||prezime",
        fields=('ime prezime sport klub_id datum_rodenja slika_url oib '
                'profile_url biografija hns_igrac_id').split(),
        panel_path=_panel_link('sportasi'),
    ),
    objekt=dict(
        table='pgz_sport.sportski_objekti',
        name_col='naziv',
        fields=('naziv tip grad adresa lat lng upravitelj kapacitet '
                'sportovi izgradeno').split(),
        panel_path=_panel_link('objekti'),
    ),
    manifestacija=dict(
        table='pgz_sport.manifestacije',
        name_col='naziv',
        fields=('naziv mjesto organizator razina broj_ucesnika godina_od '
                'source_url').split(),
        panel_path=_panel_link('manifestacije'),
    ),
)
|
||||
|
||||
# SQL expression selected as ``naziv`` for each kind; kinds not listed here
# use the plain ``naziv`` column.
_NAME_EXPR = {
    'sportas': "(COALESCE(ime,'')||' '||COALESCE(prezime,''))",
}

# Presence predicates for fields that cannot use the default text check
# (``col IS NOT NULL AND col<>''``): columns compared only against NULL,
# the ``sportovi`` array, and the two "either column" pseudo-fields of 'klub'.
_PRESENCE_OVERRIDES = {
    'klub': {
        # NULLIF turns '' into NULL first, so an empty first column no longer
        # hides a filled second column (plain COALESCE(web, web_stranica)
        # returns '' in that case and the field was scored as missing).
        'web_or_stranica':
            "COALESCE(NULLIF(web,''), NULLIF(web_stranica,'')) IS NOT NULL",
        'sjediste_or_adresa':
            "COALESCE(NULLIF(sjediste,''), NULLIF(adresa,'')) IS NOT NULL",
    },
    'sportas': {
        'klub_id': 'klub_id IS NOT NULL',
        'datum_rodenja': 'datum_rodenja IS NOT NULL',
    },
    'objekt': {
        'lat': 'lat IS NOT NULL',
        'lng': 'lng IS NOT NULL',
        'kapacitet': 'kapacitet IS NOT NULL',
        'sportovi': 'sportovi IS NOT NULL AND array_length(sportovi,1)>0',
        'izgradeno': 'izgradeno IS NOT NULL',
    },
    'manifestacija': {
        'broj_ucesnika':
            "broj_ucesnika IS NOT NULL AND broj_ucesnika::text<>''",
        'godina_od': 'godina_od IS NOT NULL',
    },
    'savez': {
        'godina_osnutka': 'godina_osnutka IS NOT NULL',
    },
}


def _presence_sql(kind, field):
    """Return the SQL boolean expression that is true when *field* is filled."""
    override = _PRESENCE_OVERRIDES.get(kind, {}).get(field)
    if override is not None:
        return override
    return f"{field} IS NOT NULL AND {field}<>''"


def fetch_rows(cur, kind: str):
    """Fetch every entity of *kind* together with its filled-field count.

    The query is generated from ``DEFS[kind]['fields']``: each field becomes
    one ``CASE WHEN <present> THEN 1 ELSE 0 END`` term and the terms are
    summed into ``filled``. Because the same list supplies ``total``, the
    numerator and denominator cannot drift apart.

    Args:
        cur: an open database cursor whose rows support access by column
            name (``r['id']``, ``r['naziv']``, ``r['filled']``).
        kind: a key of ``DEFS``.

    Returns:
        A list of dicts with keys ``kind``, ``id``, ``naziv`` (``''`` when
        the selected name is NULL or empty), ``filled`` and ``total``.

    Raises:
        KeyError: if *kind* is not a key of ``DEFS``.
    """
    spec = DEFS[kind]
    fields = spec['fields']
    name_expr = _NAME_EXPR.get(kind, 'naziv')
    filled_expr = ' +\n           '.join(
        f"CASE WHEN {_presence_sql(kind, field)} THEN 1 ELSE 0 END"
        for field in fields
    )
    # Only module-level constants are interpolated here (table name, column
    # names, predicates); no caller-supplied text reaches the SQL string.
    sql = f"""
        SELECT id, {name_expr} AS naziv,
          ({filled_expr}
          ) AS filled
        FROM {spec['table']}
    """
    cur.execute(sql)
    total = len(fields)
    return [
        {
            'kind': kind,
            'id': r['id'],
            'naziv': r['naziv'] or '',
            'filled': int(r['filled']),
            'total': total,
        }
        for r in cur.fetchall()
    ]
|
||||
|
||||
|
||||
def stats(rows):
    """Summarise coverage percentages for a list of entity rows.

    Args:
        rows: dicts carrying an integer ``filled`` and a non-zero ``total``.

    Returns:
        ``{}`` when *rows* is empty; otherwise a dict with
        ``n`` (row count), ``mean`` and ``median`` coverage in percent
        (rounded to one decimal), ``zero`` (rows at exactly 0 %),
        ``complete`` (rows at 99 % or more) and ``distribution`` (a dict
        mapping the lower edge of each 10-point bin to its row count, with
        100 % folded into the 90 bin).
    """
    if not rows:
        return {}
    # Multiply before dividing so that exact percentages (e.g. 7/10 -> 70.0)
    # are not nudged across a bin edge by float rounding.
    pcts = sorted(r['filled'] * 100 / r['total'] for r in rows)
    n = len(pcts)
    mean = sum(pcts) / n
    mid = n // 2
    # True median: for an even count average the two middle values instead
    # of reporting the upper one.
    median = pcts[mid] if n % 2 else (pcts[mid - 1] + pcts[mid]) / 2
    zero = sum(1 for p in pcts if p == 0)
    complete = sum(1 for p in pcts if p >= 99.0)
    bins = Counter()
    for p in pcts:
        b = int(p // 10) * 10
        if b == 100:
            b = 90  # keep 100 % inside the top bin
        bins[b] += 1
    return {
        'n': n,
        'mean': round(mean, 1),
        'median': round(median, 1),
        'zero': zero,
        'complete': complete,
        'distribution': dict(sorted(bins.items())),
    }
|
||||
|
||||
|
||||
def main():
    """Score every entity type and dump the results to /tmp/coverage_data.json.

    For each kind in ``DEFS`` the rows are fetched, a one-line summary is
    printed, and the per-type statistics are collected. The 50 named
    entities with the lowest coverage are selected for review, a five-row
    sample of them is printed, and everything is written as JSON.
    """
    conn = psycopg2.connect(**PG)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            by_kind = {kind: fetch_rows(cur, kind) for kind in DEFS}
        finally:
            cur.close()
    finally:
        # Release the connection even when a query fails.
        conn.close()

    all_rows = []
    per_type_stats = {}
    for kind, rows in by_kind.items():
        all_rows.extend(rows)
        summary = stats(rows)  # computed once; {} when the table is empty
        per_type_stats[kind] = summary
        print(f'{kind:14s} n={len(rows):5d} mean={summary.get("mean", 0.0):.1f}% complete={summary.get("complete", 0)}')

    # Top 50 worst — exclude rows with empty naziv (those are flagged separately)
    valid = [r for r in all_rows if (r['naziv'] or '').strip()]
    # Sort by coverage ASC, then by total DESC
    worst = sorted(valid, key=lambda r: (r['filled'] / r['total'], -r['total']))[:50]
    out = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'totals': {k: len(v) for k, v in by_kind.items()},
        'total_entities': len(all_rows),
        'per_type_stats': per_type_stats,
        'top50_review': worst,
    }
    print(f'\nTotal entities: {len(all_rows)}')
    print('Top 50 worst — sample:')
    for r in worst[:5]:
        pct = r['filled'] / r['total'] * 100
        print(f"  {r['kind']:14s} id={r['id']:7d} {r['naziv'][:50]:50s} {r['filled']}/{r['total']} ({pct:.0f}%)")

    # ensure_ascii=False emits non-ASCII names verbatim, so pin the encoding
    # instead of relying on the locale default; the with-block closes the file.
    with open('/tmp/coverage_data.json', 'w', encoding='utf-8') as fh:
        json.dump(out, fh, ensure_ascii=False, default=str)
|
||||
|
||||
|
||||
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user