#!/usr/bin/env python3
# sub5_klubovi runner — W5 PGZ Sport data quality
# author: dradulic@outlook.com / damir@rinet.one
# date: 2026-05-05
# purpose: 5a flag clubs whose name (naziv) is really an address/URL/e-mail/heading,
#          5b reclassify hunting societies (lovačka društva) to sport='lovstvo',
#          5c RSS/ZSPGZ membership cross-check (best-effort)
import os
import json
import re
import datetime as dt
import sys
import psycopg2
import psycopg2.extras

# SECURITY: the password literal is kept only as a backward-compatible
# fallback; set PGZ_SPORT_DB_PASSWORD in the environment and rotate/remove the
# hard-coded value.
PG = dict(
    host='10.10.0.2',
    port=6432,
    dbname='rinet_v3',
    user='rinet',
    password=os.environ.get('PGZ_SPORT_DB_PASSWORD', 'R1net2026!SecureDB#v7'),
)
OUT_DIR = '/opt/pgz-sport/_audit/sub5_klubovi'
os.makedirs(OUT_DIR, exist_ok=True)
NOW = dt.date.today().isoformat()  # ISO date of this run; embedded in napomena notes

# Heuristics for inferring naziv from sport+sjediste
# (keys are lower-cased sport names, with and without diacritics).
SPORT_PREFIX = {
    'odbojka': 'OK',
    'nogomet': 'NK',
    'rukomet': 'RK',
    'košarka': 'KK',
    'kosarka': 'KK',
    'boćanje': 'BK',
    'bocanje': 'BK',
    'tenis': 'TK',
    'plivanje': 'PK',
    'atletika': 'AK',
    'streljaštvo': 'SK',
    'streljastvo': 'SK',
    'jedrenje': 'JK',
    'vaterpolo': 'VK',
    'kuglanje': 'KGK',
    'šah': 'ŠK',
    'sah': 'ŠK',
}

# Patterns used by task_5a, compiled once instead of on every row.
_URL_START_RE = re.compile(r'^(?:www\.|https?://)', re.I)
_EMAIL_RE = re.compile(r'@.*\.')
_POSTCODE_AFTER_COMMA_RE = re.compile(r',\s*\d{2}\s*\d{3}')
_FIVE_DIGITS_RE = re.compile(r'\d{5}')
_HEADING_RE = re.compile(
    r'^(propozicije|ždrijeb|zdrijeb|satnica|video|raspored|seminar)', re.I)
_ADDR_GRAD_RE = re.compile(r',\s*\d{2}\s*\d{3}\s*([\w\s\-šđč枊ĐČĆŽ]+?)\s*$')
# Anchored so the scheme and an optional "www." are skipped; an unanchored
# search for "www.|//" would capture "www" from "https://www.example.hr".
_URL_DOMAIN_RE = re.compile(r'^(?:https?://(?:www\.)?|www\.)([a-z0-9\-]+)', re.I)

# Patterns used by task_5b.
_LOVACKO_RE = re.compile(r'^\s*"?\s*(hrvatsko\s+)?lovačko\s+društvo', re.I)
_KUD_MARKER_RE = re.compile(
    r'\b(kud|kulturno-umjetn|folklor|tamburaš|tamburaski)', re.I)


def conn():
    """Open a new psycopg2 connection using the module-level PG settings."""
    return psycopg2.connect(**PG)


def _classify_naziv(naziv):
    """Return 'url', 'email', 'address', 'heading/event' or 'unknown' for a bogus naziv."""
    if _URL_START_RE.match(naziv):
        return 'url'
    # An e-mail is only assumed when the value contains no inner whitespace.
    if _EMAIL_RE.search(naziv) and ' ' not in naziv.strip():
        return 'email'
    if _POSTCODE_AFTER_COMMA_RE.search(naziv) or _FIVE_DIGITS_RE.search(naziv):
        return 'address'
    if _HEADING_RE.match(naziv):
        return 'heading/event'
    return 'unknown'


def _suggest_naziv(kind, naziv, sport):
    """Return (suggestion, confidence) for a bogus naziv; (None, 0.0) when nothing can be inferred."""
    prefix = SPORT_PREFIX.get((sport or '').lower())
    if not prefix:
        return None, 0.0
    if kind == 'address':
        # Try to extract the town from the tail of an address
        # (e.g. "..., 51 000 Rijeka").
        m = _ADDR_GRAD_RE.search(naziv)
        addr_grad = m.group(1).strip() if m else None
        if addr_grad:
            # below threshold of 0.9 — DO NOT auto-rename
            return f'{prefix} [VERIFY-{addr_grad.upper()}]', 0.5
        return None, 0.0
    if kind == 'url':
        # URL → the first host label may hint at the club name.
        dom_m = _URL_DOMAIN_RE.match(naziv)
        dom = dom_m.group(1) if dom_m else ''
        return f'{prefix} [VERIFY-from-URL-{dom}]', 0.4
    return None, 0.0


def task_5a(cur):
    """Flag clubs whose naziv is really an address/URL/e-mail/heading.

    Every matching row gets a TODO note appended to ``napomena`` and is set to
    ``aktivan = false``. ``naziv`` itself is never changed: the inferred
    suggestions only reach confidence 0.4-0.5 and are recorded in the returned
    audit entries, not written to the database.

    NOTE(review): the SELECT does not exclude rows flagged by an earlier run,
    so running this twice appends a second note to the same rows.

    Args:
        cur: open DB-API cursor; the caller owns the transaction.

    Returns:
        list of dicts, one per flagged row, with keys id, original_naziv,
        kind, suggestion, confidence, sport, sjediste, savez_id, action.
    """
    cur.execute("""
        SELECT id, naziv, sjediste, savez_id, sport, napomena, grad
        FROM pgz_sport.klubovi
        WHERE naziv ~* '\\d{5}'
           OR naziv ~* '^www\\.'
           OR naziv ~* '^https?://'
           OR naziv ~ '@.*\\.'
           OR naziv ~* '^(propozicije|ždrijeb|zdrijeb|satnica|video[ ]+seminar|raspored)'
           OR naziv ~ ',\\s*\\d{2}\\s*\\d{3}'
        ORDER BY id
    """)
    rows = cur.fetchall()
    actions = []
    for rid, naziv, sjediste, savez_id, sport, napomena, _grad in rows:
        original = naziv
        kind = _classify_naziv(naziv)
        suggestion, confidence = _suggest_naziv(kind, naziv, sport)

        # Append to any existing note instead of overwriting it.
        new_napomena_chunk = f'sub5a_{NOW}: TODO_FIX_NAME — naziv looks like {kind}; original="{original}"'
        if napomena:
            new_napomena = napomena.rstrip() + ' | ' + new_napomena_chunk
        else:
            new_napomena = new_napomena_chunk

        # Apply update — DO NOT change naziv (confidence < 0.9 always for these)
        cur.execute("""
            UPDATE pgz_sport.klubovi
            SET napomena = %s, updated_at = now(), aktivan = false
            WHERE id = %s
        """, (new_napomena, rid))
        actions.append(dict(
            id=rid,
            original_naziv=original,
            kind=kind,
            suggestion=suggestion,
            confidence=confidence,
            sport=sport,
            sjediste=sjediste,
            savez_id=savez_id,
            action='flagged_in_napomena+aktivan=false (no rename, conf<0.9)',
        ))
    return actions


def task_5b(cur):
    """Review rows with sport='kulturno-umjetnicko' and move hunting societies to 'lovstvo'.

    A row is reclassified when its naziv mentions "lovačko" (hunting society)
    and carries no KUD/folklore marker; all other rows are left unchanged but
    still reported.

    Args:
        cur: open DB-API cursor; the caller owns the transaction.

    Returns:
        list of dicts, one per reviewed row, with keys id, naziv,
        sport_before, sport_after, reason.
    """
    cur.execute("""
        SELECT id, naziv, sport, sjediste, savez_id, napomena
        FROM pgz_sport.klubovi
        WHERE sport = 'kulturno-umjetnicko'
        ORDER BY id
    """)
    rows = cur.fetchall()
    actions = []
    for rid, naziv, _sport, _sjediste, _savez_id, napomena in rows:
        # A NULL naziv cannot be matched; treat it as empty so the row falls
        # into the "left unchanged" branch instead of raising.
        name = naziv or ''
        is_lovacko = bool(_LOVACKO_RE.match(name)) or 'LOVAČKO' in name.upper()
        is_kud_marker = bool(_KUD_MARKER_RE.search(name))
        if is_lovacko and not is_kud_marker:
            new_sport = 'lovstvo'
            reason = 'naziv počinje sa "Lovačko društvo" — nije KUD, kategorija lovstvo'
            chunk = f'sub5b_{NOW}: bio sport=kulturno-umjetnicko, vraćen na lovstvo (LD prefix detected)'
            new_napomena = (napomena.rstrip() + ' | ' + chunk) if napomena else chunk
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET sport = %s, napomena = %s, updated_at = now()
                WHERE id = %s
            """, (new_sport, new_napomena, rid))
            actions.append(dict(
                id=rid,
                naziv=naziv,
                sport_before='kulturno-umjetnicko',
                sport_after=new_sport,
                reason=reason,
            ))
        else:
            # Genuinely a KUD (or not recognisable as a hunting society).
            actions.append(dict(
                id=rid,
                naziv=naziv,
                sport_before='kulturno-umjetnicko',
                sport_after='kulturno-umjetnicko',
                reason='ostavljen — naziv ne ukazuje na sportsku/lovačku klasifikaciju',
            ))
    return actions


def task_5c(cur):
    """Cross-check membership lists from sport-pgz.hr.

    Findings: sport-pgz.hr publishes only savezi membership of ZSPGZ, NOT
    individual clubs. Individual clubs only appear in NSPGZ glasnik (PDF) and
    per-savez websites (most non-existent or paywalled). 5c is therefore
    PARTIAL-BLOCKED.

    The source entries below are recorded findings, not live probes: this
    function performs no network access and only runs one COUNT query.

    Args:
        cur: open DB-API cursor.

    Returns:
        dict with keys ``sources`` (list of per-source finding dicts),
        ``no_savez_active_klubovi`` (int) and ``flagged`` (always empty).
    """
    sources = []
    # zspgz savez slugs we found
    zspgz_savez_slugs = [
        'atletski-savez-pgz',
        'bocarski-savez-pgz',
        'boksacki-savez-pgz',
        'jedrilicarski-savez-pgz',
        'judo-savez-pgz',
        'karate-savez-pgz',
        'kickboxing-savez-pgz',
        'kosarkaski-savez-pgz',
        'kuglacki-savez-pgz',
        'nogometni-savez-pgz',
        'odbojkaski-savez-pgz',
        'pikado-savez-pgz',
        'plivacki-savez-pgz',
        'rukometni-savez-pgz',
        'savez-za-sportski-ribolov-na-moru-pgz',
        'sanjkaski-savez-pgz',
        'skijaski-savez-pgz',
        'stolnoteniski-savez-pgz',
        'strelicarski-savez-pgz',
        'udruga-streljackih-klubova-pgz',
        'sahovski-savez-pgz',
        'sportsko-ribolovni-savez-pgz',
        'taekwondo-savez-pgz',
        'teniski-savez-pgz',
        'triatlon-savez-pgz',
        'vaterpolo-savez-pgz',
        'savez-skolskih-sportskih-drustava-pgz',
        'savez-sportova-osoba-s-invaliditetom-pgz',
        'savez-sportske-rekreacije-sport-za-sve-pgz',
        'rijecki-sportski-savez',
        'rijecki-sportski-sveucilisni-savez',
    ]
    sources.append(dict(
        url='https://sport-pgz.hr/clanice-zajednice',
        status='200 OK',
        type='ZSPGZ savezi members (NOT individual clubs)',
        n_found=len(zspgz_savez_slugs),
        n_flagged=0,
        note=('ZSPGZ portal lists only SAVEZE pages, not individual klubove. '
              'Individual clubs only available via NSPGZ glasnik PDFs / per-savez sites '
              '(most non-existent or paywalled). Cross-check protiv klubova nije moguć '
              'autonomno bez parsiranja PDF-ova.'),
    ))
    sources.append(dict(
        url='https://rss-rijeka.hr/clanovi',
        status='no DNS / unreachable',
        type='RSS Rijeka member-clubs',
        n_found=0,
        n_flagged=0,
        note='Domain not resolvable. RSS Rijeka info-page exists on sport-pgz.hr/rijecki-sportski-savez but lists only PGZ-savezi (Atletski, Boćarski, ...), not individual clubs.',
    ))
    sources.append(dict(
        url='https://www.zssr-pgz.hr',
        status='no DNS / unreachable',
        type='ŽSSR PGŽ membership',
        n_found=0,
        n_flagged=0,
        note='Domain unreachable. Use info-page on sport-pgz.hr.',
    ))
    sources.append(dict(
        url='https://www.nspgz.hr',
        status='200 OK',
        type='Nogometni savez PGŽ',
        n_found=0,
        n_flagged=0,
        note='Has /komisija/registracije-klubovi-igraci, but no machine-readable list. Glasniks su PDF; potreban OCR + parsing.',
    ))

    # Identify klubovi that have empty savez_id and might need flagging — this
    # is structural evidence rather than membership-derived.
    # No parameters are passed to execute(), so the literal % wildcards below
    # are sent to the server unchanged.
    cur.execute("""
        SELECT COUNT(*)
        FROM pgz_sport.klubovi
        WHERE savez_id IS NULL
          AND aktivan = true
          AND naziv NOT ILIKE '%[VERIFY]%'
          AND naziv NOT ILIKE '%[MERGED%'
          AND naziv NOT ILIKE '%[UNRESOLVED]%'
    """)
    no_savez_count = cur.fetchone()[0]
    return dict(sources=sources, no_savez_active_klubovi=no_savez_count, flagged=[])


def main():
    """Run 5a, 5b and 5c in one transaction and write the JSON audit report.

    All three tasks share a single transaction: it is committed only when
    every task succeeds, and rolled back (with the exception re-raised) if any
    of them fails. The connection is always closed.

    Returns:
        dict that was serialised to ``OUT_DIR/sub5_run.json``.
    """
    c = conn()
    try:
        c.autocommit = False
        with c.cursor() as cur:
            print('=== sub5a — adresa-as-naziv flagging ===')
            a5a = task_5a(cur)
            print(f'5a: {len(a5a)} klubova flagged')

            print('=== sub5b — KUD verify / lovačka reclassification ===')
            a5b = task_5b(cur)
            corrected = sum(1 for a in a5b if a['sport_after'] != a['sport_before'])
            print(f'5b: {len(a5b)} reviewed, {corrected} reclassified to lovstvo')

            print('=== sub5c — membership cross-check ===')
            a5c = task_5c(cur)
            print(f'5c: {len(a5c["sources"])} sources probed')
        c.commit()
    except Exception:
        # Never leave a half-applied data fix behind.
        c.rollback()
        raise
    finally:
        c.close()

    out = dict(
        ts=dt.datetime.now().isoformat(),
        sub5a=a5a,
        sub5b=a5b,
        sub5c=a5c,
        summary=dict(
            sub5a_flagged=len(a5a),
            sub5b_reclassified=corrected,
            sub5b_total_reviewed=len(a5b),
            sub5c_blocked_sources=sum(1 for s in a5c['sources'] if s['n_found'] == 0),
        ),
    )
    # ensure_ascii=False emits Croatian characters verbatim, so the encoding
    # must be explicit rather than taken from the locale.
    with open(os.path.join(OUT_DIR, 'sub5_run.json'), 'w', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f'Saved → {OUT_DIR}/sub5_run.json')
    return out


if __name__ == '__main__':
    main()