#!/usr/bin/env python3 """ cleanup_garbage_clubs.py — fix klubovi where naziv is an address Author: Damir Radulić (dradulic@outlook.com / damir@rinet.one) Date: 2026-05-05 Symptoms (R3B/R4 cleanup pass): - 14 odbojkaški klubovi imaju adresu u polju `naziv` - others: null/empty naziv, naziv equal to grad, naziv only digits - sportaši with email/phone in ime/prezime Strategy: 1) For each problem klub, look up civic.entities by address fragment. 2) If exactly one candidate → swap (naziv ← candidate.name, adresa ← old naziv, oib ← candidate.oib if missing) with confidence 0.95. 3) If multiple candidates → mark metadata.manual_review=true with candidates list. 4) If zero candidates → broader fallback (city + sport=odbojka) and same logic. Backup: pgz_sport.klubovi_backup_20260505 must already exist (run from SQL). Reports written to /opt/pgz-sport/data_cleanup_report.md (separate driver). """ from __future__ import annotations import os, json, sys from datetime import datetime, timezone import psycopg2, psycopg2.extras PG = dict(host=os.environ.get('PG_HOST','10.10.0.2'), port=int(os.environ.get('PG_PORT','6432')), dbname=os.environ.get('PG_DB','rinet_v3'), user=os.environ.get('PG_USER','rinet'), password=os.environ.get('PG_PASS','')) PROBLEM_IDS = [2613, 2616, 2618, 2619, 2622, 2624, 2626, 2630, 2632, 2634, 2636, 2638, 2641, 2643] # Hand-curated picks where DB has multiple candidates at same address. # Source: cross-reference with HOS (Hrvatski odbojkaški savez) member roster. MANUAL_PICKS = { # 2613 = Trg Viktora Bubnja 1 — savez address; primary host is HAOK Rijeka. 2613: 100700, # Hrvatski Akademski Odbojkaški Klub "Rijeka" # 2618 = Zdravka Kučića 1 — both ŽOK and MOK Gornja Vežica share. The municipal # club entry is MOK Gornja Vežica which is the registered active senior team. 2618: 82677, # Muški Odbojkaški Klub "Gornja Vežica" } # Hand-curated for zero-match cases (verified via HOS public list) ZERO_MATCH_HINTS = { 2619: {'name': 'Odbojkaški Klub Čavle', 'note':'Vrh Čavje 31, Čavle'}, 2630: {'name': 'Odbojkaški Klub Opatija', 'note':'1. Istarske čete 3, Opatija'}, 2636: {'name': 'Odbojkaški Klub Rijeka', 'note':'Sv. Križ 24, Rijeka — possibly OK Rijeka senior'}, 2641: {'name': 'Odbojkaški Klub Crikvenica','note':'Kotorska 15a, Crikvenica'}, } def db(): c = psycopg2.connect(**PG); c.autocommit = False; return c def fetch_candidates(cur, addr_fragment, sport='odbojka'): cur.execute(""" SELECT id, name, oib, address, city, entity_type FROM civic.entities WHERE address ILIKE %s AND (name ILIKE '%%odbojk%%' OR name ILIKE 'OK %%' OR name ILIKE 'ŽOK%%' OR name ILIKE 'MOK %%' OR name ILIKE '%%volley%%') ORDER BY length(name) LIMIT 5 """, ('%'+addr_fragment+'%',)) return [dict(r) for r in cur.fetchall()] def update_klub(cur, kid, new_naziv, old_naziv_as_address, oib, manual_review=False, candidates=None, source=None): """Move old naziv → adresa, set new naziv, optionally set oib + metadata.""" md = { 'cleanup_at': datetime.now(timezone.utc).isoformat(), 'cleanup_reason': 'naziv_is_address', 'cleanup_source': source or 'civic.entities', } if manual_review: md['manual_review'] = True if candidates: md['candidates'] = candidates set_parts = ["naziv=%s", "adresa=%s", "metadata = COALESCE(metadata,'{}'::jsonb) || %s::jsonb"] params = [new_naziv, old_naziv_as_address, json.dumps(md, ensure_ascii=False)] if oib: set_parts.append("oib=COALESCE(NULLIF(oib,''), %s)") params.append(oib) params.append(kid) cur.execute(f"UPDATE pgz_sport.klubovi SET {', '.join(set_parts)} WHERE id=%s", params) def address_fragment(addr): """Extract the most distinctive piece of an address for ILIKE matching. e.g. 'Trg Viktora Bubnja 1, 51000 Rijeka' → 'Trg Viktora Bubnja 1' """ return (addr or '').split(',')[0].strip() def run(): conn = db() cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) report = { 'started_at': datetime.now(timezone.utc).isoformat(), 'problem_ids': PROBLEM_IDS, 'fixed': [], 'manual_review': [], 'failed': [], } cur.execute(""" SELECT id, naziv, adresa, grad, sport, oib FROM pgz_sport.klubovi WHERE id = ANY(%s) ORDER BY id """, (PROBLEM_IDS,)) problems = [dict(r) for r in cur.fetchall()] for p in problems: kid = p['id'] addr = p['naziv'] # the bad value frag = address_fragment(addr) cands = fetch_candidates(cur, frag) chosen = None confidence = 0.0 path = '' if len(cands) == 1: chosen = cands[0] confidence = 0.95 path = 'single_match' elif len(cands) > 1 and kid in MANUAL_PICKS: for c in cands: if c['id'] == MANUAL_PICKS[kid]: chosen = c confidence = 0.90 path = 'curated_pick' break elif len(cands) > 1: # Mark manual review with all candidates update_klub(cur, kid, new_naziv=f'[MANUAL REVIEW] {addr}', old_naziv_as_address=addr, oib=None, manual_review=True, candidates=[{'id':c['id'],'name':c['name'],'oib':c['oib']} for c in cands], source='multi_candidate') report['manual_review'].append({ 'klub_id': kid, 'address': addr, 'candidates': [{'id':c['id'],'name':c['name'],'oib':c['oib']} for c in cands], 'reason': f'{len(cands)} candidates at same address — operator must pick', }) continue elif kid in ZERO_MATCH_HINTS: # Use hint name and mark it for verification update_klub(cur, kid, new_naziv=f"[VERIFY] {ZERO_MATCH_HINTS[kid]['name']}", old_naziv_as_address=addr, oib=None, manual_review=True, candidates=None, source='heuristic_hint') report['manual_review'].append({ 'klub_id': kid, 'address': addr, 'suggested_name': ZERO_MATCH_HINTS[kid]['name'], 'note': ZERO_MATCH_HINTS[kid]['note'], 'reason': 'no civic.entities match — heuristic suggestion needs verification', }) continue else: update_klub(cur, kid, new_naziv=f'[UNRESOLVED] {addr}', old_naziv_as_address=addr, oib=None, manual_review=True, candidates=None, source='no_match') report['failed'].append({ 'klub_id': kid, 'address': addr, 'reason': 'no candidates found anywhere', }) continue if chosen: update_klub(cur, kid, new_naziv=chosen['name'], old_naziv_as_address=addr, oib=chosen.get('oib'), manual_review=False, source=f"civic.entities#{chosen['id']}") report['fixed'].append({ 'klub_id': kid, 'old_naziv': addr, 'new_naziv': chosen['name'], 'oib_set': chosen.get('oib'), 'civic_entity_id': chosen['id'], 'confidence': confidence, 'path': path, }) conn.commit() cur.close(); conn.close() report['completed_at'] = datetime.now(timezone.utc).isoformat() report['summary'] = { 'total': len(problems), 'fixed': len(report['fixed']), 'manual_review': len(report['manual_review']), 'failed': len(report['failed']), } print(json.dumps(report, indent=2, ensure_ascii=False)) if __name__ == '__main__': run()