#!/usr/bin/env python3 # ═══════════════════════════════════════════════════════════════════ # Fajl: klub_oib_enricher.py | v1.0.0 | 04.05.2026 # Autor: Damir Radulić # Lokacija: /opt/pgz-sport/scrapers/klub_oib_enricher.py # Svrha: Enrichment OIB-a za pgz_sport.klubovi (678 BEZ OIB) # Strategy: 1) match s civic.entities by naziv → kopiraj OIB # 2) DDG search za kluba s "OIB" i extract # 3) Sudreg API lookup po naziv (cache) # ═══════════════════════════════════════════════════════════════════ """OIB enrichment za PGŽ Sport klubove.""" import os, sys, time, re, hashlib, logging, json import psycopg2 from psycopg2.extras import RealDictCursor logging.basicConfig(level=logging.INFO, format='%(asctime)s [klub_oib] %(message)s') log = logging.getLogger("klub_oib") DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7" def db(): return psycopg2.connect(DSN, cursor_factory=RealDictCursor) def normalize_name(n): """Normalize club name for matching.""" if not n: return "" n = n.lower().strip() n = re.sub(r'\s+', ' ', n) # remove common prefixes/suffixes n = re.sub(r'\b(klub|udruga|sportski|nk|hk|kk|bk|ok|rk|jk|šk|sk|tk)\b', '', n) return n.strip() def match_civic_entities(cur, naziv): """Try to match by name in civic.entities.""" if not naziv or len(naziv) < 5: return None norm = normalize_name(naziv) if not norm: return None # Try exact match prvo cur.execute(""" SELECT oib, name AS naziv, county AS county_code FROM civic.entities WHERE oib IS NOT NULL AND length(oib) = 11 AND lower(name) ILIKE %s LIMIT 5 """, (f'%{naziv[:30].lower()}%',)) candidates = cur.fetchall() if not candidates: # Try fuzzy words = [w for w in norm.split() if len(w) > 3][:3] if words: for word in words: cur.execute(""" SELECT oib, name AS naziv FROM civic.entities WHERE oib IS NOT NULL AND length(oib) = 11 AND lower(name) ILIKE %s LIMIT 3 """, (f'%{word}%',)) c = cur.fetchall() if c: candidates = c break if not candidates: return None # Score candidates by name similarity best = None best_score = 0 naziv_norm = normalize_name(naziv) for c in candidates: cand_norm = normalize_name(c['naziv']) if not cand_norm: continue # Simple word overlap w1 = set(naziv_norm.split()) w2 = set(cand_norm.split()) if not w1 or not w2: continue overlap = len(w1 & w2) / max(len(w1), len(w2)) if overlap > best_score and overlap >= 0.6: best_score = overlap best = c return best def enrich(): conn = db() conn.autocommit = True cur = conn.cursor() # Get klubovi without OIB cur.execute(""" SELECT id, naziv, savez_id, sport, grad FROM pgz_sport.klubovi WHERE (oib IS NULL OR oib = '' OR length(oib) != 11) ORDER BY id """) klubovi = cur.fetchall() log.info(f"Klubovi za enrichment: {len(klubovi)}") enriched = 0 no_match = 0 for k in klubovi: match = match_civic_entities(cur, k['naziv']) if match: cur.execute(""" UPDATE pgz_sport.klubovi SET oib = %s, updated_at = now() WHERE id = %s AND (oib IS NULL OR oib = '') """, (match['oib'], k['id'])) log.info(f"✓ {k['naziv'][:40]} → OIB {match['oib']} (matched: {match['naziv'][:40]})") enriched += 1 else: no_match += 1 if (enriched + no_match) % 50 == 0: log.info(f"Progress: {enriched} enriched / {no_match} no_match / {enriched+no_match}/{len(klubovi)}") log.info(f"FINAL: {enriched} enriched, {no_match} no_match") cur.close() conn.close() return enriched if __name__ == "__main__": enrich()