126 lines
4.3 KiB
Python
126 lines
4.3 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
# Fajl: klub_oib_enricher.py | v1.0.0 | 04.05.2026
|
|
# Autor: Damir Radulić <dradulic@outlook.com>
|
|
# Lokacija: /opt/pgz-sport/scrapers/klub_oib_enricher.py
|
|
# Svrha: Enrichment OIB-a za pgz_sport.klubovi (678 BEZ OIB)
|
|
# Strategy: 1) match s civic.entities by naziv → kopiraj OIB
|
|
# 2) DDG search za kluba s "OIB" i extract
|
|
# 3) Sudreg API lookup po naziv (cache)
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
"""OIB enrichment za PGŽ Sport klubove."""
|
|
import os, sys, time, re, hashlib, logging, json
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s [klub_oib] %(message)s')
|
|
log = logging.getLogger("klub_oib")
|
|
|
|
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
|
|
|
def db():
|
|
return psycopg2.connect(DSN, cursor_factory=RealDictCursor)
|
|
|
|
def normalize_name(n):
|
|
"""Normalize club name for matching."""
|
|
if not n: return ""
|
|
n = n.lower().strip()
|
|
n = re.sub(r'\s+', ' ', n)
|
|
# remove common prefixes/suffixes
|
|
n = re.sub(r'\b(klub|udruga|sportski|nk|hk|kk|bk|ok|rk|jk|šk|sk|tk)\b', '', n)
|
|
return n.strip()
|
|
|
|
def match_civic_entities(cur, naziv):
|
|
"""Try to match by name in civic.entities."""
|
|
if not naziv or len(naziv) < 5: return None
|
|
|
|
norm = normalize_name(naziv)
|
|
if not norm: return None
|
|
|
|
# Try exact match prvo
|
|
cur.execute("""
|
|
SELECT oib, name AS naziv, county AS county_code FROM civic.entities
|
|
WHERE oib IS NOT NULL AND length(oib) = 11
|
|
AND lower(name) ILIKE %s
|
|
LIMIT 5
|
|
""", (f'%{naziv[:30].lower()}%',))
|
|
candidates = cur.fetchall()
|
|
|
|
if not candidates:
|
|
# Try fuzzy
|
|
words = [w for w in norm.split() if len(w) > 3][:3]
|
|
if words:
|
|
for word in words:
|
|
cur.execute("""
|
|
SELECT oib, name AS naziv FROM civic.entities
|
|
WHERE oib IS NOT NULL AND length(oib) = 11
|
|
AND lower(name) ILIKE %s
|
|
LIMIT 3
|
|
""", (f'%{word}%',))
|
|
c = cur.fetchall()
|
|
if c:
|
|
candidates = c
|
|
break
|
|
|
|
if not candidates: return None
|
|
|
|
# Score candidates by name similarity
|
|
best = None
|
|
best_score = 0
|
|
naziv_norm = normalize_name(naziv)
|
|
for c in candidates:
|
|
cand_norm = normalize_name(c['naziv'])
|
|
if not cand_norm: continue
|
|
# Simple word overlap
|
|
w1 = set(naziv_norm.split())
|
|
w2 = set(cand_norm.split())
|
|
if not w1 or not w2: continue
|
|
overlap = len(w1 & w2) / max(len(w1), len(w2))
|
|
if overlap > best_score and overlap >= 0.6:
|
|
best_score = overlap
|
|
best = c
|
|
|
|
return best
|
|
|
|
def enrich():
|
|
conn = db()
|
|
conn.autocommit = True
|
|
cur = conn.cursor()
|
|
|
|
# Get klubovi without OIB
|
|
cur.execute("""
|
|
SELECT id, naziv, savez_id, sport, grad
|
|
FROM pgz_sport.klubovi
|
|
WHERE (oib IS NULL OR oib = '' OR length(oib) != 11)
|
|
ORDER BY id
|
|
""")
|
|
klubovi = cur.fetchall()
|
|
log.info(f"Klubovi za enrichment: {len(klubovi)}")
|
|
|
|
enriched = 0
|
|
no_match = 0
|
|
|
|
for k in klubovi:
|
|
match = match_civic_entities(cur, k['naziv'])
|
|
if match:
|
|
cur.execute("""
|
|
UPDATE pgz_sport.klubovi
|
|
SET oib = %s, updated_at = now()
|
|
WHERE id = %s AND (oib IS NULL OR oib = '')
|
|
""", (match['oib'], k['id']))
|
|
log.info(f"✓ {k['naziv'][:40]} → OIB {match['oib']} (matched: {match['naziv'][:40]})")
|
|
enriched += 1
|
|
else:
|
|
no_match += 1
|
|
|
|
if (enriched + no_match) % 50 == 0:
|
|
log.info(f"Progress: {enriched} enriched / {no_match} no_match / {enriched+no_match}/{len(klubovi)}")
|
|
|
|
log.info(f"FINAL: {enriched} enriched, {no_match} no_match")
|
|
cur.close()
|
|
conn.close()
|
|
return enriched
|
|
|
|
if __name__ == "__main__":
|
|
enrich()
|