Files
pgz-sport/scrapers/klub_oib_enricher.py
T

129 lines
4.5 KiB
Python

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# Fajl: klub_oib_enricher.py | v1.0.0 | 04.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scrapers/klub_oib_enricher.py
# Svrha: Enrichment OIB-a za pgz_sport.klubovi (678 BEZ OIB)
# Strategy: 1) match s civic.entities by naziv → kopiraj OIB
# 2) DDG search za kluba s "OIB" i extract
# 3) Sudreg API lookup po naziv (cache)
# ═══════════════════════════════════════════════════════════════════
"""OIB enrichment za PGŽ Sport klubove."""
import os, sys, time, re, hashlib, logging, json
import psycopg2
from psycopg2.extras import RealDictCursor
logging.basicConfig(level=logging.INFO, format='%(asctime)s [klub_oib] %(message)s')
log = logging.getLogger("klub_oib")
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def db():
return psycopg2.connect(DSN, cursor_factory=RealDictCursor)
def normalize_name(n):
"""Normalize club name for matching."""
if not n: return ""
n = n.lower().strip()
n = re.sub(r'\s+', ' ', n)
# remove common prefixes/suffixes
n = re.sub(r'\b(klub|udruga|sportski|nk|hk|kk|bk|ok|rk|jk|šk|sk|tk)\b', '', n)
return n.strip()
def match_civic_entities(cur, naziv):
"""Try to match by name in civic.entities."""
if not naziv or len(naziv) < 5: return None
norm = normalize_name(naziv)
if not norm: return None
# Try exact match prvo
cur.execute("""
SELECT oib, name AS naziv, county AS county_code FROM civic.entities
WHERE oib IS NOT NULL AND length(oib) = 11
AND lower(name) ILIKE %s
LIMIT 5
""", (f'%{naziv[:30].lower()}%',))
candidates = cur.fetchall()
if not candidates:
# Try fuzzy
words = [w for w in norm.split() if len(w) > 3][:3]
if words:
for word in words:
cur.execute("""
SELECT oib, name AS naziv FROM civic.entities
WHERE oib IS NOT NULL AND length(oib) = 11
AND lower(name) ILIKE %s
LIMIT 3
""", (f'%{word}%',))
c = cur.fetchall()
if c:
candidates = c
break
if not candidates: return None
# Score candidates by name similarity
best = None
best_score = 0
naziv_norm = normalize_name(naziv)
for c in candidates:
cand_norm = normalize_name(c['naziv'])
if not cand_norm: continue
# Simple word overlap
w1 = set(naziv_norm.split())
w2 = set(cand_norm.split())
if not w1 or not w2: continue
overlap = len(w1 & w2) / max(len(w1), len(w2))
if overlap > best_score and overlap >= 0.6:
best_score = overlap
best = c
return best
def enrich():
conn = db()
conn.autocommit = True
cur = conn.cursor()
# Get klubovi without OIB
cur.execute("""
SELECT id, naziv, savez_id, sport, grad
FROM pgz_sport.klubovi
WHERE (oib IS NULL OR oib = '' OR length(oib) != 11)
ORDER BY id
""")
klubovi = cur.fetchall()
log.info(f"Klubovi za enrichment: {len(klubovi)}")
enriched = 0
no_match = 0
for k in klubovi:
match = match_civic_entities(cur, k['naziv'])
if match:
cur.execute("""
UPDATE pgz_sport.klubovi
SET oib = %s, updated_at = now()
WHERE id = %s AND (oib IS NULL OR oib = '')
""", (match['oib'], k['id']))
log.info(f"{k['naziv'][:40]} → OIB {match['oib']} (matched: {match['naziv'][:40]})")
enriched += 1
else:
no_match += 1
if (enriched + no_match) % 50 == 0:
log.info(f"Progress: {enriched} enriched / {no_match} no_match / {enriched+no_match}/{len(klubovi)}")
log.info(f"FINAL: {enriched} enriched, {no_match} no_match")
cur.close()
conn.close()
return enriched
if __name__ == "__main__":
enrich()