#!/usr/bin/env python3
"""D.3 — Wiki + Wikidata logo enrichment for top klubovi.

Selects up to 50 active clubs from ``pgz_sport.klubovi`` that have no
``logo_url`` but at least one row in ``klub_sezona`` or ``clan_nagrada``,
searches the Croatian and then the English Wikipedia for each of them, and
writes the page image, intro extract and page URL back to the club row.

NOTE: the code in this file only calls the Wikipedia API; no Wikidata
request is made here.
"""
import json  # not referenced in this file; kept so nothing relying on it breaks
import os
import re
import time
import urllib.parse
from typing import Optional

import psycopg2
import requests

# SECURITY: database credentials are hard-coded in source. The password can
# be overridden through the RINET_DB_PASSWORD environment variable; the
# literal fallback is kept only so existing runs behave as before and should
# be rotated and removed from the repository.
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
UA = "RiNET-Civic/1.0 (https://rinet.one)"
TIMEOUT = 20  # seconds, passed to every HTTP request
DELAY = 0.5   # seconds slept before each page-details request

# One shared session so every request carries the same headers.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})

# An extract must contain at least one of these substrings to be accepted.
# Matching is by plain substring, so short entries such as "tim" also match
# inside unrelated words; the filter is intentionally loose.
_SPORT_KEYWORDS = (
    "klub", "sport", "liga", "prvenstv", "football", "basketball",
    "handball", "water polo", "volleyball", "cycling", "sailing",
    "tim", "club",
)

# Club-type abbreviations stripped from the front of a name before retrying.
_PREFIX_RE = re.compile(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+')

# Failures treated as "no result" for a single lookup: transport/HTTP errors,
# undecodable JSON, and unexpectedly shaped payloads. KeyboardInterrupt and
# SystemExit are deliberately NOT included so the script can be stopped.
_FETCH_ERRORS = (
    requests.RequestException,
    ValueError,
    KeyError,
    AttributeError,
    TypeError,
)


def _page_to_result(page: dict, title: str, lang: str) -> Optional[dict]:
    """Build the result dict for one API page entry, or None if rejected.

    A page is rejected when it has no intro extract or when the extract
    contains none of the sport keywords.
    """
    extract = page.get("extract", "")
    if not extract:
        return None
    lowered = extract.lower()
    if not any(k in lowered for k in _SPORT_KEYWORDS):
        return None
    # Prefer the 500px thumbnail; fall back to the original image.
    logo = (page.get("thumbnail", {}).get("source")
            or page.get("original", {}).get("source"))
    page_url = (
        page.get("fullurl")
        or f"https://{lang}.wikipedia.org/wiki/"
           f"{urllib.parse.quote(title.replace(' ', '_'))}"
    )
    return {
        "title": title,
        "logo": logo,
        "biografija": extract[:1500],
        "source_url": page_url,
        "lang": lang,
    }


def query_wiki(name, lang="hr"):
    """Search + page details w/ pageimages.

    Runs a full-text search for *name* on ``{lang}.wikipedia.org`` (top 3
    hits), keeps only titles that contain the last word of *name*
    (case-insensitive), and returns data for the first such page whose intro
    extract looks sport related.

    Args:
        name: Search string, typically a club name or a variant of it.
        lang: Wikipedia language subdomain.

    Returns:
        A dict with keys ``title``, ``logo`` (may be None), ``biografija``
        (extract cut to 1500 chars), ``source_url`` and ``lang``; or None if
        nothing suitable was found or a request failed.
    """
    # A blank name has no "last word" to match against; skip the round-trip.
    if not name or not name.strip():
        return None

    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    try:
        r = s.get(
            api_url,
            params={"action": "query", "format": "json", "list": "search",
                    "srsearch": name, "srlimit": 3, "utf8": 1},
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        sr = r.json().get("query", {}).get("search", [])
        candidates = [x["title"] for x in sr]
    except _FETCH_ERRORS:
        return None
    if not candidates:
        return None

    # Pick first candidate that contains key word from name
    key = name.split()[-1].lower()
    for title in candidates:
        if key not in title.lower():
            continue
        time.sleep(DELAY)
        try:
            r = s.get(
                api_url,
                params={"action": "query", "format": "json",
                        "prop": "extracts|pageimages|info",
                        "exintro": 1, "explaintext": 1,
                        "piprop": "original|thumbnail", "pithumbsize": 500,
                        "inprop": "url", "titles": title, "utf8": 1},
                timeout=TIMEOUT,
            )
            r.raise_for_status()
            pages = r.json().get("query", {}).get("pages", {})
            for pid, p in pages.items():
                # The API reports a missing title under page id "-1".
                if pid == "-1":
                    continue
                result = _page_to_result(p, title, lang)
                if result:
                    return result
        except _FETCH_ERRORS:
            # Best effort: a failed details request just moves on to the
            # next candidate title.
            continue
    return None


def enrich_klub(naziv):
    """Look a club up on Wikipedia, trying several name variants.

    Variants, in order: the name as given; the name without a leading
    club-type abbreviation (NK, RK, KK, ...); and, if the original name does
    not mention "Rijeka", that stripped name followed by " Rijeka". Each
    variant is tried on hr and then en Wikipedia.

    Args:
        naziv: Club name.

    Returns:
        The first non-empty ``query_wiki`` result, or None.
    """
    # A NULL/empty name from the database cannot be searched.
    if not naziv:
        return None

    variants = [naziv]
    # Strip common prefixes
    base = _PREFIX_RE.sub('', naziv).strip()
    if base != naziv:
        variants.append(base)
        if "Rijeka" not in naziv:
            variants.append(f"{base} Rijeka")

    for v in variants:
        for lang in ("hr", "en"):
            r = query_wiki(v, lang)
            if r:
                return r
    return None


def main():
    """Fetch clubs that lack a logo, enrich each one and update its row."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True  # every UPDATE is committed on its own
    try:
        cr = conn.cursor()

        # Top klubovi: most trofeji + svjetski medalisti, missing logo
        # NOTE(review): id 4426 is excluded by the query; the reason is not
        # visible in this file.
        cr.execute("""
            WITH top_klubovi AS (
                SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
                       (SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
                       (SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
                FROM pgz_sport.klubovi k
                WHERE k.id != 4426 AND k.aktivan=true
            )
            SELECT id, naziv FROM top_klubovi
            WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
            ORDER BY trofeja DESC, nagrada DESC
            LIMIT 50
        """)
        todo = cr.fetchall()
        print(f"Klubovi to enrich (logo): {len(todo)}")

        success = 0
        for kid, naziv in todo:
            print(f" → {naziv}", end="", flush=True)
            r = enrich_klub(naziv)
            if not r:
                print(" MISS")
                continue

            # Column fragments are fixed literals; only values are
            # parameterised, so building the SET list with an f-string is
            # safe here.
            sets, vals = [], []
            if r.get("logo"):
                sets.append("logo_url = %s")
                vals.append(r["logo"])
            # COALESCE keeps an existing napomena / web value untouched.
            sets.append("napomena = COALESCE(napomena, %s)")
            vals.append(r["biografija"][:1000])
            sets.append("web = COALESCE(web, %s)")
            vals.append(r["source_url"])
            sets.append("source_synced_at = now()")
            vals.append(kid)

            try:
                cr.execute(
                    f"UPDATE pgz_sport.klubovi SET {', '.join(sets)} WHERE id = %s",
                    tuple(vals),
                )
            except psycopg2.Error as e:
                print(f" DBerr: {e}")
            else:
                success += 1
                flags = " +LOGO" if r.get("logo") else ""
                print(f" ✓ [{r['lang']}] {r['title']}{flags}")

        print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
    finally:
        # Release the connection even if the loop above raises.
        conn.close()


# === MAIN ===
if __name__ == "__main__":
    main()