111 lines
4.4 KiB
Python
Executable File
111 lines
4.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""D.3 — Wikipedia logo + summary enrichment for the top klubovi (clubs)."""
|
|
import json
import os
import re
import time
import urllib.parse

import psycopg2
import requests
|
|
|
|
# PostgreSQL connection parameters, passed as keyword arguments to
# psycopg2.connect(). The password can be supplied through the
# RINET_DB_PASSWORD environment variable.
# NOTE(review): the fallback literal below is a credential committed to
# source control — rotate it and drop the default once every deployment
# sets RINET_DB_PASSWORD.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent sent with every Wikipedia API request.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Per-request HTTP timeout, in seconds.
TIMEOUT = 20

# Pause, in seconds, inserted before each page-detail request.
DELAY = 0.5

# One shared HTTP session so connections and default headers are reused.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
|
|
|
|
def query_wiki(name, lang="hr"):
    """Look up a club on Wikipedia and return its image and intro text.

    Runs a full-text search on the ``lang`` Wikipedia, then fetches page
    details (plain-text intro, page image, canonical URL) for every search
    hit whose title contains the last word of ``name``. The first page whose
    intro mentions a sport-related keyword is returned.

    Args:
        name: Club name to search for. A blank or ``None`` name yields
            ``None`` without any network traffic.
        lang: Wikipedia language subdomain, e.g. ``"hr"`` or ``"en"``.

    Returns:
        A dict with the keys ``title``, ``logo`` (image URL or ``None``),
        ``biografija`` (intro text, at most 1500 characters),
        ``source_url`` and ``lang``; or ``None`` when no suitable page was
        found or the lookup failed.
    """
    words = name.split() if name else []
    if not words:
        return None
    # The last word of the name is the token a candidate title must contain.
    key = words[-1].lower()

    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    # Failures that mean "this lookup did not work": transport/HTTP errors,
    # undecodable JSON (a ValueError), or a payload with an unexpected shape.
    # KeyboardInterrupt and SystemExit are deliberately NOT swallowed, so a
    # long batch run can still be interrupted.
    lookup_errors = (requests.RequestException, ValueError, KeyError,
                     AttributeError, TypeError)
    # NOTE(review): these are substring tests, so a short token such as
    # "tim" also matches inside unrelated words (e.g. "time").
    sport_keywords = ("klub", "sport", "liga", "prvenstv", "football",
                      "basketball", "handball", "water polo", "volleyball",
                      "cycling", "sailing", "tim", "club")

    try:
        r = s.get(api_url,
                  params={"action": "query", "format": "json",
                          "list": "search", "srsearch": name,
                          "srlimit": 3, "utf8": 1},
                  timeout=TIMEOUT)
        r.raise_for_status()
        sr = r.json().get("query", {}).get("search", [])
        if not sr:
            return None
        candidates = [x["title"] for x in sr]
    except lookup_errors:
        return None

    for title in candidates:
        if key not in title.lower():
            continue
        time.sleep(DELAY)  # pause before every detail request
        try:
            r = s.get(api_url,
                      params={"action": "query", "format": "json",
                              "prop": "extracts|pageimages|info",
                              "exintro": 1, "explaintext": 1,
                              "piprop": "original|thumbnail",
                              "pithumbsize": 500,
                              "inprop": "url", "titles": title, "utf8": 1},
                      timeout=TIMEOUT)
            r.raise_for_status()
            pages = r.json().get("query", {}).get("pages", {})
            for pid, p in pages.items():
                if pid == "-1":  # id the API uses for a missing page
                    continue
                extract = p.get("extract", "")
                if not extract:
                    continue
                # Sport context check: skip pages whose intro never
                # mentions a sport-related term.
                el = extract.lower()
                if not any(k in el for k in sport_keywords):
                    continue
                # The 500px thumbnail is preferred; the original image is
                # only a fallback.
                logo = (p.get("thumbnail", {}).get("source") or
                        p.get("original", {}).get("source"))
                page_url = (
                    p.get("fullurl")
                    or f"https://{lang}.wikipedia.org/wiki/"
                       f"{urllib.parse.quote(title.replace(' ', '_'))}"
                )
                return {"title": title, "logo": logo,
                        "biografija": extract[:1500],
                        "source_url": page_url, "lang": lang}
        except lookup_errors:
            # Best effort: a failing candidate must not abort the others.
            continue
    return None
|
|
|
|
def enrich_klub(naziv):
    """Find Wikipedia data for a club, trying several spellings of its name.

    The spellings are tried in this order: the name as given; the name with
    a leading club-type abbreviation (HNK, NK, RK, ...) removed; and that
    shortened name followed by "Rijeka" when the original name does not
    already contain "Rijeka". Each spelling is looked up on the Croatian
    Wikipedia first and the English one second.

    Args:
        naziv: Club name as stored in the database.

    Returns:
        The first truthy result from ``query_wiki``, or ``None`` when every
        spelling/language combination misses.
    """
    stripped = re.sub(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+', '', naziv).strip()

    spellings = [naziv]
    if stripped != naziv:
        # The shortened forms only make sense when something was removed.
        spellings.append(stripped)
        if "Rijeka" not in naziv:
            spellings.append(f"{stripped} Rijeka")

    for spelling in spellings:
        for lang in ("hr", "en"):
            hit = query_wiki(spelling, lang)
            if hit:
                return hit
    return None
|
|
|
|
# === MAIN ===
# Select up to 50 active clubs that have no logo yet, look each one up on
# Wikipedia and write the results back to pgz_sport.klubovi.
conn = psycopg2.connect(**DB)
# Autocommit: every UPDATE is committed on its own, so one failing row
# neither rolls back earlier rows nor blocks the following ones.
conn.autocommit = True
try:
    cr = conn.cursor()

    # Top klubovi: clubs with at least one klub_sezona or clan_nagrada row,
    # missing a logo, ordered by those counts.
    # NOTE(review): club id 4426 is excluded by a hard-coded literal; the
    # reason is not visible in this file.
    cr.execute("""
        WITH top_klubovi AS (
            SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
                   (SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
                   (SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
            FROM pgz_sport.klubovi k
            WHERE k.id != 4426 AND k.aktivan=true
        )
        SELECT id, naziv FROM top_klubovi
        WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
        ORDER BY trofeja DESC, nagrada DESC LIMIT 50
    """)
    todo = cr.fetchall()
    print(f"Klubovi to enrich (logo): {len(todo)}")

    success = 0
    for kid, naziv in todo:
        print(f"  → {naziv}", end="", flush=True)
        r = enrich_klub(naziv)
        if not r:
            print(" MISS")
            continue

        # Build the SET clause from fixed fragments; every value travels as
        # a bound parameter.
        sets, vals = [], []
        if r.get("logo"):
            sets.append("logo_url = %s")
            vals.append(r["logo"])
        # COALESCE keeps an existing napomena/web value and only fills NULLs.
        sets.append("napomena = COALESCE(napomena, %s)")
        vals.append(r["biografija"][:1000])
        sets.append("web = COALESCE(web, %s)")
        vals.append(r["source_url"])
        sets.append("source_synced_at = now()")
        vals.append(kid)

        try:
            cr.execute(f"UPDATE pgz_sport.klubovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
            success += 1
            flags = " +LOGO" if r.get("logo") else ""
            print(f" ✓ [{r['lang']}] {r['title']}{flags}")
        except Exception as e:
            # Report the failing row and carry on with the remaining clubs.
            print(f" DBerr: {e}")

    print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
finally:
    # Release the connection even when the run is interrupted or a lookup
    # raises unexpectedly.
    conn.close()
|