Files
pgz-sport/scrapers/wiki_klub_logos.py

115 lines
4.5 KiB
Python
Executable File

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""D.3 — Wikipedia logo and intro-text enrichment for top klubovi (clubs)."""
import os
import psycopg2, requests, re, json, time, urllib.parse
# PostgreSQL connection parameters, unpacked into psycopg2.connect(**DB).
# The password is read from the DB_PASSWORD environment variable; a missing
# variable raises KeyError at import time.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# Identifying User-Agent sent with every HTTP request made through `s`.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Per-request timeout handed to requests (seconds).
TIMEOUT = 20
# Pause handed to time.sleep() before each page-detail request (seconds).
DELAY = 0.5

# One shared HTTP session so the default headers apply to all requests.
s = requests.Session()
s.headers["User-Agent"] = UA
s.headers["Accept-Language"] = "hr,en"
def query_wiki(name, lang="hr"):
    """Search Wikipedia for *name* and return details of the first sport-related hit.

    Runs a full-text search (up to 3 results) on the ``lang`` Wikipedia, then
    fetches intro extract, page image and URL for each result whose title
    contains the last word of *name* (case-insensitive).

    Args:
        name: Search phrase, e.g. a club name.
        lang: Wikipedia language subdomain (default ``"hr"``).

    Returns:
        A dict with keys ``title``, ``logo`` (thumbnail URL, falling back to
        the original image URL, or ``None``), ``biografija`` (intro text cut
        to 1500 characters), ``source_url`` and ``lang``; or ``None`` when
        nothing suitable is found or the lookup fails.
    """
    # A blank name has no trailing word to match titles against; bail out
    # before any network round-trip instead of failing on split()[-1].
    words = name.split() if name else []
    if not words:
        return None
    key = words[-1].lower()

    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    # Failures a best-effort lookup should absorb: transport/HTTP errors,
    # non-JSON bodies and unexpectedly shaped payloads. Unlike a bare
    # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
    recoverable = (requests.RequestException, ValueError, KeyError,
                   AttributeError, TypeError)
    # Substrings that mark an intro extract as sport-related.
    sport_markers = ("klub", "sport", "liga", "prvenstv", "football",
                     "basketball", "handball", "water polo", "volleyball",
                     "cycling", "sailing", "tim", "club")

    try:
        resp = s.get(
            api_url,
            params={"action": "query", "format": "json", "list": "search",
                    "srsearch": name, "srlimit": 3, "utf8": 1},
            timeout=TIMEOUT,
        )
        # Treat non-2xx answers as a failed lookup rather than parsing them.
        resp.raise_for_status()
        hits = resp.json().get("query", {}).get("search", [])
        if not hits:
            return None
        candidates = [hit["title"] for hit in hits]
    except recoverable:
        return None

    # Pick the first candidate whose title contains the key word from name.
    for title in candidates:
        if key not in title.lower():
            continue
        # Pause between API calls.
        time.sleep(DELAY)
        try:
            resp = s.get(
                api_url,
                params={"action": "query", "format": "json",
                        "prop": "extracts|pageimages|info",
                        "exintro": 1, "explaintext": 1,
                        "piprop": "original|thumbnail", "pithumbsize": 500,
                        "inprop": "url", "titles": title, "utf8": 1},
                timeout=TIMEOUT,
            )
            resp.raise_for_status()
            pages = resp.json().get("query", {}).get("pages", {})
            for pid, page in pages.items():
                # Page id "-1" entries are skipped.
                if pid == "-1":
                    continue
                extract = page.get("extract", "")
                if not extract:
                    continue
                # Sport context check on the intro text.
                lowered = extract.lower()
                if not any(marker in lowered for marker in sport_markers):
                    continue
                logo = (page.get("thumbnail", {}).get("source")
                        or page.get("original", {}).get("source"))
                page_url = (
                    page.get("fullurl")
                    or f"https://{lang}.wikipedia.org/wiki/{urllib.parse.quote(title.replace(' ', '_'))}"
                )
                return {"title": title, "logo": logo,
                        "biografija": extract[:1500],
                        "source_url": page_url, "lang": lang}
        except recoverable:
            # One failing candidate must not abort the remaining ones.
            continue
    return None
def enrich_klub(naziv):
    """Return the first Wikipedia match for a club name, trying several spellings.

    Search terms, in order: the name as given; the name with a leading club-type
    abbreviation (HNK, NK, RK, ...) removed, if that changes it; and that
    shortened name followed by "Rijeka" when the original name does not
    already contain "Rijeka". Each term is tried on the "hr" and then the
    "en" Wikipedia.

    Returns whatever query_wiki returns for the first successful lookup,
    or None when every attempt misses.
    """
    # Drop a leading club-type abbreviation and surrounding whitespace.
    stripped = re.sub(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+', '', naziv).strip()

    search_terms = [naziv]
    if stripped != naziv:
        search_terms.append(stripped)
        if "Rijeka" not in naziv:
            search_terms.append(f"{stripped} Rijeka")

    for term in search_terms:
        for lang in ("hr", "en"):
            hit = query_wiki(term, lang)
            if hit:
                return hit
    return None
# === MAIN ===
# Select up to 50 active clubs that have no logo yet, look each one up on
# Wikipedia and write back logo, description, web link and sync timestamp.
conn = psycopg2.connect(**DB)
# Autocommit: every UPDATE is persisted on its own, so one failing row does
# not affect the others.
conn.autocommit = True
try:
    cr = conn.cursor()
    # Top klubovi: most trofeji + svjetski medalisti, missing logo.
    # NOTE(review): club id 4426 is excluded explicitly; the reason is not
    # visible in this file.
    cr.execute("""
WITH top_klubovi AS (
SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
(SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
(SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
FROM pgz_sport.klubovi k
WHERE k.id != 4426 AND k.aktivan=true
)
SELECT id, naziv FROM top_klubovi
WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
ORDER BY trofeja DESC, nagrada DESC LIMIT 50
""")
    todo = cr.fetchall()
    print(f"Klubovi to enrich (logo): {len(todo)}")
    success = 0
    for kid, naziv in todo:
        # Print the name first; the outcome is appended to the same line.
        print(f"{naziv}", end="", flush=True)
        r = enrich_klub(naziv)
        if not r:
            print(" MISS")
            continue
        # Build the SET clause from fixed fragments; all values are bound
        # as query parameters, never interpolated into the SQL text.
        sets, vals = [], []
        if r.get("logo"):
            sets.append("logo_url = %s")
            vals.append(r["logo"])
        # COALESCE keeps an existing napomena / web value and only fills gaps.
        sets.append("napomena = COALESCE(napomena, %s)")
        vals.append(r["biografija"][:1000])
        sets.append("web = COALESCE(web, %s)")
        vals.append(r["source_url"])
        sets.append("source_synced_at = now()")
        vals.append(kid)
        try:
            cr.execute(f"UPDATE pgz_sport.klubovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
            success += 1
            flags = " +LOGO" if r.get("logo") else ""
            print(f" ✓ [{r['lang']}] {r['title']}{flags}")
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest.
            print(f" DBerr: {e}")
    print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
finally:
    # Release the connection even if the query, a lookup or Ctrl-C aborts the run.
    conn.close()