#!/usr/bin/env python3
"""
D curated: hand-curated wiki titles + extracts for top PGŽ athletes.

Faster + more reliable than search-based approaches.
For each known athlete, hardcode wiki title (HR/EN) and pull summary directly.

For every entry in ``CURATED`` the script looks the athlete up in
``pgz_sport.clanovi`` by first/last name, fetches the Wikipedia REST summary
(Croatian first, English as fallback), stores the extract as the athlete's
``biografija`` and records a matching fact in ``dabi.knowledge``.

The database password is read from the ``DB_PASSWORD`` environment variable
at import time; a missing variable raises ``KeyError``.
"""
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2

DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ["DB_PASSWORD"],
)
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"


def out(msg):
    """Print *msg* to stdout and flush immediately."""
    print(msg, flush=True)


def http_get(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The body is decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        The decoded body, or ``None`` if building the request, connecting,
        or reading the response raised any exception (this includes HTTP
        error statuses, which ``urlopen`` raises as exceptions).
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: callers treat None as "page unavailable"
        # and move on to the next title/athlete.
        return None


def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain (``"hr"``, ``"en"``, ...).

    Returns:
        The parsed JSON object as a dict, or ``None`` when the request
        failed, the body was not valid JSON, or the JSON was not an object.
    """
    # safe="" so that a "/" inside a title is percent-encoded instead of
    # being read as an extra path segment by the REST endpoint.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # main() calls .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None


# Curated list: (full_name, hr_wiki_title, en_wiki_title) - athletes with
# known wiki entries. A title of None means "no page in that language".
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Water polo (PGŽ)
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Bocce (boćanje) legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Athletics
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
]


def _fetch_summary(hr_title, en_title):
    """Return ``(summary, lang)`` for the first usable page, HR before EN.

    A page is usable when its ``type`` is ``"standard"`` (or absent) and it
    carries a non-blank ``extract``. Returns ``(None, None)`` when neither
    language yields one, so a rejected page (e.g. a disambiguation page) is
    never handed back to the caller.
    """
    for lang, title in (("hr", hr_title), ("en", en_title)):
        if not title:
            continue
        summary = wiki_summary(title, lang)
        if (
            summary
            and summary.get("type") in ("standard", None)
            and (summary.get("extract") or "").strip()
        ):
            return summary, lang
        # Short pause before trying the next language.
        time.sleep(0.2)
    return None, None


def main():
    """Enrich ``pgz_sport.clanovi`` with Wikipedia bios for ``CURATED``.

    Prints one progress line per athlete, then a summary of how many rows
    have a biography longer than 200 characters and the 15 longest ones.
    The connection is closed even if an unexpected error aborts the run.
    """
    conn = psycopg2.connect(**DB)
    try:
        # Autocommit: every statement is committed on its own, so a failure
        # for one athlete does not poison the statements that follow.
        conn.autocommit = True
        cr = conn.cursor()
        enriched = 0
        tried = 0
        seen = set()

        for full, hr_title, en_title in CURATED:
            # CURATED may list the same athlete under several headings;
            # process each name once.
            if full in seen:
                continue
            seen.add(full)
            tried += 1

            # First token is the given name, the remainder the surname.
            ime, _, prez = full.partition(" ")

            # Find the member (clanovi) record.
            cr.execute(
                """SELECT id, sport, klub_id
                   FROM pgz_sport.clanovi
                   WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                   LIMIT 1""",
                (ime, prez),
            )
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, klub_id = row

            # Fetch wiki - try hr first then en.
            summary, wlang = _fetch_summary(hr_title, en_title)
            if summary is None:
                out(f" ✗ {full} - no wiki page")
                continue

            extract = summary["extract"].strip()[:1500]
            content_urls = summary.get("content_urls") or {}
            # None (not "") when the URL is missing, so COALESCE leaves
            # source_url NULL and a later run can still fill it in.
            wurl = (content_urls.get("desktop") or {}).get("page") or None

            try:
                cr.execute(
                    """UPDATE pgz_sport.clanovi
                       SET biografija = %s,
                           source_url = COALESCE(source_url, %s),
                           source_synced_at = now()
                       WHERE id = %s""",
                    (extract, wurl, cid),
                )
                fact = (
                    f"{full} ({sport or '?'}) — {extract[:600]} "
                    f"(Wikipedia {wlang.upper()})"
                )[:2000]
                cr.execute(
                    """INSERT INTO dabi.knowledge
                           (fact, source, confidence, category, created_at)
                       VALUES (%s, %s, %s, %s, now())
                       ON CONFLICT DO NOTHING""",
                    (fact, f"wikipedia_{wlang}", 0.95, "biografija_sportasa"),
                )
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except Exception as e:
                # Report and carry on with the next athlete.
                out(f" ERR {full}: {e}")
            time.sleep(0.3)

        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")

        # Summary
        cr.execute(
            """SELECT count(*) FROM pgz_sport.clanovi
               WHERE LENGTH(biografija) > 200"""
        )
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")

        cr.execute(
            """SELECT ime, prezime, sport, LENGTH(biografija) AS bio
               FROM pgz_sport.clanovi
               WHERE LENGTH(biografija) > 200
               ORDER BY bio DESC
               LIMIT 15"""
        )
        out("\nTop bios:")
        for r in cr.fetchall():
            # NULL columns come back as None, which rejects a width format
            # spec; substitute printable placeholders first.
            first = r[0] or ""
            last = r[1] or ""
            sport_name = r[2] or "?"
            out(f" {first:18} {last:18} {sport_name:15} {r[3]} chars")
    finally:
        conn.close()


if __name__ == "__main__":
    main()