#!/usr/bin/env python3
"""Wikipedia enrichment for top athletes.

For each target athlete the script looks up a page summary on hr.wikipedia
and then en.wikipedia, stores the summary extract in
``pgz_sport.clanovi.biografija`` and inserts a matching row into
``dabi.knowledge``.

Only the summary text is stored; medals and date/place of birth are not
parsed or persisted by this script.
"""
import json
import os
import re
import time
import urllib.parse
import urllib.request

import psycopg2

DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet",
          password=os.environ["DB_PASSWORD"])
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
DELAY = 0.5

# Substrings (Croatian and English) that mark a summary as sport-related when
# the page was found by its exact title.
_DIRECT_KEYWORDS = (
    "sport", "igra", "klub", "natjecat", "atlet", "vater", "ronil", "kuglač",
    "boćar", "skij", "karat", "kickbox", "wushu", "ju-jitsu", "sahist",
    "šahis", "biciklist", "plivač", "plivat", "boxer", "olimpij", "bonifac",
    "athlete", "compete", "sportaš", "swimmer", "diver", "sailor", "wrestler",
)

# Substrings used for pages found through search. This list differs slightly
# from the direct-title one; both are kept as they were so matching does not
# change.
_SEARCH_KEYWORDS = (
    "sport", "igrač", "klub", "natjecat", "atlet", "vater", "ronil", "kuglač",
    "boćar", "skij", "karat", "kickbox", "wushu", "atletič", "ju-jitsu",
    "šahist", "biciklist", "plivač", "olimpij", "athlete", "compete",
    "sportaš", "swimmer", "diver", "sailor", "wrestler",
)

# Targets: members with kategorija_hoo 1-3, plus members that have an award
# at competition level 'SP', 'EP', 'OI' or 'SK'.
_TARGETS_SQL = """
    SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo,
           c.biografija, k.naziv
    FROM pgz_sport.clanovi c
    LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
    WHERE c.kategorija_hoo IN (1, 2, 3)
       OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                   WHERE clan_id IS NOT NULL
                     AND razina_natjecanja IN ('SP','EP','OI','SK'))
    ORDER BY c.kategorija_hoo NULLS LAST, c.prezime, c.ime
"""

# Existing source / source_url values win over the Wikipedia ones (COALESCE).
_UPDATE_BIO_SQL = """
    UPDATE pgz_sport.clanovi
    SET biografija = %s,
        source = COALESCE(source, %s),
        source_url = COALESCE(source_url, %s),
        source_synced_at = now()
    WHERE id = %s
"""

_INSERT_FACT_SQL = """
    INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
    VALUES (%s, %s, %s, %s, now())
    ON CONFLICT DO NOTHING
"""

# Executed without parameters, so the literal '%' in LIKE is passed through.
_TOP_BIOS_SQL = """
    SELECT ime, prezime, sport, LENGTH(biografija) AS bio_len
    FROM pgz_sport.clanovi
    WHERE biografija IS NOT NULL AND LENGTH(biografija) > 100
      AND source LIKE 'wikipedia%'
    ORDER BY bio_len DESC
    LIMIT 15
"""


def http_get(url, timeout=15):
    """Fetch *url* and return the body decoded as UTF-8, or None on failure.

    Undecodable bytes are replaced rather than raising. Every error (HTTP
    status, network, timeout, malformed URL) yields None.
    """
    try:
        req = urllib.request.Request(
            url, headers={"User-Agent": UA, "Accept-Language": "hr,en"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: a failed lookup must not abort the run.
        return None


def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST page summary for *title* as a dict, or None.

    None is returned when the request fails, the body is not valid JSON, or
    the decoded JSON is not an object.
    """
    # safe="" also percent-encodes "/", which would otherwise be sent as an
    # extra path segment instead of part of the title.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    return data if isinstance(data, dict) else None


def wiki_search(query, lang="hr"):
    """Return up to three ``(title, url)`` opensearch hits for *query*.

    An empty list is returned when the request fails or the response does not
    have the expected list layout.
    """
    enc = urllib.parse.quote(query)
    url = (f"https://{lang}.wikipedia.org/w/api.php"
           f"?action=opensearch&search={enc}&limit=3&format=json")
    raw = http_get(url)
    if not raw:
        return []
    try:
        data = json.loads(raw)
        return list(zip(data[1], data[3]))  # (title, url)
    except (ValueError, LookupError, TypeError):
        return []


def _summary_text(summary):
    """Return description + extract, cut to 2000 chars and lower-cased."""
    # "or" guards against keys that are present with a JSON null value.
    description = summary.get("description") or ""
    extract = summary.get("extract") or ""
    return f"{description} {extract}"[:2000].lower()


def _page_url(summary):
    """Return the desktop page URL from a summary dict, or None if absent."""
    content_urls = summary.get("content_urls") or {}
    desktop = content_urls.get("desktop") or {}
    return desktop.get("page")


def _find_summary(full, sport):
    """Find a sport-related summary for the person named *full*.

    Tries hr.wikipedia and then en.wikipedia. In each language the exact
    title is tried first and opensearch results second. Returns
    ``(summary, lang, page_url)`` for the first accepted page, else None.
    """
    sport_prefix = sport.lower()[:5] if sport else ""
    for lang in ("hr", "en"):
        # Exact title: accept a standard (non-disambiguation) page whose text
        # contains any sport keyword.
        direct = wiki_summary(full, lang)
        if (direct and direct.get("type") == "standard"
                and not direct.get("disambiguation")):
            text = _summary_text(direct)
            if any(kw in text for kw in _DIRECT_KEYWORDS):
                return direct, lang, _page_url(direct)
        time.sleep(DELAY)

        # Search fallback: stricter, because hits may be namesakes. The text
        # must contain a sport keyword AND the first five characters of the
        # athlete's own sport; athletes without a sport never match here.
        for title, _url in wiki_search(full, lang):
            lowered = title.lower()
            if "razdvojba" in lowered or "disambiguation" in lowered:
                continue  # disambiguation page
            candidate = wiki_summary(title, lang)
            if candidate:
                text = _summary_text(candidate)
                if (sport_prefix and sport_prefix in text
                        and any(kw in text for kw in _SEARCH_KEYWORDS)):
                    return candidate, lang, _page_url(candidate)
            # Throttle after every candidate request, successful or not.
            time.sleep(DELAY)
    return None


def main():
    """Enrich target athletes with Wikipedia summaries and print a report."""
    conn = psycopg2.connect(**DB)
    try:
        # Each statement commits on its own, so one failed statement does not
        # affect the following ones.
        conn.autocommit = True
        cr = conn.cursor()

        cr.execute(_TARGETS_SQL)
        targets = cr.fetchall()
        print(f"Targets: {len(targets)}")

        enriched = 0
        fact_count = 0
        for tid, ime, prez, sport, _kat, bio, klub in targets:
            if bio and len(bio) > 200:
                continue  # already enriched
            full = f"{ime} {prez}"

            found = _find_summary(full, sport)
            if found is None:
                continue
            summary, wiki_lang, wiki_url = found

            extract = (summary.get("extract") or "").strip()[:1500]
            if len(extract) < 80:
                continue  # missing or too short to be a useful biography

            source = f"wikipedia_{wiki_lang}"
            try:
                cr.execute(_UPDATE_BIO_SQL, (extract, source, wiki_url, tid))
                enriched += 1
            except Exception as e:
                print(f" ERR update {full}: {e}")
                continue

            fact = (f"{full} ({sport}, {klub or 'PGŽ'}) — {extract[:600]} "
                    f"(Wikipedia {wiki_lang.upper()})")
            try:
                cr.execute(_INSERT_FACT_SQL,
                           (fact[:2000], source, 0.9, "biografija_sportasa"))
                # rowcount is 0 when ON CONFLICT skipped the insert.
                if cr.rowcount:
                    fact_count += 1
            except Exception as e:
                # The biography is already saved; report and carry on.
                print(f" ERR fact {full}: {e}")

            print(f" ✓ {full} ({wiki_lang}) {len(extract)} chars")
            time.sleep(DELAY)

        print(f"\n=== DONE: {enriched} enriched, {fact_count} new facts ===")

        # Report the longest Wikipedia-sourced biographies.
        cr.execute(_TOP_BIOS_SQL)
        print("\nTop bios:")
        for r in cr.fetchall():
            print(f" {r[0]} {r[1]} ({r[2]}) - {r[3]} chars")
    finally:
        conn.close()


if __name__ == "__main__":
    main()