#!/usr/bin/env python3
"""D v3: biography enrichment from the Wikipedia search API.

Selects members from ``pgz_sport.clanovi`` whose ``biografija`` is missing or
shorter than 200 characters, searches the Croatian and then the English
Wikipedia for each full name, and stores the first page summary that mentions
the member's surname together with at least one keyword from ``SPORT_KW``.

NOTE: earlier descriptions of this script mention DuckDuckGo as a fallback;
no such fallback is implemented in this file.
"""
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request
from typing import Any, Dict, Iterator, List, Optional, Tuple

# NOTE(review): a credential is embedded in source control. It is kept as the
# default so existing deployments keep working, but it should be rotated and
# supplied only through the RINET_DB_PASSWORD environment variable.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
UA = "Mozilla/5.0 (PGZSportBot/1.0)"

# Wikipedia language editions to query, in priority order.
LANGS = ("hr", "en")
# How many search hits per language are inspected.
MAX_RESULTS_PER_LANG = 3
# Summaries shorter than this are rejected.
MIN_EXTRACT_LEN = 80
# Pause between request rounds, in seconds.
REQUEST_PAUSE_S = 0.2

# Substrings (Croatian and English); at least one must occur in a summary
# for it to be accepted.
SPORT_KW = (
    "sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil", "kuglač",
    "boćar", "skij", "karat", "kickbox", "wushu", "biciklist", "plivat",
    "plivač", "athlete", "compet", "wrestle", "swimmer", "boxer", "diver",
    "skier", "sailor", "vesla", "gimnast", "rukomet", "košark", "tenisa",
    "šahist", "ribolov", "ju-jitsu", "jujits", "automob", "rally", "racing",
    "freediv", "apnea", "free diving",
)

TARGETS_SQL = """
    SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
    FROM pgz_sport.clanovi c
    LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
    WHERE (c.kategorija_hoo IN (1, 2, 3)
           OR c.id IN (SELECT DISTINCT clan_id
                       FROM pgz_sport.clan_nagrada
                       WHERE clan_id IS NOT NULL
                         AND razina_natjecanja IN ('SP','EP','OI','SK')))
      AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
    ORDER BY c.kategorija_hoo NULLS LAST
    LIMIT 150
"""

UPDATE_BIO_SQL = """
    UPDATE pgz_sport.clanovi
    SET biografija = %s,
        source_url = COALESCE(source_url, %s),
        source_synced_at = now()
    WHERE id = %s
"""

INSERT_FACT_SQL = """
    INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
    VALUES (%s, %s, %s, %s, now())
    ON CONFLICT DO NOTHING
"""


def out(msg):
    """Print *msg* to stdout and flush immediately."""
    print(msg, flush=True)


def http_get(url, timeout=10):
    """Fetch *url* and return the body decoded as UTF-8, or ``None`` on failure.

    This is deliberately best-effort: any error (malformed URL, network
    failure, HTTP error status, timeout) yields ``None`` so one bad request
    cannot abort the run. The failure is reported on stderr instead of being
    swallowed silently.
    """
    try:
        req = urllib.request.Request(
            url, headers={"User-Agent": UA, "Accept-Language": "hr,en"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as exc:  # best-effort boundary: report, never raise
        print(f"http_get failed for {url}: {exc}", file=sys.stderr, flush=True)
        return None


def parse_opensearch(raw: Optional[str]) -> List[Tuple[str, str]]:
    """Turn an ``action=opensearch`` JSON body into ``(title, url)`` pairs.

    Titles are read from element 1 and URLs from element 3 of the top-level
    JSON array. Anything that does not have that shape (invalid JSON, an
    object, a too-short array, non-list elements) produces an empty list.
    """
    if not raw:
        return []
    try:
        payload = json.loads(raw)
    except ValueError:
        return []
    if not isinstance(payload, list) or len(payload) < 4:
        return []
    titles, urls = payload[1], payload[3]
    if not isinstance(titles, list) or not isinstance(urls, list):
        return []
    return [(t, u) for t, u in zip(titles, urls) if isinstance(t, str)]


def wiki_search(query, lang="hr"):
    """Search the *lang* Wikipedia for *query*; return up to 5 ``(title, url)`` pairs.

    Returns an empty list when the request fails or the response is malformed.
    """
    enc = urllib.parse.quote(query)
    url = (
        f"https://{lang}.wikipedia.org/w/api.php"
        f"?action=opensearch&search={enc}&limit=5&format=json"
    )
    return parse_opensearch(http_get(url))


def summary_url(title: str, lang: str = "hr") -> str:
    """Build the REST ``page/summary`` URL for *title* on the *lang* Wikipedia.

    Spaces become underscores and every reserved character, including ``/``,
    is percent-encoded so that the title stays a single path segment.
    """
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    return f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"


def wiki_summary(title, lang="hr"):
    """Return the page-summary JSON object for *title*, or ``None``.

    ``None`` is returned when the request fails, the body is not valid JSON,
    or the decoded value is not a JSON object.
    """
    raw = http_get(summary_url(title, lang))
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    return data if isinstance(data, dict) else None


def select_biography(
    summary: Optional[Dict[str, Any]], surname: str
) -> Optional[Tuple[str, str]]:
    """Decide whether a page summary is usable as a biography.

    Returns ``(extract, page_url)`` when all of the following hold, else
    ``None``:

    * ``type`` is ``"standard"`` or absent;
    * the stripped ``extract`` has at least ``MIN_EXTRACT_LEN`` characters;
    * *surname* is non-empty and occurs (case-insensitively) in the extract
      or the description;
    * at least one ``SPORT_KW`` substring occurs in the same text;
    * ``content_urls.desktop.page`` is present.
    """
    if not summary:
        return None
    if summary.get("type") not in ("standard", None):
        return None
    extract = (summary.get("extract") or "").strip()
    if len(extract) < MIN_EXTRACT_LEN:
        return None
    needle = (surname or "").strip().lower()
    if not needle:
        # An empty needle is a substring of everything and would match any page.
        return None
    haystack = (extract + " " + (summary.get("description") or "")).lower()
    if needle not in haystack:
        return None  # not about the same person
    if not any(kw in haystack for kw in SPORT_KW):
        return None
    desktop = (summary.get("content_urls") or {}).get("desktop") or {}
    page_url = desktop.get("page")
    if not page_url:
        return None
    return extract, page_url


def _iter_candidates(
    full_name: str, surname: str
) -> Iterator[Tuple[str, str, str, str]]:
    """Yield ``(lang, title, extract, page_url)`` for every acceptable hit.

    Languages are tried in ``LANGS`` order and only the first
    ``MAX_RESULTS_PER_LANG`` search hits of each are inspected. The generator
    pauses after each language round; a consumer that stops at the first
    candidate never reaches that pause.
    """
    for lang in LANGS:
        for title, _url in wiki_search(full_name, lang)[:MAX_RESULTS_PER_LANG]:
            lowered = title.lower()
            if "razdvojba" in lowered or "disambiguation" in lowered:
                continue
            picked = select_biography(wiki_summary(title, lang), surname)
            if picked is not None:
                extract, page_url = picked
                yield lang, title, extract, page_url
        time.sleep(REQUEST_PAUSE_S)


def _enrich_target(cr, tid, ime, prez, sport, klub) -> bool:
    """Try to store a Wikipedia biography for one member; return True on success.

    Candidates are tried in order; a database error on one candidate is
    logged and the next candidate is attempted.
    """
    if not prez or not prez.strip():
        # Without a surname there is nothing to verify a match against.
        return False
    full = f"{ime} {prez}" if ime else prez
    for lang, title, extract, page_url in _iter_candidates(full, prez):
        fact = (
            f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} "
            f"(Wikipedia {lang.upper()})"
        )[:2000]
        try:
            # NOTE(review): the connection runs in autocommit mode, so the
            # UPDATE is persisted even if the INSERT below fails.
            cr.execute(UPDATE_BIO_SQL, (extract[:1500], page_url, tid))
            cr.execute(
                INSERT_FACT_SQL,
                (fact, f"wikipedia_{lang}", 0.9, "biografija_sportasa"),
            )
        except Exception as exc:  # per-item boundary: log and keep going
            out(f" ERR {full}: {exc}")
            continue
        out(f" ✓ [{lang}] {full} matched: {title} - {len(extract)} chars")
        return True
    return False


def main():
    """Enrich up to 150 members and print progress to stdout."""
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines without the PostgreSQL driver installed.
    import psycopg2

    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        with conn.cursor() as cr:
            cr.execute(TARGETS_SQL)
            targets = cr.fetchall()
            out(f"Targets: {len(targets)}")
            enriched = 0
            tried = 0
            for tid, ime, prez, sport, _kat, klub in targets:
                tried += 1
                if _enrich_target(cr, tid, ime, prez, sport, klub):
                    enriched += 1
                if tried % 25 == 0:
                    out(f" Progress: tried={tried} enriched={enriched}")
                time.sleep(REQUEST_PAUSE_S)
            out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even if a query or request handler raised.
        conn.close()


if __name__ == "__main__":
    main()