143 lines
5.7 KiB
Python
Executable File
143 lines
5.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
D curated: hand-curated wiki titles + extracts for top PGŽ athletes.
|
|
Faster + more reliable than search-based approaches.
|
|
For each known athlete, hardcode wiki title (HR/EN) and pull summary directly.
|
|
"""
|
|
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2
|
|
|
|
# PostgreSQL connection settings passed to psycopg2.connect(**DB). Every value
# can be overridden with a RINET_DB_* environment variable so credentials do
# not have to live in the source tree.
# NOTE(review): the fallback password is kept only so existing runs keep
# working. It is stored here in plain text, so rotate it and drop the fallback
# once RINET_DB_PASSWORD is provisioned wherever this script runs.
DB = dict(
    host=os.environ.get("RINET_DB_HOST", "localhost"),
    port=int(os.environ.get("RINET_DB_PORT", "5432")),
    dbname=os.environ.get("RINET_DB_NAME", "rinet_v3"),
    user=os.environ.get("RINET_DB_USER", "rinet"),
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent header value sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
|
|
|
def out(msg):
    """Print ``msg`` to standard output and flush the stream immediately.

    Flushing on every call keeps progress lines visible in real time when the
    output is piped or redirected.
    """
    print(msg, flush=True)
|
|
|
|
def http_get(url, timeout=10):
    """Fetch ``url`` with a GET request and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or None if building the request, fetching or decoding raised any
        exception. This is a best-effort helper: callers only see None.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        response = urllib.request.urlopen(request, timeout=timeout)
        with response:
            payload = response.read()
        return payload.decode("utf-8", errors="replace")
    except Exception:
        return None
|
|
|
|
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST "page summary" document for ``title``.

    Args:
        title: Page title; spaces are converted to underscores before encoding.
        lang: Wikipedia language subdomain to query (default ``"hr"``).

    Returns:
        The parsed JSON object as a dict, or None when the request fails, the
        body is not valid JSON, or the JSON value is not an object.
    """
    # safe="" also percent-encodes "/", so a title containing a slash stays a
    # single path segment instead of being read as a sub-path of the endpoint.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use dict methods on the result, so reject any other JSON value.
    return data if isinstance(data, dict) else None
|
|
|
|
# Curated list: (full_name, hr_wiki_title, en_wiki_title) - athletes with known wiki entries.
# A title of None means that language edition is skipped for the athlete.
# Each athlete must appear only once: every entry triggers its own HTTP fetch
# and database write.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Water polo (PGŽ)
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Bocce legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Athletics: Sara Kolak is already listed in the first section above, so
    # the second, identical entry that used to sit here was removed.
]
|
|
|
|
def _fetch_summary(hr_title, en_title):
    """Return ``(summary, lang)`` for the first usable page, else ``(None, None)``.

    The Croatian title is tried first, then the English one. A summary is
    usable only when it is a dict, its "type" is "standard" (or absent) and it
    has a non-blank "extract". A rejected result is never returned, so the
    caller cannot store text from a page that failed the check.
    """
    for lang, title in (("hr", hr_title), ("en", en_title)):
        if not title:
            continue
        summary = wiki_summary(title, lang)
        if isinstance(summary, dict) and summary.get("type") in ("standard", None):
            extract = summary.get("extract")
            if isinstance(extract, str) and extract.strip():
                return summary, lang
        # Short pause before falling back to the next language edition.
        time.sleep(0.2)
    return None, None


def main():
    """Enrich curated athletes in ``pgz_sport.clanovi`` with Wikipedia summaries.

    For every entry in ``CURATED`` the matching member row is looked up by
    first/last name (case-insensitive). When a usable Wikipedia summary is
    found, its extract (max 1500 chars) is written to ``biografija`` and a
    derived fact row is inserted into ``dabi.knowledge``. Progress and a final
    report of the longest biographies are printed via ``out``.

    The connection runs in autocommit mode, so each statement is committed on
    its own; it is always closed, even if an unexpected error aborts the run.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cr = conn.cursor()

        enriched = 0
        tried = 0
        # dict.fromkeys drops exact duplicate entries while keeping list order,
        # so no athlete is fetched and written twice in one run.
        for full, hr_title, en_title in dict.fromkeys(CURATED):
            tried += 1
            # First word is the given name, the remainder is the surname.
            ime, prez = full.split(" ", 1) if " " in full else (full, "")

            # Find the member ("clan") record.
            cr.execute(
                """SELECT id, sport, klub_id FROM pgz_sport.clanovi
                   WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                   LIMIT 1""",
                (ime, prez),
            )
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row

            # Fetch wiki - try hr first, then en.
            s, wlang = _fetch_summary(hr_title, en_title)
            if s is None:
                out(f" ✗ {full} - no wiki page")
                continue

            extract = s["extract"].strip()[:1500]
            # Any level of content_urls may be missing or null in the payload.
            desktop = (s.get("content_urls") or {}).get("desktop") or {}
            wurl = desktop.get("page", "")

            try:
                # COALESCE keeps an already stored source_url untouched.
                cr.execute(
                    """UPDATE pgz_sport.clanovi
                       SET biografija = %s, source_url = COALESCE(source_url, %s),
                           source_synced_at = now()
                       WHERE id = %s""",
                    (extract, wurl, cid),
                )
                cr.execute(
                    """INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                       VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                    (
                        f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000],
                        f"wikipedia_{wlang}",
                        0.95,
                        "biografija_sportasa",
                    ),
                )
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except Exception as e:
                # Per-athlete boundary: report the failure and keep going.
                out(f" ERR {full}: {e}")

            # Pause between athletes whose pages were fetched.
            time.sleep(0.3)

        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")

        # Summary
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")

        cr.execute(
            """SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
               WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15"""
        )
        out("\nTop bios:")
        for r in cr.fetchall():
            # NULL columns come back as None, which rejects a width format
            # spec, so substitute printable placeholders first.
            out(f" {r[0] or '':18} {r[1] or '':18} {r[2] or '?':15} {r[3]} chars")
    finally:
        conn.close()
|
|
|
|
# Run the enrichment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|