PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
Executable
+142
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D curated: hand-curated Wikipedia titles and extracts for top PGŽ athletes.
Faster and more reliable than search-based approaches.
For each known athlete, the wiki title (HR/EN) is hard-coded and the summary is pulled directly.
|
||||
"""
|
||||
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2
|
||||
|
||||
# Connection settings passed to psycopg2.connect(**DB).
# Each value can be overridden through a PGZ_DB_* environment variable; the
# literals below are kept only as backward-compatible defaults.
# SECURITY: a password committed to version control should be treated as
# leaked - rotate it and supply the new one via PGZ_DB_PASSWORD.
DB = dict(
    host=os.environ.get("PGZ_DB_HOST", "10.10.0.2"),
    port=int(os.environ.get("PGZ_DB_PORT", "6432")),
    dbname=os.environ.get("PGZ_DB_NAME", "rinet_v3"),
    user=os.environ.get("PGZ_DB_USER", "rinet"),
    password=os.environ.get("PGZ_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent header sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
||||
|
||||
def out(msg):
    """Write ``msg`` to stdout as one line and flush immediately."""
    # Flushing keeps progress visible when stdout is piped or redirected.
    print(msg, flush=True)
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The body decoded as UTF-8 (undecodable bytes are replaced), or
        ``None`` if anything at all goes wrong while requesting or reading.
    """
    body = None
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        body = payload.decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: every failure is reported to the caller as None.
        body = None
    return body
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST "page summary" document for a title.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. ``"hr"`` or ``"en"``.

    Returns:
        The decoded JSON object as a dict, or ``None`` when the request
        failed, the body was empty, was not valid JSON, or was not a JSON
        object.
    """
    # safe="" also escapes "/", so a title containing a slash stays a single
    # path segment instead of being split into several.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        return None
    # Callers use .get() on the result, so anything but an object is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
# Curated list of (full_name, hr_wiki_title, en_wiki_title) tuples.
# A title of None means that language edition is not queried for the athlete.
# Each athlete must appear only once: a repeated entry would be fetched and
# written a second time.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Water polo (PGŽ)
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Bocce (boćanje) legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Athletics: Sara Kolak is already listed under Olympic medalists above.
]
|
||||
|
||||
def _pick_summary(hr_title, en_title):
    """Return ``(summary, lang)`` for the first usable Wikipedia page.

    The Croatian title is tried before the English one. A page is usable
    when its ``type`` is "standard" (or absent) and it has a non-blank
    ``extract``. Returns ``(None, None)`` when neither title yields one, so
    a rejected page (e.g. a disambiguation page) is never handed back.
    """
    for lang, title in (("hr", hr_title), ("en", en_title)):
        if not title:
            continue
        candidate = wiki_summary(title, lang)
        if (
            isinstance(candidate, dict)
            and candidate.get("type") in ("standard", None)
            and (candidate.get("extract") or "").strip()
        ):
            return candidate, lang
        # Short pause after a miss before the next request.
        time.sleep(0.2)
    return None, None


def main():
    """Enrich curated athletes in ``pgz_sport.clanovi`` with Wikipedia text.

    For every entry in ``CURATED`` the athlete is looked up by first and last
    name (case-insensitive). When a usable Wikipedia summary is found, the
    ``biografija`` column is updated and a fact row is inserted into
    ``dabi.knowledge``. Progress and a closing summary are printed through
    ``out``. The database connection is always closed, even on error.
    """
    conn = psycopg2.connect(**DB)
    try:
        # With autocommit every statement is committed on its own, so a
        # failure for one athlete does not roll back earlier ones.
        conn.autocommit = True
        cr = conn.cursor()

        enriched = 0
        tried = 0
        # dict.fromkeys drops exact duplicate entries while keeping order, so
        # no athlete is fetched and written twice.
        for full, hr_title, en_title in dict.fromkeys(CURATED):
            tried += 1
            # First token is the given name, the remainder the surname.
            ime, _, prez = full.partition(" ")

            # Find the member (clan) record.
            cr.execute(
                """SELECT id, sport, klub_id FROM pgz_sport.clanovi
                   WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                   LIMIT 1""",
                (ime, prez),
            )
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row

            # Fetch wiki - try hr first, then en.
            summary, wlang = _pick_summary(hr_title, en_title)
            if summary is None:
                out(f" ✗ {full} - no wiki page")
                continue

            extract = summary["extract"].strip()[:1500]
            # "content_urls" or its "desktop" entry may be missing or null.
            desktop = (summary.get("content_urls") or {}).get("desktop") or {}
            wurl = desktop.get("page", "")

            try:
                # COALESCE keeps an already stored source_url untouched.
                cr.execute(
                    """UPDATE pgz_sport.clanovi
                       SET biografija = %s, source_url = COALESCE(source_url, %s),
                           source_synced_at = now()
                       WHERE id = %s""",
                    (extract, wurl, cid),
                )
                cr.execute(
                    """INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                       VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                    (
                        f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000],
                        f"wikipedia_{wlang}",
                        0.95,
                        "biografija_sportasa",
                    ),
                )
            except psycopg2.Error as e:
                out(f" ERR {full}: {e}")
            else:
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")

            time.sleep(0.3)

        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")

        # Summary
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")

        cr.execute(
            """SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
               WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15"""
        )
        out("\nTop bios:")
        for ime, prezime, sport, bio_len in cr.fetchall():
            # NULL columns arrive as None, which rejects a width format spec.
            out(f" {ime or '':18} {prezime or '':18} {sport or '?':15} {bio_len} chars")
    finally:
        conn.close()
|
||||
|
||||
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user