Files
pgz-sport/scrapers/D_curated.py_prije_env_deepseek

144 lines
5.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""
D curated: enrich top PGŽ athletes with Wikipedia extracts via hand-curated wiki titles.
Faster and more reliable than search-based approaches.
For each known athlete the wiki title (HR/EN) is hardcoded and its summary is pulled directly.
"""
import os
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# Connection parameters passed to psycopg2.connect(). The password comes from
# the DB_PASSWORD environment variable, so loading this module raises KeyError
# when that variable is not set.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# User-Agent header sent with every outgoing HTTP request.
# NOTE(review): Wikimedia asks bots for a UA with contact details — confirm
# whether this value should carry a URL or e-mail address.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=10):
    """Fetch *url* with the bot User-Agent and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or None when the request fails for any reason.

    The call stays best-effort and never raises, but failures other than
    HTTP 404 are reported on stderr so that timeouts, DNS problems or server
    errors can be told apart from a page that simply does not exist.
    """
    import urllib.error  # local import: only needed to single out 404s

    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # 404 is the expected answer for a missing page; stay quiet about it.
        if exc.code != 404:
            print(f"  http_get: HTTP {exc.code} for {url}",
                  file=sys.stderr, flush=True)
        return None
    except Exception as exc:
        # Deliberate catch-all: a failed fetch must not abort the whole run.
        print(f"  http_get: {type(exc).__name__} for {url}: {exc}",
              file=sys.stderr, flush=True)
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title*.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object as a dict, or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" also percent-encodes "/": with the default safe="/" a title
    # containing a slash would be sent as extra path segments and address a
    # different route instead of the intended page.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        return None
    # Callers use dict methods on the result; reject any other JSON value.
    return data if isinstance(data, dict) else None
# Curated list of (full_name, hr_wiki_title, en_wiki_title) tuples.
# A title of None means no page is known for that language; entries with both
# titles set to None are kept as placeholders and never produce a wiki fetch.
# Every athlete must appear only once: each entry triggers its own lookup,
# fetch and database write.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Water polo (PGŽ)
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Bocce (boćanje) legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Athletics: Sara Kolak is already listed under Olympic medalists above;
    # the second copy that used to sit here was removed.
]
def main():
    """Enrich curated athletes in pgz_sport.clanovi with Wikipedia extracts.

    For every CURATED entry the athlete is looked up by first and last name,
    the Croatian summary is fetched with the English one as fallback, the
    extract is stored in ``biografija`` and a fact row is inserted into
    ``dabi.knowledge``. Progress and a closing report go through out().

    The connection runs in autocommit mode, so each statement is committed on
    its own; the connection is closed even when the run aborts.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        enriched = 0
        tried = 0
        for full, hr_title, en_title in CURATED:
            tried += 1
            # First token is the given name, everything after it the surname.
            ime, _, prez = full.partition(" ")
            # Find the member record.
            cr.execute(
                """SELECT id, sport, klub_id FROM pgz_sport.clanovi
                   WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                   LIMIT 1""",
                (ime, prez),
            )
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row

            # Fetch the wiki summary: Croatian first, then English. Only a
            # standard page with a non-empty extract is accepted, so a
            # rejected response (e.g. a disambiguation page) can never be
            # written to the database and an empty Croatian extract still
            # falls through to the English page.
            extract = ""
            summary = None
            wlang = None
            for lang, title in (("hr", hr_title), ("en", en_title)):
                if not title:
                    continue
                candidate = wiki_summary(title, lang)
                if candidate and candidate.get("type") in ("standard", None):
                    text = (candidate.get("extract") or "").strip()
                    if text:
                        summary, wlang, extract = candidate, lang, text[:1500]
                        break
                time.sleep(0.2)
            if summary is None:
                out(f"{full} - no wiki page")
                continue

            desktop = (summary.get("content_urls") or {}).get("desktop") or {}
            wurl = desktop.get("page", "")
            # Build the fact text before touching the database so a
            # formatting problem cannot leave a half-applied update behind.
            fact = f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000]
            try:
                cr.execute(
                    """UPDATE pgz_sport.clanovi
                       SET biografija = %s, source_url = COALESCE(source_url, %s),
                           source_synced_at = now()
                       WHERE id = %s""",
                    (extract, wurl, cid),
                )
                cr.execute(
                    """INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                       VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                    (fact, f"wikipedia_{wlang}", 0.95, "biografija_sportasa"),
                )
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except psycopg2.Error as e:
                # Per-athlete best effort: report and move on to the next one.
                out(f" ERR {full}: {e}")
            time.sleep(0.3)

        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")

        # Summary report.
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")
        cr.execute(
            """SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
               WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15"""
        )
        out("\nTop bios:")
        for first, last, discipline, bio_len in cr.fetchall():
            # NULL columns arrive as None, which rejects a width format spec;
            # substitute a placeholder so the report cannot crash.
            out(f" {first or '':18} {last or '':18} {discipline or '?':15} {bio_len} chars")
    finally:
        conn.close()
# Run the enrichment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()