PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
+112
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D v3: enrichment with Wikipedia search API + DuckDuckGo as fallback."""
|
||||
import http.client
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2
|
||||
|
||||
# Keyword arguments handed verbatim to psycopg2.connect(**DB).
#
# SECURITY: the password literal below is a credential committed to source
# control. It is kept only as a backward-compatible fallback; set the
# RINET_DB_PASSWORD environment variable to override it, then rotate the
# database password and remove the literal.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent header value sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print *msg* on standard output and flush the stream right away."""
    print(msg, file=sys.stdout, flush=True)
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch *url* with a GET request and return the response body as text.

    The request carries the module's ``UA`` string as User-Agent and an
    ``Accept-Language: hr,en`` header. The body is decoded as UTF-8;
    undecodable bytes are replaced instead of raising.

    Args:
        url: Absolute URL to fetch.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The decoded response body, or ``None`` when the request could not be
        completed (malformed URL, connection failure, timeout, HTTP error
        status, or a broken HTTP response).
    """
    # Built outside the ``try`` so that a genuine programming error (for
    # example a missing ``UA``) surfaces instead of being reported as a
    # failed download.
    headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode("utf-8", errors="replace")
    except (OSError, http.client.HTTPException, ValueError):
        # Best-effort fetch. URLError/HTTPError and socket timeouts are
        # OSError subclasses, HTTPException covers malformed responses, and
        # ValueError covers URLs urllib refuses to parse. Anything else is a
        # bug and is allowed to propagate.
        return None
|
||||
|
||||
def wiki_search(query, lang="hr"):
    """Query the opensearch API of the *lang* Wikipedia edition for *query*.

    Returns a list of ``(title, url)`` pairs built from elements 1 and 3 of
    the JSON response (at most five results are requested). An empty list is
    returned when the download fails or the response cannot be interpreted.
    """
    endpoint = (
        f"https://{lang}.wikipedia.org/w/api.php"
        f"?action=opensearch&search={urllib.parse.quote(query)}&limit=5&format=json"
    )
    body = http_get(endpoint)
    if not body:
        return []
    try:
        payload = json.loads(body)
        titles, links = payload[1], payload[3]
        return list(zip(titles, links))
    except Exception:
        # Any malformed or unexpected payload counts as "no results".
        return []
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Fetch the REST ``page/summary`` document for *title*.

    Args:
        title: Article title; spaces are converted to underscores.
        lang: Wikipedia language edition used as the host prefix.

    Returns:
        The decoded JSON object as a ``dict``, or ``None`` when the download
        fails, the body is not valid JSON, or the JSON value is not an object.
    """
    # safe="" makes quote() escape "/" as well (it is left alone by default).
    # The title is a single path segment, so an unescaped slash would be read
    # by the server as additional path components.
    slug = urllib.parse.quote(title.replace(" ", "_"), safe="")
    raw = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}")
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use dict methods on the result; reject any other JSON type.
    return data if isinstance(data, dict) else None
|
||||
|
||||
# Lower-case substrings searched for in the lower-cased Wikipedia
# extract/description; at least one must occur for an article to be accepted
# as sport-related.
SPORT_KW = (
    "sport",
    "klub",
    "olimp",
    "natjec",
    "atlet",
    "vater",
    "ronil",
    "kuglač",
    "boćar",
    "skij",
    "karat",
    "kickbox",
    "wushu",
    "biciklist",
    "plivat",
    "plivač",
    "athlete",
    "compet",
    "wrestle",
    "swimmer",
    "boxer",
    "diver",
    "skier",
    "sailor",
    "vesla",
    "gimnast",
    "rukomet",
    "košark",
    "tenisa",
    "šahist",
    "ribolov",
    "ju-jitsu",
    "jujits",
    "automob",
    "rally",
    "racing",
    "freediv",
    "apnea",
    "free diving",
)
|
||||
|
||||
# Athletes that still lack a usable biography: HOO category 1-3, or holders of
# an award at one of the listed competition levels, whose biography is missing
# or shorter than 200 characters. Capped at 150 rows per run.
_SELECT_TARGETS_SQL = """
    SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
    FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
    WHERE (c.kategorija_hoo IN (1, 2, 3)
           OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                       WHERE clan_id IS NOT NULL
                         AND razina_natjecanja IN ('SP','EP','OI','SK')))
      AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
    ORDER BY c.kategorija_hoo NULLS LAST LIMIT 150
"""

_UPDATE_BIO_SQL = """
    UPDATE pgz_sport.clanovi
    SET biografija = %s,
        source_url = COALESCE(source_url, %s),
        source_synced_at = now()
    WHERE id = %s
"""

_INSERT_FACT_SQL = """
    INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
    VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING
"""


def _accept_summary(summary, surname):
    """Return ``(extract, page_url)`` if *summary* fits the athlete, else ``None``.

    A summary is accepted when it is a dict whose ``type`` is ``"standard"``
    or absent, its stripped extract has at least 80 characters, the extract
    plus description contains *surname* and at least one ``SPORT_KW`` entry
    (case-insensitively), and a desktop page URL is present.
    """
    if not isinstance(summary, dict):
        return None
    if summary.get("type") not in ("standard", None):
        return None
    extract = (summary.get("extract") or "").strip()
    if len(extract) < 80:
        return None
    haystack = (extract + " " + (summary.get("description") or "")).lower()
    if surname.lower() not in haystack:
        return None  # not about the same person
    if not any(kw in haystack for kw in SPORT_KW):
        return None
    # "or {}" on both levels: either key may be present with a null value.
    desktop = (summary.get("content_urls") or {}).get("desktop") or {}
    page_url = desktop.get("page")
    if not page_url:
        return None
    return extract, page_url


def _store_biography(conn, cr, tid, full, sport, klub, lang, extract, page_url):
    """Write the biography and the knowledge fact as one transaction.

    Raises ``psycopg2.Error`` (after rolling back) when either statement
    fails, so the two tables never end up half-updated.
    """
    fact = (
        f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} "
        f"(Wikipedia {lang.upper()})"
    )[:2000]
    try:
        cr.execute(_UPDATE_BIO_SQL, (extract[:1500], page_url, tid))
        cr.execute(
            _INSERT_FACT_SQL,
            (fact, f"wikipedia_{lang}", 0.9, "biografija_sportasa"),
        )
        conn.commit()
    except psycopg2.Error:
        conn.rollback()
        raise


def _enrich_athlete(conn, cr, row):
    """Try to find and store a Wikipedia biography for one target row.

    Searches the Croatian edition first, then the English one, examining at
    most three search hits per edition. Returns ``True`` once a biography has
    been stored, ``False`` if nothing suitable was found or stored.
    """
    tid, ime, prez, sport, _kat, klub = row
    if not prez:
        # The surname is required for both the search and the identity check.
        out(f" SKIP id={tid}: missing surname")
        return False
    full = f"{ime} {prez}" if ime else prez

    for lang in ("hr", "en"):
        stored = False
        for title, _link in wiki_search(full, lang)[:3]:
            lowered = title.lower()
            if "razdvojba" in lowered or "disambiguation" in lowered:
                continue
            match = _accept_summary(wiki_summary(title, lang), prez)
            if match is None:
                continue
            extract, page_url = match
            try:
                _store_biography(
                    conn, cr, tid, full, sport, klub, lang, extract, page_url
                )
            except psycopg2.Error as e:
                out(f" ERR {full}: {e}")
                continue
            out(f" ✓ [{lang}] {full} matched: {title} - {len(extract)} chars")
            stored = True
            break
        # Pause after each language attempt to stay polite towards the API.
        time.sleep(0.2)
        if stored:
            return True
    return False


def main():
    """Enrich athlete biographies in the database from Wikipedia summaries.

    Selects up to 150 target athletes, tries to find a matching Wikipedia
    article for each one, stores the extract as the biography together with a
    knowledge-base fact, and prints progress every 25 athletes plus a final
    summary line. The database connection is always closed on exit.
    """
    conn = psycopg2.connect(**DB)
    try:
        # Explicit transactions so the UPDATE and INSERT for one athlete are
        # committed (or rolled back) together.
        conn.autocommit = False
        with conn.cursor() as cr:
            cr.execute(_SELECT_TARGETS_SQL)
            targets = cr.fetchall()
            # End the read transaction now; the HTTP work below is slow and
            # must not keep a transaction open on the server.
            conn.rollback()
            out(f"Targets: {len(targets)}")

            enriched = 0
            for tried, row in enumerate(targets, 1):
                if _enrich_athlete(conn, cr, row):
                    enriched += 1
                if tried % 25 == 0:
                    out(f" Progress: tried={tried} enriched={enriched}")
                time.sleep(0.2)

            out(f"=== DONE: tried={len(targets)} enriched={enriched} ===")
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: run the enrichment only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user