Files
pgz-sport/scrapers/D_wiki_v2.py
T

99 lines
4.3 KiB
Python
Executable File

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""D v2: simpler & faster wiki enrichment with stdout flush."""
import os
import re, json, time, sys
import urllib.request, urllib.parse, urllib.error
import psycopg2
# Keyword arguments for the PostgreSQL connection; main() unpacks them into
# psycopg2.connect().  The password comes from the environment, so a missing
# DB_PASSWORD raises KeyError as soon as this module is loaded.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}
# User-Agent header value sent with every request made by http_get().
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
def out(msg):
    """Print *msg* to standard output and flush immediately.

    ``print(..., flush=True)`` already flushes ``sys.stdout``, so no separate
    ``sys.stdout.flush()`` call is needed.

    Args:
        msg: Object to print; converted with ``str()`` by ``print``.
    """
    print(msg, flush=True)
def http_get(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request carries the module-level ``UA`` User-Agent and an
    ``Accept-Language: hr,en`` header.  Bytes that are not valid UTF-8 are
    replaced instead of raising.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The body as ``str``, or ``None`` when the request could not be built,
        the server answered with an error status, or the transfer failed.
        Failures other than HTTP 404 are reported on standard error.
    """
    import http.client  # local import: only needed to name HTTPException below

    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA, "Accept-Language": "hr,en"})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            return r.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # 404 is presumably the ordinary "no such page" answer for a lookup by
        # guessed title, so it is not worth a log line; anything else is.
        if exc.code != 404:
            print(f"http_get: HTTP {exc.code} for {url}", file=sys.stderr, flush=True)
        return None
    except (OSError, http.client.HTTPException, ValueError) as exc:
        # OSError covers URLError, timeouts and TLS/socket errors;
        # HTTPException covers truncated or malformed responses;
        # ValueError covers URLs that urllib cannot parse.
        print(f"http_get: {type(exc).__name__} for {url}: {exc}", file=sys.stderr, flush=True)
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST ``page/summary`` record for *title*.

    Args:
        title: Page title; spaces are converted to underscores before the
            title is percent-encoded into the URL path.
        lang: Wikipedia language subdomain (for example ``"hr"`` or ``"en"``).

    Returns:
        The parsed JSON object as a ``dict``, or ``None`` when the fetch
        failed, the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is percent-encoded instead of being
    # sent as an extra path segment (quote() leaves "/" untouched by default).
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use dict methods on the result, so anything else is unusable.
    return data if isinstance(data, dict) else None
# Lower-case substrings (Croatian and English word stems) of which at least one
# must occur in a summary's extract or description for it to be accepted.
_SPORT_KEYWORDS = (
    "sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "biciklist",
    "plivat", "plivač", "athlete", "compet", "wrestle", "swimmer", "boxer",
    "diver", "skier", "sailor", "vesla", "ringa", "gimnast",
)


def _usable_summary(summary):
    """Return ``(extract, page_url)`` if *summary* passes the quality checks, else ``None``.

    A summary is accepted when its ``type`` is ``"standard"`` or absent, its
    stripped ``extract`` has at least 80 characters, the extract or
    description contains one of ``_SPORT_KEYWORDS``, and a desktop page URL is
    present.
    """
    if summary.get("type") not in ("standard", None):
        return None
    extract = (summary.get("extract") or "").strip()
    if len(extract) < 80:
        return None
    haystack = (extract + " " + (summary.get("description") or "")).lower()
    if not any(kw in haystack for kw in _SPORT_KEYWORDS):
        return None
    # "or {}" at both levels: a key that is present but null must not crash.
    desktop = (summary.get("content_urls") or {}).get("desktop") or {}
    page_url = desktop.get("page")
    if not page_url:
        return None
    return extract, page_url


def _enrich_member(cr, member_id, full, sport, klub):
    """Look up *full* on hr/en Wikipedia and store the first usable summary.

    Returns ``True`` when both database statements succeeded, ``False`` when
    no usable summary was found or writing it failed.
    """
    # Try direct title match HR + EN
    for lang in ("hr", "en"):
        summary = wiki_summary(full, lang)
        if not summary:
            continue
        usable = _usable_summary(summary)
        if usable is None:
            continue
        extract, page_url = usable
        try:
            cr.execute(
                """UPDATE pgz_sport.clanovi
                   SET biografija = %s,
                       source = COALESCE(NULLIF(source, ''), %s),
                       source_url = COALESCE(source_url, %s),
                       source_synced_at = now()
                   WHERE id = %s""",
                (extract[:1500], f"wikipedia_{lang}", page_url, member_id),
            )
            cr.execute(
                """INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                   VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                (
                    f"{full} ({sport}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                    f"wikipedia_{lang}",
                    0.9,
                    "biografija_sportasa",
                ),
            )
            out(f" ✓ [{lang}] {full} - {len(extract)} chars")
        except Exception as exc:
            # Per-record boundary: one failing row is reported and must not
            # abort the whole run.
            out(f" ERR {full}: {exc}")
            return False
        return True  # found, no need to try other lang
    return False


def main():
    """Enrich up to 100 members with a Wikipedia summary used as biography.

    Selects members from ``pgz_sport.clanovi`` that are in category 1-3 or
    have an award at level SP/EP/OI/SK and whose ``biografija`` is missing or
    shorter than 200 characters.  For each one the full name is looked up on
    Croatian, then English, Wikipedia.  The first summary that passes the
    quality checks is written to ``pgz_sport.clanovi`` and recorded in
    ``dabi.knowledge``.  Progress is printed every 20 members.

    The connection runs in autocommit mode, so each statement is committed on
    its own.  The cursor and connection are closed even when an error escapes.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        with conn.cursor() as cr:
            cr.execute(
                """SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                   FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                   WHERE (c.kategorija_hoo IN (1, 2, 3)
                          OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                      WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                     AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                   ORDER BY c.kategorija_hoo NULLS LAST LIMIT 100"""
            )
            targets = cr.fetchall()
            out(f"Targets: {len(targets)}")
            enriched = 0
            tried = 0
            for tid, ime, prez, sport, _kat, klub in targets:
                tried += 1
                # Drop NULL/empty name parts so a missing value is never
                # looked up as the literal text "None".
                full = " ".join(part for part in (ime, prez) if part).strip()
                if full and _enrich_member(cr, tid, full, sport, klub):
                    enriched += 1
                if tried % 20 == 0:
                    out(f" Progress: tried={tried} enriched={enriched}")
                # Pause between members to pace requests to Wikipedia.
                time.sleep(0.3)
            out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()


if __name__ == "__main__":
    main()