#!/usr/bin/env python3
"""Wikipedia PGŽ sport corpus.

Reads the active PGŽ sport clubs from PostgreSQL, fetches the plain-text
extract of each club's Croatian Wikipedia page (falling back to a title
search), splits the text into chunks and stores every chunk as a fact in
``dabi.knowledge``.
"""
# ═══════════════════════════════════════════════════════════════════
# File: wiki_pgz_sport.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/wiki_pgz_sport.py
# Purpose: Wikipedia HR/EN scrape — PGŽ sport clubs + athletes
#   - Iterate over all known PGŽ clubs
#   - Wiki API → page extract
#   - Plus historical match results from Wikipedia season tables
#     (NOTE(review): not implemented in the code below)
# ═══════════════════════════════════════════════════════════════════

import hashlib
import json
import logging
import os
import re  # noqa: F401  (kept: pre-existing import)
import sys  # noqa: F401  (kept: pre-existing import)
import time
import urllib.parse
import urllib.request

import psycopg2
from dotenv import load_dotenv
from psycopg2.extras import execute_batch  # noqa: F401  (kept: pre-existing import)
from psycopg2.extras import execute_values

# Must run before DSN is built, because DSN reads DB_PASSWORD from the
# environment at import time.
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

logging.basicConfig(level=logging.INFO, format="%(asctime)s [wiki_sport] %(message)s")
log = logging.getLogger("wiki_sport")

DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"

API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"

# Upper bound on clubs processed per run ("limit first run" in the original).
MAX_CLUBS = 200
# Pause between clubs so the Wikipedia API is not hammered.
REQUEST_DELAY_S = 0.5
# Extracts at or below this length are not stored.
MIN_PAGE_LEN = 200
# Place names accepted as evidence that a search hit is a relevant page.
PLACE_HINTS = ("Rijeka", "Opatija", "Krk")


def wiki_extract(api, title, sentences=None):
    """Get the plain-text extract for a Wikipedia page.

    Args:
        api: Base URL of the MediaWiki ``api.php`` endpoint.
        title: Page title to look up (redirects are followed).
        sentences: Optional number of sentences to request (``exsentences``).

    Returns:
        The extract text (possibly ``""``), or ``None`` when the page does
        not exist, the response contains no pages, or the request fails.
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    if sentences:
        params["exsentences"] = str(sentences)
    url = api + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            d = json.loads(r.read())
        pages = d.get("query", {}).get("pages", {})
        # Only one title is requested, so only the first entry matters.
        for pid, p in pages.items():
            if pid == "-1":
                return None  # not found
            return p.get("extract", "")
    except Exception as e:  # best-effort scraper: log and carry on
        log.warning("wiki err %s: %s", title, e)
    return None


def wiki_search(api, query, limit=5):
    """Search Wikipedia for related pages.

    Args:
        api: Base URL of the MediaWiki ``api.php`` endpoint.
        query: Search string.
        limit: Maximum number of hits to request.

    Returns:
        A list of page titles; empty when nothing matches or the request
        fails (the failure is logged).
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "format": "json",
        "srlimit": str(limit),
    }
    url = api + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            d = json.loads(r.read())
        return [p["title"] for p in d.get("query", {}).get("search", [])]
    except Exception as e:  # best-effort, but do not fail silently
        log.warning("wiki search err %s: %s", query, e)
        return []


def get_pgz_clubs(conn):
    """Fetch active PGŽ clubs from DB.

    Args:
        conn: Open psycopg2 connection.

    Returns:
        A list of ``(naziv, skraceni_naziv)`` tuples ordered by ``naziv``;
        a NULL ``skraceni_naziv`` is returned as ``""``.
    """
    with conn.cursor() as cur:  # closes the cursor even if the query fails
        cur.execute("""
            SELECT naziv, COALESCE(skraceni_naziv, '')
            FROM pgz_sport.klubovi
            WHERE COALESCE(aktivan, true) = true
            ORDER BY naziv
        """)
        return [(r[0], r[1]) for r in cur.fetchall()]


def chunk(text, max_len=700):
    """Split *text* into pieces of at most ``max_len`` characters.

    A piece is cut after a sentence/line separator when one is found in
    the second half of the window; otherwise it is cut hard at ``max_len``.
    When the text is split, pieces of 80 characters or fewer are dropped.
    Text that already fits is returned as a single piece regardless of
    length.

    Args:
        text: Text to split.
        max_len: Maximum piece length; must be positive.

    Returns:
        A list of text pieces (empty for empty input).

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    if max_len <= 0:
        # The original looped forever here because the window never advanced.
        raise ValueError("max_len must be positive")
    if len(text) <= max_len:
        return [text] if text else []
    out = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        if end < len(text):
            # NOTE: the first separator *type* with a hit past the midpoint
            # wins. Do not "improve" this: the chunk text feeds data_hash,
            # so different boundaries would defeat de-duplication against
            # rows stored by earlier runs.
            for sep in [". ", "! ", "? ", "\n"]:
                p = text.rfind(sep, start, end)
                if p > start + max_len // 2:
                    end = p + len(sep)
                    break
        out.append(text[start:end].strip())
        start = end
    return [c for c in out if len(c) > 80]


def upsert(conn, facts):
    """Insert facts into ``dabi.knowledge``, skipping known ``data_hash`` values.

    Args:
        conn: Open psycopg2 connection.
        facts: Iterable of dicts with a required ``"fact"`` key and optional
            ``"confidence"`` (default 0.84), ``"page"`` and ``"lang"`` keys.

    Returns:
        The number of rows actually inserted; 0 when *facts* is empty or
        the insert fails (the failure is logged).
    """
    if not facts:
        return 0
    rows = [
        (
            f["fact"],
            "wikipedia_pgz_sport",
            "pgz_sport_wiki",
            f.get("confidence", 0.84),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"page": f.get("page", ""), "lang": f.get("lang", "hr")}),
        )
        for f in facts
    ]
    # RETURNING 1 yields one row per real insert; conflicting rows yield
    # nothing. cursor.rowcount is not usable with the batch helpers because
    # it only reflects the last statement sent.
    sql = """INSERT INTO dabi.knowledge
             (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s
             ON CONFLICT (data_hash) DO NOTHING
             RETURNING 1"""
    try:
        with conn.cursor() as cur:
            inserted = execute_values(
                cur,
                sql,
                rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error("upsert: %s", e)
        return 0


def _is_relevant_title(candidate, naziv):
    """Return True when a search hit mentions the club's last word or a place hint."""
    words = naziv.split()
    hints = list(PLACE_HINTS)
    if words:
        hints.insert(0, words[-1])
    lowered = candidate.lower()
    return any(h.lower() in lowered for h in hints)


def _find_club_text(naziv):
    """Return the HR Wikipedia extract for a club, or None/"" when not found."""
    # Try direct page first
    text = wiki_extract(API_HR, naziv)
    if text:
        return text
    # Fall back to search and take the first relevant hit that has text.
    for candidate in wiki_search(API_HR, naziv, limit=3):
        if _is_relevant_title(candidate, naziv):
            text = wiki_extract(API_HR, candidate)
            if text:
                return text
    return text


def main():
    """Scrape Wikipedia for every active club and store the chunks as facts.

    Logs progress, prints a JSON summary ``{"pages": ..., "facts": ...}`` to
    stdout and always closes the database connection.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        clubs = get_pgz_clubs(conn)
        log.info("PGŽ active clubs: %s", len(clubs))

        total_facts = 0
        found_pages = 0
        for naziv, _short_name in clubs[:MAX_CLUBS]:  # limit first run
            if not naziv or not naziv.strip():
                continue  # nothing to look up; also avoids split()[-1] on ""
            text = _find_club_text(naziv)
            if text and len(text) > MIN_PAGE_LEN:
                found_pages += 1
                facts = [
                    {"fact": piece, "page": naziv, "lang": "hr", "confidence": 0.85}
                    for piece in chunk(text, 700)
                ]
                total_facts += upsert(conn, facts)
                # Report only when the counter moves, so the same milestone
                # is not logged again on every following miss.
                if found_pages % 20 == 0:
                    log.info("  progress: pages %s, facts %s", found_pages, total_facts)
            time.sleep(REQUEST_DELAY_S)

        log.info("=== DONE: pages=%s facts=%s ===", found_pages, total_facts)
        print(json.dumps({"pages": found_pages, "facts": total_facts}))
    finally:
        conn.close()


if __name__ == "__main__":
    main()