#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# File: sport_rezultati_arhivar.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/sport_rezultati_arhivar.py
# Purpose: Croatian Wikipedia HNL seasons + Croatian Cup, year by year
#   - Iterate over all HNL seasons since 1992
#   - Wikipedia API pages: "1._HNL_2023/24", "Kup_Hrvatske_u_nogometu_2024/25"
#   - Extract final standings + final matches
#   - Plus PGŽ clubs: HNK Rijeka, Orijent, Crikvenica, Opatija
# ═══════════════════════════════════════════════════════════════════
"""Historical sports-results archiver.

Guesses a list of hr.wikipedia page titles (league seasons, cup seasons,
clubs, HNK Rijeka seasons), downloads the plain-text extract of every page
that exists, splits it into chunks and inserts the chunks into the
``dabi.knowledge`` table, de-duplicated by an MD5 hash of the chunk text.
"""
# The .env file must be loaded before DSN is built below, because DSN reads
# DB_PASSWORD from the environment at import time.
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

import hashlib
import json
import os
import re  # noqa: F401  (file-level import kept as-is)
import sys
import time
import urllib.parse
import urllib.request

import psycopg2
from psycopg2.extras import execute_batch  # noqa: F401  (file-level import kept as-is)

DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"
API = "https://hr.wikipedia.org/w/api.php"


def wiki_extract(title, sentences=None):
    """Fetch the plain-text extract of one hr.wikipedia page.

    Args:
        title: Page title to look up (redirects are followed by the API).
        sentences: Optional sentence limit, sent as ``exsentences``.

    Returns:
        The extract text (possibly ``""``), or ``None`` when the page does
        not exist, the response has no pages, or the request/parsing fails.
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    if sentences:
        params["exsentences"] = str(sentences)
    url = f"{API}?{urllib.parse.urlencode(params)}"
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            payload = json.loads(resp.read())
        # Only one title is requested, so only the first page entry matters.
        for page_id, page in payload.get("query", {}).get("pages", {}).items():
            if page_id == "-1":  # the API reports a missing page under id -1
                return None
            return page.get("extract", "")
    except Exception as exc:
        # Best-effort crawl: one bad page must not abort the whole run, but
        # the failure is reported instead of being silently dropped.
        print(f"wiki_extract: {title!r} failed: {exc}", file=sys.stderr)
    return None


def chunk(text, max_len=700):
    """Split ``text`` into pieces of at most ``max_len`` characters.

    A cut is moved back to the last sentence/line separator found in the
    second half of the window, so pieces tend to end on a sentence.

    Args:
        text: Text to split.
        max_len: Maximum length of a piece before stripping.

    Returns:
        ``[text]`` (or ``[]`` for empty text) when the text already fits;
        otherwise the stripped pieces longer than 80 characters.

    Raises:
        ValueError: If splitting is needed and ``max_len`` is not positive
            (the scan could never advance).
    """
    if len(text) <= max_len:
        return [text] if text else []
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    separators = (". ", "! ", "? ", "\n")
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            for sep in separators:
                pos = text.rfind(sep, start, end)
                # Accept a separator only past the window midpoint, which
                # avoids tiny pieces and guarantees forward progress.
                if pos > start + max_len // 2:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]


def insert_facts(conn, page, text, category, confidence=0.88):
    """Insert the chunks of ``text`` into ``dabi.knowledge``.

    Rows whose ``data_hash`` already exists are skipped by the
    ``ON CONFLICT`` clause. Nothing is committed here; the caller's
    connection is expected to be in autocommit mode or to commit itself.

    Args:
        conn: Open psycopg2 connection.
        page: Wikipedia page title, stored in ``source_refs``.
        text: Page text; texts shorter than 200 characters are ignored.
        category: Value for the ``category`` column.
        confidence: Value for the ``confidence`` column.

    Returns:
        Number of rows actually inserted. If a database error occurs, the
        count of rows inserted before the error is returned.
    """
    if not text or len(text) < 200:
        return 0

    rows = [
        (
            piece,
            "wikipedia_sport_arhiv",
            category,
            confidence,
            hashlib.md5(piece.encode()).hexdigest(),
            json.dumps({"page": page}),
        )
        for piece in chunk(text, 700)
    ]
    sql = """INSERT INTO dabi.knowledge
             (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb)
             ON CONFLICT (data_hash) DO NOTHING"""

    inserted = 0
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            for row in rows:
                # Executed row by row because rowcount after a batched call
                # only reflects the last statement sent, not the total.
                cur.execute(sql, row)
                inserted += cur.rowcount
    except psycopg2.Error as exc:
        print(f"insert_facts: {page!r} failed: {exc}", file=sys.stderr)
    return inserted


def _candidate_pages():
    """Build the ordered list of ``(category, title)`` pairs to try."""
    pages = []

    # 1. HNL seasons, starting years 1992-2025 (several title spellings)
    for year in range(1992, 2026):
        yy = f"{(year + 1) % 100:02d}"
        for title in (
            f"1._HNL_{year}.",
            f"1._HNL_{year}./{yy}.",
            f"HNL_{year}/{yy}",
            f"HNL_{year}-{year + 1}",
            f"SuperSport_HNL_{year}./{yy}.",
            f"HT_Prva_HNL_{year}./{yy}.",
        ):
            pages.append(("hnl_sezona", title))

    # 2. Croatian Football Cup (by year)
    for year in range(1992, 2026):
        yy = f"{(year + 1) % 100:02d}"
        for title in (
            f"Kup_Hrvatske_u_nogometu_{year}.",
            f"Kup_Hrvatske_u_nogometu_{year}./{yy}.",
            f"Hrvatski_nogometni_kup_{year}-{year + 1}",
        ):
            pages.append(("hr_nogometni_kup", title))

    # 3. Main PGŽ clubs + their history
    for klub in (
        "HNK_Rijeka", "NK_Orijent", "NK_Krk", "NK_Crikvenica", "NK_Opatija",
        "NK_Mat-Promet", "NK_Pomorac", "NK_Naša_Slatina", "HNK_Rijeka_(boys)",
        "ŽNK_Rijeka", "HKK_Kvarner", "KK_Kvarner_2010", "KK_Lovran",
        "HMRK_Zamet", "MRK_Pomorac", "RK_Trsat", "RK_Crikvenica",
        "VK_Primorje", "VK_Rijeka", "HRK_Rijeka", "HOK_Rijeka", "OK_Rijeka",
        "HAOK_Mladost", "HAOK_Rijeka",
    ):
        pages.append(("pgz_klub_povijest", klub))

    # 4. HNK Rijeka seasons by year
    for year in range(1990, 2026):
        yy = f"{(year + 1) % 100:02d}"
        for title in (
            f"Sezona_HNK_Rijeka_{year}./{yy}.",
            f"HNK_Rijeka_u_sezoni_{year}-{year + 1}",
            f"HNK_Rijeka_{year}-{year + 1}_sezona",
        ):
            pages.append(("hnk_rijeka_sezona", title))

    return pages


def main():
    """Crawl every candidate page and store what is found.

    Returns:
        Dict with ``pages_found``, ``pages_tried`` and ``facts`` counters.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    pages = _candidate_pages()

    successful = 0
    total_facts = 0
    found_pages = []
    try:
        for category, page in pages:
            text = wiki_extract(page)
            if text and len(text) > 300:
                successful += 1
                total_facts += insert_facts(conn, page, text, category, confidence=0.88)
                found_pages.append(page)
                if successful % 10 == 0:
                    print(f" progress: {successful} pages found, {total_facts} facts")
            time.sleep(0.4)  # rate limit between API requests
    finally:
        # Release the connection even if the crawl is interrupted.
        conn.close()

    print(f"\n=== DONE: {successful}/{len(pages)} pages found, {total_facts} facts ===")
    print(f"Sample found pages: {found_pages[:15]}")
    return {"pages_found": successful, "pages_tried": len(pages), "facts": total_facts}


if __name__ == "__main__":
    print(json.dumps(main()))