#!/usr/bin/env python3
"""Wikipedia deep PGZ encyclopedia.

Fetches plain-text extracts for a fixed list of Wikipedia page titles from
the Croatian and English Wikipedias, splits every extract into chunks with
``chunk_text`` and stores the chunks through ``upsert_facts``.  A per-category
progress line, a total line and a final JSON summary are printed to stdout.
"""
import http.client
import json
import sys
import time
import urllib.request
from typing import Optional
from urllib.parse import quote, urlencode

# The shared harvester helpers live outside the normal import path, so the
# directory has to be registered before ``_common`` can be imported.
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import chunk_text, upsert_facts, DSN, UA

import psycopg2

API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"

# Every title is looked up on each of these (endpoint, language code) pairs.
WIKI_APIS = [(API_HR, "hr"), (API_EN, "en")]

MIN_PAGE_CHARS = 250     # extracts shorter than this are skipped entirely
CHUNK_SIZE = 700         # second argument to chunk_text (presumably a max chunk size -- confirm in _common)
MIN_CHUNK_CHARS = 100    # chunks of this length or shorter are dropped
REQUEST_DELAY_S = 0.4    # pause after every API request
FACT_CONFIDENCE = 0.88   # confidence value handed to upsert_facts


def wiki_extract(api: str, title: str, timeout: float = 15) -> Optional[str]:
    """Return the plain-text extract of one Wikipedia page.

    Args:
        api: URL of the MediaWiki ``api.php`` endpoint to query.
        title: Page title; redirects are followed (``redirects=1``).
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The extract text (``""`` when the page entry carries no ``extract``
        field), or ``None`` when the request fails, the response cannot be
        decoded, or the page is reported as missing/invalid.

    This is a best-effort helper: request and decoding failures are reported
    on stderr and turned into ``None`` instead of being raised.
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    url = f"{api}?{urlencode(params)}"
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = json.loads(response.read())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers URLError/HTTPError/timeouts, ValueError covers bad
        # JSON or undecodable bytes.  stderr keeps stdout (which ends with
        # the JSON summary line) clean.
        print(f"wiki_extract failed for {title!r} via {api}: {exc!r}",
              file=sys.stderr)
        return None

    # Walk the response defensively: an API error object has no "query" key.
    query = payload.get("query") if isinstance(payload, dict) else None
    pages = query.get("pages") if isinstance(query, dict) else None
    if not isinstance(pages, dict):
        return None

    # Exactly one title is requested, so only the first page entry matters.
    first = next(iter(pages.items()), None)
    if first is None:
        return None
    page_id, page = first
    if not isinstance(page, dict):
        return None
    # Pages that do not exist come back under a negative id and/or carry a
    # "missing" / "invalid" marker.
    if page_id.startswith("-") or "missing" in page or "invalid" in page:
        return None
    return page.get("extract", "")


# Category name -> Wikipedia page titles harvested under that category.
# NOTE(review): titles are written without diacritics (e.g. "Cabar"), so the
# lookups depend on ``redirects=1`` resolving them -- confirm such redirects
# exist on both wikis.
PAGES = {
    "wiki_pgz_grad": ["Rijeka","Opatija","Crikvenica","Krk_(grad)","Kraljevica",
        "Rab_(grad)","Cres_(grad)","Mali_Losinj","Delnice","Vrbovsko",
        "Cabar","Bakar","Kastav","Novi_Vinodolski","Susak","Unije"],
    "wiki_pgz_opcina": ["Opcina_Viskovo","Opcina_Klana","Opcina_Lovran","Opcina_Matulji",
        "Opcina_Omisalj","Opcina_Punat","Opcina_Vrbnik","Opcina_Baska",
        "Opcina_Dobrinj","Opcina_Jelenje","Opcina_Kostrena","Opcina_Cavle",
        "Opcina_Lopar","Opcina_Brod_Moravice","Opcina_Mrkopalj",
        "Opcina_Ravna_Gora","Opcina_Lokve","Opcina_Skrad","Opcina_Fuzine"],
    "wiki_pgz_otok": ["Krk","Cres","Losinj","Rab","Susak","Unije","Ilovik","Ist",
        "Goli_otok","Sveti_Grgur"],
    "wiki_pgz_povijest": ["Vinodolski_zakonik","Frankopani","Krcki_knezovi",
        "Liburnija","Liburni","Trsat","Tvrdjava_Trsat",
        "Slobodna_Drzava_Rijeka","Rijecka_rezolucija"],
    "wiki_pgz_kultura": ["Glagoljica","Bascanska_ploca","Rijecki_karneval",
        "Halubajski_zvoncari","Hrvatsko_narodno_kazaliste_Ivana_pl._Zajca"],
    "wiki_pgz_priroda": ["Ucka","Risnjak","Park_prirode_Ucka",
        "Nacionalni_park_Risnjak","Velebit","Kvarnerski_zaljev"],
    "wiki_pgz_gospodarstvo": ["Luka_Rijeka","Brodogradiliste_3._maj",
        "Brodogradiliste_Viktor_Lenac","Rafinerija_nafte_Rijeka"],
    "wiki_pgz_obrazovanje": ["Sveuciliste_u_Rijeci","Tehnicki_fakultet_u_Rijeci",
        "Pomorski_fakultet_u_Rijeci","Filozofski_fakultet_u_Rijeci",
        "Medicinski_fakultet_u_Rijeci"],
    "wiki_pgz_osobe": ["Janica_Kostelic","Ivica_Kostelic","Janko_Polic_Kamov"],
}


def main() -> None:
    """Harvest every page in ``PAGES`` and store its chunks as facts.

    For each category and title, both wikis in ``WIKI_APIS`` are queried.
    Extracts of at least ``MIN_PAGE_CHARS`` characters are chunked, chunks
    longer than ``MIN_CHUNK_CHARS`` are passed to ``upsert_facts``, and the
    value it returns is added to the running totals.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = 0
    found = 0
    try:
        for category, titles in PAGES.items():
            cnt = 0
            for title in titles:
                for api, lang in WIKI_APIS:
                    text = wiki_extract(api, title)
                    if text and len(text) >= MIN_PAGE_CHARS:
                        found += 1
                        page_url = f"https://{lang}.wikipedia.org/wiki/{quote(title)}"
                        facts = [
                            {"fact": chunk, "url": page_url, "title": title}
                            for chunk in chunk_text(text, CHUNK_SIZE)
                            if len(chunk) > MIN_CHUNK_CHARS
                        ]
                        n = upsert_facts(
                            conn,
                            facts,
                            source_name=f"wikipedia_pgz_{lang}",
                            category=category,
                            confidence=FACT_CONFIDENCE,
                        )
                        total += n
                        cnt += n
                    # Throttle after EVERY request, including the ones that
                    # produced nothing, so runs of missing pages do not hit
                    # the API back-to-back.
                    time.sleep(REQUEST_DELAY_S)
            print(f" {category:25} +{cnt:>5}f")
    finally:
        # Release the connection even when a request or upsert raises.
        conn.close()
    print(f"=== TOTAL pages={found} facts={total} ===")
    print(json.dumps({"pages_found": found, "total_facts": total}))


if __name__ == "__main__":
    main()