#!/usr/bin/env python3
"""Settlements of PGZ — villages, hamlets, island towns.

Walks a fixed list of Croatian Wikipedia categories, downloads the plain-text
extract of every member page, splits it into chunks and hands the chunks to
``upsert_facts`` under the source name ``wikipedia_pgz_naselja``.
"""
import json
import sys
import time
import urllib.request
from urllib.parse import quote, urlencode

import psycopg2

# The shared harvester helpers live outside the normal import path.
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import chunk_text, upsert_facts, DSN, UA  # noqa: E402

API_HR = "https://hr.wikipedia.org/w/api.php"


def _api_get(params, timeout):
    """Send one GET request to the hr.wikipedia API and return the decoded JSON.

    Any network, HTTP or JSON error propagates to the caller.
    """
    url = API_HR + "?" + urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def wiki_cat_members(cat, limit=200):
    """Get the titles of the pages in a Wikipedia category.

    Only a single request is made (no continuation), so at most ``limit``
    titles are returned.

    Args:
        cat: Full category title, passed to the API as ``cmtitle``.
        limit: Value sent as ``cmlimit``.

    Returns:
        A list of member titles, or an empty list if the request or the
        decoding of its response fails (a warning is written to stderr).
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": cat,
        "cmlimit": str(limit),
        "format": "json",
    }
    try:
        data = _api_get(params, timeout=20)
        members = data.get("query", {}).get("categorymembers", [])
        return [member["title"] for member in members]
    except Exception as exc:
        # Best effort: one broken category must not abort the whole harvest,
        # but the failure should still be visible to the operator.
        print(f"WARN: category fetch failed for {cat!r}: {exc}", file=sys.stderr)
        return []


def wiki_extract(title, timeout=15):
    """Fetch the plain-text extract of a Wikipedia page.

    Args:
        title: Page title; redirects are followed by the API.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        The extract text (possibly ``""``) of the first page in the response,
        or ``None`` if that page is reported missing (page id ``"-1"``), the
        response contains no pages, or the request fails (a warning is
        written to stderr).
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    try:
        data = _api_get(params, timeout=timeout)
        pages = data.get("query", {}).get("pages", {})
        # A single title was requested, so only the first entry matters.
        for page_id, page in pages.items():
            if page_id == "-1":
                return None
            return page.get("extract", "")
        return None
    except Exception as exc:
        # Best effort: skip this page, but do not hide the reason.
        print(f"WARN: extract fetch failed for {title!r}: {exc}", file=sys.stderr)
        return None


CATEGORIES = [
    "Kategorija:Naselja_u_Primorsko-goranskoj_županiji",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Krk)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Cres)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Lošinj)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Rab)",
    "Kategorija:Gorski_kotar",
]


def main():
    """Harvest every category in ``CATEGORIES`` and store the page chunks.

    Prints one progress line per category, a summary line, and finally a
    JSON object with the ``pages`` and ``total_facts`` counters.
    """
    conn = psycopg2.connect(DSN)
    total = 0
    pages = 0
    seen = set()  # titles already processed; a page can sit in several categories
    try:
        conn.autocommit = True
        for cat in CATEGORIES:
            members = wiki_cat_members(cat, limit=200)
            print(f" {cat[:50]:50} {len(members):>3} members")
            for title in members:
                if title in seen:
                    continue
                seen.add(title)
                text = wiki_extract(title)
                # Skip missing pages and extracts shorter than 200 characters.
                if not text or len(text) < 200:
                    continue
                pages += 1
                page_url = f"https://hr.wikipedia.org/wiki/{quote(title)}"
                facts = [
                    {"fact": chunk, "url": page_url, "title": title}
                    for chunk in chunk_text(text, 600)
                    if len(chunk) > 100
                ]
                # NOTE(review): the return value is summed as a fact count;
                # presumably the number of stored rows — confirm in _common.
                n = upsert_facts(
                    conn,
                    facts,
                    source_name="wikipedia_pgz_naselja",
                    category="naselja_pgz",
                    confidence=0.86,
                )
                total += n
                time.sleep(0.3)  # brief pause between harvested pages
    finally:
        # Release the connection even when a fetch or an upsert raises.
        conn.close()
    print(f"=== TOTAL pages={pages} facts={total} ===")
    print(json.dumps({"pages": pages, "total_facts": total}))


if __name__ == "__main__":
    main()