f07fdad919
DB: - pgz_sport.sufinanciranje_sport.je_klub flag (RSS programi/totals false) - pgz_sport.sufinanciranje_sport.klub_id matched Endpoints: - /v2/potpore/by-year: samo_klubovi=True default + davatelj filter Frontend: - sport2.html PANEL FORCE HIDE CSS (right:-100vw default) - crm_v2.html: redirect to /login only on actual 401, not on page load
84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Naselja PGZ — sela, zaseoci, otocna mjesta."""
|
|
import sys, json, time
|
|
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
|
from _common import chunk_text, upsert_facts, DSN, UA
|
|
from urllib.parse import urlencode, quote
|
|
import urllib.request
|
|
import psycopg2
|
|
|
|
# MediaWiki Action API endpoint of the Croatian-language Wikipedia
# (hr.wikipedia.org); every query in this script is sent here.
API_HR = "https://hr.wikipedia.org/w/api.php"
|
|
|
|
|
|
def wiki_cat_members(cat, limit=200):
    """Return the titles of the pages listed in a Wikipedia category.

    Sends a single ``list=categorymembers`` query to ``API_HR``.  Only the
    first batch of results is read (no continuation request is made), so
    the server is asked for at most ``limit`` members.

    Args:
        cat: Category title, sent verbatim as ``cmtitle``.
        limit: Value sent as ``cmlimit``.

    Returns:
        A list of member titles.  The list is empty when the response holds
        no members or when the request / response parsing fails; failures
        are reported on stderr so they can be told apart from a category
        that is genuinely empty.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": cat,
        "cmlimit": str(limit),
        "format": "json",
    }
    url = API_HR + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            payload = json.loads(resp.read())
        members = payload.get("query", {}).get("categorymembers", [])
        return [member["title"] for member in members]
    except Exception as exc:
        # Deliberately best-effort: one failing category must not abort the
        # whole harvest, but the failure should still be visible.  stderr is
        # used so that stdout output stays unchanged.
        print(f"wiki_cat_members: {cat!r} failed: {exc!r}", file=sys.stderr)
        return []
|
|
|
|
|
|
def wiki_extract(title, timeout=15):
    """Fetch the plain-text extract of a single Wikipedia page.

    Sends a ``prop=extracts`` query (``explaintext``, following redirects)
    for ``title`` to ``API_HR`` and looks at the first page entry of the
    response.

    Args:
        title: Page title, sent as ``titles``.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        The page's ``extract`` text (``""`` if the entry has no extract), or
        ``None`` when the entry's id is ``"-1"`` (the id MediaWiki uses for a
        missing title), when the response contains no pages, or when the
        request / response parsing fails.  Failures are reported on stderr.
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    url = API_HR + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            payload = json.loads(resp.read())
        page_map = payload.get("query", {}).get("pages", {})
        # Exactly one title is requested, so only the first entry matters.
        first = next(iter(page_map.items()), None)
        if first is None:
            return None
        page_id, page = first
        if page_id == "-1":
            return None
        return page.get("extract", "")
    except Exception as exc:
        # Best-effort: the caller skips pages without text, so a failure
        # only costs this one page — but it should not go unnoticed.
        print(f"wiki_extract: {title!r} failed: {exc!r}", file=sys.stderr)
        return None
|
|
|
|
|
|
# Category titles on hr.wikipedia that main() walks, in query order.
# By their names: settlements of Primorje-Gorski Kotar County, settlements on
# the islands of Krk, Cres, Lošinj and Rab, and the Gorski kotar category.
# NOTE(review): "Kategorija:Gorski_kotar" is not settlement-specific by name,
# so it presumably also lists non-settlement pages — confirm this is intended.
CATEGORIES = [
    "Kategorija:Naselja_u_Primorsko-goranskoj_županiji",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Krk)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Cres)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Lošinj)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Rab)",
    "Kategorija:Gorski_kotar",
]
|
|
|
|
|
|
def main():
    """Harvest the pages of every category in ``CATEGORIES`` into the database.

    For each category the member titles are fetched with
    ``wiki_cat_members``; each title not seen before is fetched with
    ``wiki_extract``, split with ``chunk_text(text, 600)`` and handed to
    ``upsert_facts`` under source ``wikipedia_pgz_naselja`` / category
    ``naselja_pgz``.  Titles whose extract is missing or shorter than 200
    characters are skipped, and chunks of 100 characters or fewer are dropped.

    Prints one progress line per category, then a summary line and a JSON
    object with the ``pages`` and ``total_facts`` counters.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = 0
    pages = 0
    seen = set()  # titles already handled; a page can sit in several categories

    try:
        for cat in CATEGORIES:
            members = wiki_cat_members(cat, limit=200)
            print(f" {cat[:50]:50} {len(members):>3} members")

            for title in members:
                if title in seen:
                    continue
                seen.add(title)

                text = wiki_extract(title)
                if not text or len(text) < 200:
                    continue
                pages += 1

                page_url = f"https://hr.wikipedia.org/wiki/{quote(title)}"
                facts = [
                    {"fact": chunk, "url": page_url, "title": title}
                    for chunk in chunk_text(text, 600)
                    if len(chunk) > 100
                ]
                # The return value is accumulated as the fact count
                # (presumably the number of rows written — confirm in _common).
                n = upsert_facts(conn, facts, source_name="wikipedia_pgz_naselja",
                                 category="naselja_pgz", confidence=0.86)
                total += n
                # Pause between processed pages; skipped titles `continue`
                # past this line and are not throttled.
                time.sleep(0.3)
    finally:
        # Release the connection even when a fetch or upsert raises.
        conn.close()

    print(f"=== TOTAL pages={pages} facts={total} ===")
    print(json.dumps({"pages": pages, "total_facts": total}))
|
|
|
|
|
|
# Run the harvest only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|