Crisis V7 MEGA: sufinanciranje_sport + panel + CRM auth
DB:
- pgz_sport.sufinanciranje_sport.je_klub flag (RSS programi/totals false)
- pgz_sport.sufinanciranje_sport.klub_id matched
Endpoints:
- /v2/potpore/by-year: samo_klubovi=True default + davatelj filter
Frontend:
- sport2.html PANEL FORCE HIDE CSS (right:-100vw default)
- crm_v2.html: redirect to /login only on actual 401, not on page load
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Naselja PGZ — sela, zaseoci, otocna mjesta."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import chunk_text, upsert_facts, DSN, UA
|
||||
from urllib.parse import urlencode, quote
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
|
||||
# Endpoint (api.php on hr.wikipedia.org) that every query in this module is sent to.
API_HR = "https://hr.wikipedia.org/w/api.php"
|
||||
|
||||
|
||||
def wiki_cat_members(cat, limit=200):
    """Get the titles of the pages in a Wikipedia category.

    Sends a single ``list=categorymembers`` query to ``API_HR``. Only the
    first response is read; continuation tokens are not followed, so at most
    one batch of members is returned.

    NOTE(review): no ``cmtype`` filter is sent, so per the MediaWiki API
    documentation the result may also contain sub-categories and files --
    confirm whether callers want those.

    Args:
        cat: Category title including its namespace prefix.
        limit: Value sent as ``cmlimit`` (maximum members requested).

    Returns:
        A list of member titles. This is best-effort: on any request,
        decoding or API-level error the problem is reported on stderr and an
        empty list is returned instead of raising.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": cat,
        "cmlimit": str(limit),
        "format": "json",
    }
    url = API_HR + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            d = json.loads(r.read())
        if "error" in d:
            # The API answered, but with an error payload instead of results;
            # surface it so an empty result is not mistaken for an empty category.
            print(f"wiki_cat_members: API error for {cat!r}: {d['error']!r}",
                  file=sys.stderr)
        return [m["title"] for m in d.get("query", {}).get("categorymembers", [])]
    except Exception as exc:
        # Deliberately broad: the harvester must keep going when one category
        # fails, but the failure is logged (stderr keeps stdout output clean).
        print(f"wiki_cat_members: request for {cat!r} failed: {exc!r}",
              file=sys.stderr)
        return []
|
||||
|
||||
|
||||
def wiki_extract(title, timeout=15):
    """Fetch the plain-text extract of a single Wikipedia page.

    Sends a ``prop=extracts`` query (``explaintext``, following redirects)
    for ``title`` to ``API_HR``.

    Args:
        title: Page title to look up.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        The extract text of the first page in the response (``""`` when the
        page entry carries no ``extract`` field), or ``None`` when that page
        has id ``"-1"``, when the response lists no pages, or when the
        request or decoding fails. Failures are reported on stderr rather
        than raised.
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    url = API_HR + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            d = json.loads(r.read())
        # Only one title was requested, so only the first entry is inspected.
        for pid, p in d.get("query", {}).get("pages", {}).items():
            if pid == "-1":
                return None
            return p.get("extract", "")
        # The response contained no page entries at all.
        return None
    except Exception as exc:
        # Deliberately broad: one bad page must not abort the whole harvest,
        # but the failure is logged (stderr keeps stdout output clean).
        print(f"wiki_extract: request for {title!r} failed: {exc!r}",
              file=sys.stderr)
        return None
|
||||
|
||||
|
||||
# Wikipedia category titles whose members main() harvests, in processing order.
# A title appearing in several categories is only processed once (main()
# keeps a set of titles it has already seen).
CATEGORIES = [
    # County-wide category.
    "Kategorija:Naselja_u_Primorsko-goranskoj_županiji",
    # Per-island categories.
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Krk)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Cres)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Lošinj)",
    "Kategorija:Naselja_u_Hrvatskoj_(otok_Rab)",
    # Gorski kotar category.
    "Kategorija:Gorski_kotar",
]
|
||||
|
||||
|
||||
def main():
    """Harvest the pages of every category in ``CATEGORIES`` into the facts store.

    For each category the member titles are fetched with
    ``wiki_cat_members``; each title not seen before is downloaded with
    ``wiki_extract``. Pages whose extract is missing or shorter than 200
    characters are skipped. The remaining text is split with
    ``chunk_text(text, 600)`` (presumably 600 is the chunk size -- confirm in
    ``_common``), chunks of 100 characters or fewer are dropped, and the rest
    are handed to ``upsert_facts`` together with the page URL and title.

    Prints one progress line per category, then a human-readable summary and
    a JSON summary (``pages``, ``total_facts``) on stdout.

    The database connection is always closed, even when a step raises.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = 0
    pages = 0

    # Titles already handled, so a page listed in several categories is
    # fetched and stored only once.
    seen = set()
    try:
        for cat in CATEGORIES:
            members = wiki_cat_members(cat, limit=200)
            print(f" {cat[:50]:50} {len(members):>3} members")

            for title in members:
                if title in seen:
                    continue
                seen.add(title)

                text = wiki_extract(title)
                if not text or len(text) < 200:
                    continue
                pages += 1

                facts = [
                    {
                        "fact": c,
                        "url": f"https://hr.wikipedia.org/wiki/{quote(title)}",
                        "title": title,
                    }
                    for c in chunk_text(text, 600)
                    if len(c) > 100
                ]
                n = upsert_facts(conn, facts, source_name="wikipedia_pgz_naselja",
                                 category="naselja_pgz", confidence=0.86)
                total += n
                # Pause between stored pages.
                # NOTE(review): titles skipped above bypass this delay, so
                # their requests are sent back-to-back -- confirm that is intended.
                time.sleep(0.3)
    finally:
        # Release the connection on every exit path, not just on success.
        conn.close()

    print(f"=== TOTAL pages={pages} facts={total} ===")
    print(json.dumps({"pages": pages, "total_facts": total}))
|
||||
|
||||
|
||||
# Run the harvest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user