007825acee
DB: - Aggressive je_klub=false flag for programs/treninzi/totals (>100K€ no klub_id) - 53 ne-klubovi flagged false (RSS Rijeka ukupni, Stručni rad, Potpora loptačkim, etc) Frontend (sport2.html): - Panel back button (← Natrag) + history stack - window._panelHistory + pushPanelState + panelBack functions - closePanel resets history
81 lines
3.8 KiB
Python
81 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Wikipedia deep PGZ encyclopedia."""
|
|
import sys, json, time
|
|
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
|
from _common import chunk_text, upsert_facts, DSN, UA
|
|
from urllib.parse import urlencode, quote
|
|
import urllib.request
|
|
import psycopg2
|
|
|
|
# MediaWiki Action API endpoints of the Croatian and English Wikipedia
# editions; main() passes each of them to wiki_extract() for every title.
API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"
|
|
|
|
|
|
def wiki_extract(api, title, timeout=15):
    """Fetch the plain-text extract of a single wiki page.

    Issues an ``action=query&prop=extracts`` request (plain text, redirects
    followed, JSON format) for ``title`` against the endpoint ``api``.

    Args:
        api: Base URL of a MediaWiki ``api.php`` endpoint.
        title: Title of the page to look up.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The page's extract text (``""`` when the page entry has no
        ``extract`` field), or ``None`` when the page is missing/invalid,
        the response contains no page, or the request or decoding fails.
        The function never raises; failures are reported on stderr.
    """
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    url = api + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            payload = json.loads(resp.read())

        pages = payload.get("query", {}).get("pages", {})
        # Exactly one title is requested, so the first entry is the page.
        page_id, page = next(iter(pages.items()), (None, None))
        if page is None:
            return None
        # Non-existent or invalid titles come back with a negative page id
        # and a "missing"/"invalid" marker instead of content.
        if str(page_id).startswith("-") or "missing" in page or "invalid" in page:
            return None
        return page.get("extract", "")
    except Exception as exc:
        # Deliberately broad: this is a best-effort harvester and one bad
        # page must not abort the run. Report the failure, though, so an
        # outage is not silently mistaken for a missing article.
        print(f"wiki_extract: {title!r} via {api} failed: {exc!r}", file=sys.stderr)
        return None
|
|
|
|
|
|
# Harvest plan: category name -> Wikipedia page titles to fetch.
# The key is passed to upsert_facts() as ``category``; each title is sent to
# both API endpoints and is also used to build the stored article URL.
# NOTE(review): titles are plain ASCII with underscores; whether each one
# resolves (directly or via redirect) on hr.wikipedia is not verified here.
# NOTE(review): "Susak" and "Unije" are listed under both wiki_pgz_grad and
# wiki_pgz_otok, so they are fetched once per category.
PAGES = {
    "wiki_pgz_grad": [
        "Rijeka", "Opatija", "Crikvenica", "Krk_(grad)", "Kraljevica",
        "Rab_(grad)", "Cres_(grad)", "Mali_Losinj", "Delnice", "Vrbovsko",
        "Cabar", "Bakar", "Kastav", "Novi_Vinodolski", "Susak", "Unije",
    ],
    "wiki_pgz_opcina": [
        "Opcina_Viskovo", "Opcina_Klana", "Opcina_Lovran", "Opcina_Matulji",
        "Opcina_Omisalj", "Opcina_Punat", "Opcina_Vrbnik", "Opcina_Baska",
        "Opcina_Dobrinj", "Opcina_Jelenje", "Opcina_Kostrena", "Opcina_Cavle",
        "Opcina_Lopar", "Opcina_Brod_Moravice", "Opcina_Mrkopalj",
        "Opcina_Ravna_Gora", "Opcina_Lokve", "Opcina_Skrad", "Opcina_Fuzine",
    ],
    "wiki_pgz_otok": [
        "Krk", "Cres", "Losinj", "Rab", "Susak", "Unije", "Ilovik", "Ist",
        "Goli_otok", "Sveti_Grgur",
    ],
    "wiki_pgz_povijest": [
        "Vinodolski_zakonik", "Frankopani", "Krcki_knezovi",
        "Liburnija", "Liburni", "Trsat", "Tvrdjava_Trsat",
        "Slobodna_Drzava_Rijeka", "Rijecka_rezolucija",
    ],
    "wiki_pgz_kultura": [
        "Glagoljica", "Bascanska_ploca", "Rijecki_karneval",
        "Halubajski_zvoncari", "Hrvatsko_narodno_kazaliste_Ivana_pl._Zajca",
    ],
    "wiki_pgz_priroda": [
        "Ucka", "Risnjak", "Park_prirode_Ucka",
        "Nacionalni_park_Risnjak", "Velebit", "Kvarnerski_zaljev",
    ],
    "wiki_pgz_gospodarstvo": [
        "Luka_Rijeka", "Brodogradiliste_3._maj",
        "Brodogradiliste_Viktor_Lenac", "Rafinerija_nafte_Rijeka",
    ],
    "wiki_pgz_obrazovanje": [
        "Sveuciliste_u_Rijeci", "Tehnicki_fakultet_u_Rijeci",
        "Pomorski_fakultet_u_Rijeci", "Filozofski_fakultet_u_Rijeci",
        "Medicinski_fakultet_u_Rijeci",
    ],
    "wiki_pgz_osobe": ["Janica_Kostelic", "Ivica_Kostelic", "Janko_Polic_Kamov"],
}
|
|
|
|
|
|
def main():
    """Harvest every title in PAGES from both Wikipedia editions.

    For each category and title the extract is requested from the hr and en
    endpoints. Missing extracts and extracts shorter than 250 characters are
    skipped; the rest are split with ``chunk_text(text, 700)``, chunks of
    100 characters or fewer are dropped, and the remainder is handed to
    ``upsert_facts`` together with the article URL and title.

    Prints one progress line per category, a summary line, and finally a
    JSON object with ``pages_found`` and ``total_facts`` on stdout.
    """
    wikis = ((API_HR, "hr"), (API_EN, "en"))
    total = 0
    found = 0

    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        for category, titles in PAGES.items():
            cnt = 0
            for title in titles:
                for api, lang in wikis:
                    text = wiki_extract(api, title)
                    # Pause after every request, hit or miss, so a run of
                    # missing pages cannot hammer the API back-to-back.
                    time.sleep(0.4)
                    if not text or len(text) < 250:
                        continue
                    found += 1
                    page_url = f"https://{lang}.wikipedia.org/wiki/{quote(title)}"
                    facts = [
                        {"fact": c, "url": page_url, "title": title}
                        for c in chunk_text(text, 700)
                        if len(c) > 100
                    ]
                    # presumably the number of facts written - confirm
                    # against upsert_facts in _common.
                    n = upsert_facts(conn, facts, source_name=f"wikipedia_pgz_{lang}",
                                     category=category, confidence=0.88)
                    total += n
                    cnt += n
            print(f" {category:25} +{cnt:>5}f")
    finally:
        # Release the connection even when a fetch or upsert raises mid-run.
        conn.close()

    print(f"=== TOTAL pages={found} facts={total} ===")
    print(json.dumps({"pages_found": found, "total_facts": total}))
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the harvest only when executed as a script, not when imported.
    main()
|