Bug hunt V7:

DB:
- Aggressive je_klub=false flag for programs/treninzi/totals (>100K€ no klub_id)
- 53 ne-klubovi flagged false (RSS Rijeka ukupni, Stručni rad, Potpora loptačkim, etc)

Frontend (sport2.html):
- Panel back button (← Natrag) + history stack
- window._panelHistory + pushPanelState + panelBack functions
- closePanel resets history
This commit is contained in:
2026-05-05 14:56:53 +02:00
parent 1e611d59f1
commit 007825acee
214 changed files with 15117 additions and 565 deletions
+93
View File
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""Common scraper helpers."""
import os, re, time, json, hashlib
from urllib.parse import urljoin, urlparse, urlencode, quote
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
# SECURITY: the fallback connection string below embeds a database password in
# source control. Supply the DSN through the RINET_DSN environment variable
# instead, and rotate/remove the hard-coded credential once every deployment
# sets it. The literal is kept only so existing runs keep working unchanged.
DSN = os.environ.get("RINET_DSN") or "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# User-Agent header value sent with the scrapers' outgoing HTTP requests.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return ``(body, status)``.

    Args:
        url: Address to request.
        timeout: Per-attempt socket timeout in seconds.
        retries: Maximum number of attempts.
        binary: When true, return the raw bytes instead of decoded text.

    Returns:
        ``(body, status)`` on success, where *body* is ``bytes`` or ``str``.
        ``(None, code)`` when the server answers with a non-retryable 4xx
        status, and ``(None, 0)`` when every attempt failed.
    """
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
                status = resp.status
                # Honour the charset declared by the server; fall back to UTF-8.
                charset = resp.headers.get_content_charset() or "utf-8"
            if binary:
                return data, status
            try:
                return data.decode(charset, errors="replace"), status
            except LookupError:
                # Server announced a codec Python does not know.
                return data.decode("utf-8", errors="replace"), status
        except urllib.error.HTTPError as exc:
            # Client errors will not change on retry (408/429 are transient).
            if 400 <= exc.code < 500 and exc.code not in (408, 429):
                return None, exc.code
        except Exception:
            # Best-effort scraper: any other failure just triggers a retry.
            pass
        # Linear back-off, but do not waste time sleeping after the last try.
        if attempt + 1 < retries:
            time.sleep(2 * (attempt + 1))
    return None, 0
def extract_text(html):
    """Return the visible text of an HTML document as one whitespace-normalised line.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are dropped
    together with their content, HTML comments are removed, remaining tags are
    replaced by spaces and character references are decoded.

    Returns an empty string for empty or ``None`` input.
    """
    if not html:
        return ""
    cleaned = html
    for tag in ("script", "style"):
        cleaned = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", cleaned, flags=re.S | re.I)
    # Drop comments as a unit: the generic tag pattern below would otherwise
    # stop at the first ">" inside a comment and leak the rest as text.
    cleaned = re.sub(r"<!--.*?-->", "", cleaned, flags=re.S)
    for tag in ("nav", "footer"):
        cleaned = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", cleaned, flags=re.S | re.I)
    text = re.sub(r"<[^>]+>", " ", cleaned)
    return re.sub(r"\s+", " ", unescape(text)).strip()
def extract_title(html):
    """Return the text of the first ``<title>`` element, or ``""`` if there is none.

    The opening tag may carry attributes (e.g. ``<title lang="hr">``).
    Character references are decoded and runs of whitespace collapsed.
    """
    m = re.search(r"<title\b[^>]*>([^<]*)</title\s*>", html or "", re.I)
    if not m:
        return ""
    return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
def chunk_text(text, max_len=800):
    """Split *text* into chunks of at most *max_len* characters.

    Text that already fits is returned as a single chunk, whatever its
    length. Longer text is cut at *max_len*, preferring to end a chunk just
    after a sentence separator found in the second half of the window.
    Resulting chunks of 80 characters or fewer are discarded.

    Args:
        text: String to split; empty or ``None`` yields ``[]``.
        max_len: Maximum chunk length; must be positive when splitting.

    Raises:
        ValueError: If *text* needs splitting and *max_len* is less than 1
            (the split loop could never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")
    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Separators are tried in priority order; the first kind that has
            # an occurrence past the window's midpoint decides the cut.
            for sep in (". ", "! ", "? ", "\n"):
                pos = text.rfind(sep, start, end)
                if pos > start + max_len // 2:
                    end = pos + len(sep)
                    break
        chunks.append(text[start:end].strip())
        start = end
    return [c for c in chunks if len(c) > 80]
def upsert_facts(conn, facts, source_name, category, confidence=0.85):
    """Insert *facts* into ``dabi.knowledge`` and return how many rows were new.

    Each fact is keyed by the MD5 of its text (``data_hash``); rows whose hash
    already exists are skipped via ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. The function does not commit.
        facts: Iterable of dicts with a ``"fact"`` key and optional ``"url"``
            and ``"title"`` keys.
        source_name: Value stored in the ``source`` column.
        category: Value stored in the ``category`` column.
        confidence: Value stored in the ``confidence`` column.

    Returns:
        Number of rows actually inserted; ``0`` for empty input or when the
        database call fails (the error is reported on stderr).
    """
    if not facts:
        return 0
    # Function-scope imports: neither name is imported at module level.
    import sys
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            source_name,
            category,
            confidence,
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", ""), "title": f.get("title", "")}),
        )
        for f in facts
    ]
    # RETURNING lets us count inserted rows exactly; cursor.rowcount after a
    # batched execute does not cover all the statements that were run.
    sql = ("INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) "
           "VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1")
    try:
        with conn.cursor() as cur:
            inserted = execute_values(
                cur, sql, rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50, fetch=True,
            )
        return len(inserted)
    except Exception as exc:
        # Stay best-effort for the crawlers, but do not hide the failure.
        print(f"upsert_facts failed for {source_name}/{category}: {exc}", file=sys.stderr)
        return 0
HREF_RE = re.compile("href=[\"']([^\"']+)[\"']")
# Character references terminated by ";" — the only ones decoded in hrefs, so
# that plain query strings such as "?a=1&copy=2" are left untouched.
_ENTITY_RE = re.compile(r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);")
# Path suffixes of static assets and documents that are not crawlable HTML.
_ASSET_EXTENSIONS = (
    ".css", ".js", ".json", ".xml", ".rss", ".ico", ".png", ".jpg", ".jpeg",
    ".gif", ".svg", ".webp", ".pdf", ".zip", ".doc", ".docx", ".xls", ".xlsx",
    ".mp3", ".mp4", ".woff", ".woff2", ".ttf",
)


def find_internal_links(html, base_url):
    """Return absolute same-host page links found in *html*.

    Every ``href`` attribute is resolved against *base_url*, stripped of its
    fragment and kept only when it is an http(s) URL on the same host as
    *base_url* and does not point at a static asset (stylesheets, images,
    PDFs, ...). ``href`` attributes of ``<link>`` elements are matched too,
    which is why the asset filter is needed.

    Returns:
        De-duplicated list of URLs in document order; ``[]`` for empty input.
    """
    if not html:
        return []
    base_host = urlparse(base_url).hostname or ""
    found = {}  # dict keys keep first-seen order, unlike a set
    for m in HREF_RE.finditer(html):
        href = _ENTITY_RE.sub(lambda ent: unescape(ent.group(0)), m.group(1)).strip()
        try:
            target = urljoin(base_url, href).split("#")[0]
            parsed = urlparse(target)
            host = parsed.hostname or ""
        except ValueError:
            # Malformed authority (e.g. a broken IPv6 literal): skip the link.
            continue
        if parsed.scheme not in ("http", "https") or host != base_host:
            continue
        if parsed.path.lower().endswith(_ASSET_EXTENSIONS):
            continue
        found[target] = None
    return list(found)
+68
View File
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""Gospodarstvo PGZ — luke, brodogradilista, komora."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2
# Crawl seeds for the economy harvester: maps a source name to the list of
# start URLs for that site. main() passes each pair to crawl(), which stores
# the key as the fact's source_name.
GOSPOD = {
"luka_rijeka": ["https://www.lukarijeka.hr/"],
"brodogradiliste_3maj":["https://www.3maj.hr/"],
"viktor_lenac": ["https://www.lenac.hr/"],
"ina_rafinerija": ["https://www.ina.hr/"],
"rrif": ["https://www.rrif.hr/"],
"hgk_rijeka": ["https://www.hgk.hr/zk-rijeka"],
"porin": ["https://www.porin.hr/"],
"tehnopolis": ["https://www.tehnopolis.hr/"],
"step_ri": ["https://step-ri.hr/"],
"luka_ploce": ["https://www.luka-ploce.hr/"],
"rijeka_gateway": ["https://www.rijeka-gateway.com/"],
}
def crawl(name, urls, max_pages=12):
    """Breadth-first crawl of one site, storing its page text as facts.

    Starting from *urls*, fetches up to *max_pages* pages (failed fetches
    count towards the limit), stores the page title and text chunks through
    ``upsert_facts`` under category ``gospodarstvo_pgz``, and follows links
    that stay on the host of the page they were found on.

    Args:
        name: Source name stored with every fact.
        urls: Iterable of start URLs.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the counts returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever enqueued: navigation links repeated on each page must not
    # be queued again and use up the queue cap.
    queued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            page_facts = []
            if title and len(title) > 8:
                page_facts.append({"fact": f"{name} - {title}", "url": url, "title": title})
            page_facts.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800) if len(c) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=name,
                                  category="gospodarstvo_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            time.sleep(0.5)  # politeness delay between pages
    finally:
        # Release the connection even when a page raises unexpectedly.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every GOSPOD entry and print per-site lines, a total and a JSON summary."""
    results = []
    for name, urls in GOSPOD.items():
        try:
            stats = crawl(name, urls, max_pages=12)
            print(f" {name:25} {stats['visited']:>3}p {stats['facts']:>5}f")
            results.append(stats)
        except Exception as exc:
            # One failing site must not abort the remaining crawls.
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for stats in results:
        total += stats.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    summary = {"gospod_count": len(results), "total_facts": total}
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
+94
View File
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""JLS PGZ — 36 jedinica."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2
# Crawl seeds for the local-government harvester: maps a unit name to the root
# URL of its website. crawl() stores facts under the source name
# "jls_pgz_<name lower-cased>". The last entry is the county itself.
JLS_PGZ = {
"Rijeka": "https://www.rijeka.hr/",
"Opatija": "https://www.opatija.hr/",
"Crikvenica": "https://www.crikvenica.hr/",
"Krk": "https://www.grad-krk.hr/",
"Kraljevica": "https://www.kraljevica.hr/",
"Rab": "https://www.rab.hr/",
"Cres": "https://www.cres.hr/",
"Mali_Losinj": "https://www.mali-losinj.hr/",
"Delnice": "https://www.delnice.hr/",
"Vrbovsko": "https://www.vrbovsko.hr/",
"Cabar": "https://www.cabar.hr/",
"Bakar": "https://www.bakar.hr/",
"Kastav": "https://www.kastav.hr/",
"Novi_Vinodolski": "https://www.novi-vinodolski.hr/",
"Viskovo": "https://www.opcina-viskovo.hr/",
"Klana": "https://www.klana.hr/",
"Moscenicka_Draga":"https://www.moscenicka-draga.hr/",
"Lovran": "https://www.opcinalovran.hr/",
"Matulji": "https://www.matulji.hr/",
"Omisalj": "https://www.omisalj.hr/",
"Punat": "https://www.punat.hr/",
"Vrbnik": "https://www.vrbnik.hr/",
"Baska": "https://www.baska.hr/",
"Dobrinj": "https://www.opcina-dobrinj.hr/",
"Malinska": "https://www.malinska.hr/",
"Jelenje": "https://www.jelenje.hr/",
"Kostrena": "https://www.kostrena.hr/",
"Cavle": "https://www.cavle.hr/",
"Lopar": "https://www.opcina-lopar.hr/",
"Brod_Moravice": "https://www.brod-moravice.hr/",
"Mrkopalj": "https://www.mrkopalj.hr/",
"Ravna_Gora": "https://www.ravnagora.hr/",
"Lokve": "https://www.lokve.hr/",
"Skrad": "https://www.skrad.hr/",
"Fuzine": "https://www.fuzine.hr/",
"Vinodolska": "https://www.vinodolska-opcina.hr/",
"PGZ_zupanija": "https://www.pgz.hr/",
}
def crawl(name, root, max_pages=25):
    """Breadth-first crawl of one local-government site, storing page text as facts.

    Starting from *root*, fetches up to *max_pages* pages (failed fetches
    count towards the limit), stores the page title and text chunks through
    ``upsert_facts`` under category ``jls_pgz_official``, and follows only
    links on the host of *root*.

    Args:
        name: Unit name; facts are stored with source ``jls_pgz_<name.lower()>``.
        root: Start URL.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the counts returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = [root]
    # Every URL ever enqueued: navigation links repeated on each page must not
    # be queued again and use up the queue cap.
    queued = {root}
    facts = 0
    base_host = urlparse(root).hostname or ""
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            page_facts = []
            if title and len(title) > 8:
                page_facts.append({"fact": f"{name} - {title}", "url": url, "title": title})
            page_facts.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800) if len(c) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=f"jls_pgz_{name.lower()}",
                                  category="jls_pgz_official", confidence=0.90)
            for link in find_internal_links(html, url):
                if len(queue) >= 200:
                    break
                if link in queued or (urlparse(link).hostname or "") != base_host:
                    continue
                queued.add(link)
                queue.append(link)
            time.sleep(0.4)  # politeness delay between pages
    finally:
        # Release the connection even when a page raises unexpectedly.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every JLS_PGZ entry and print per-site lines, a total and a JSON summary."""
    results = []
    for name, url in JLS_PGZ.items():
        try:
            stats = crawl(name, url, max_pages=25)
            results.append(stats)
            print(f" {name:25} {stats['visited']:>3}p {stats['facts']:>5}f")
        except Exception as exc:
            # One failing site must not abort the remaining crawls.
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for stats in results:
        total += stats.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    summary = {"jls_count": len(results), "total_facts": total}
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
+69
View File
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Kultura PGZ — muzeji, kazalista, knjiznice, festival."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2
# Crawl seeds for the culture harvester: maps a source name to the list of
# start URLs for that site. main() passes each pair to crawl(), which stores
# the key as the fact's source_name.
KULTURA = {
"muzej_pomorski": ["https://ppmhp.hr/"],
"muzej_grada_rijeke": ["https://www.muzej-rijeka.hr/"],
"muzej_marine": ["https://www.maritime-museum-rijeka.com/"],
"muzej_grada_krka": ["https://www.gradkrk.hr/"],
"kazalist_zajca": ["https://www.hnk-zajc.hr/"],
"knjiznica_rijeka": ["https://gkri.hr/"],
"knjiznica_opatija": ["https://www.gradskaknjiznica-opatija.hr/"],
"rijecki_karneval": ["https://www.rijecki-karneval.hr/"],
"rijeka_ekc2020": ["https://rijeka2020.eu/"],
"art_kino_rijeka": ["https://art-kino.hr/"],
"filodrammatica": ["https://www.filodrammatica.eu/"],
"drustvo_pisaca": ["https://drustvohrvatskihknjizevnika.hr/"],
}
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl of one site, storing its page text as facts.

    Starting from *urls*, fetches up to *max_pages* pages (failed fetches
    count towards the limit), stores the page title and text chunks through
    ``upsert_facts`` under category ``kultura_pgz``, and follows links that
    stay on the host of the page they were found on.

    Args:
        name: Source name stored with every fact.
        urls: Iterable of start URLs.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the counts returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever enqueued: navigation links repeated on each page must not
    # be queued again and use up the queue cap.
    queued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            page_facts = []
            if title and len(title) > 8:
                page_facts.append({"fact": f"{name} - {title}", "url": url, "title": title})
            page_facts.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800) if len(c) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=name,
                                  category="kultura_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 50:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            time.sleep(0.5)  # politeness delay between pages
    finally:
        # Release the connection even when a page raises unexpectedly.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every KULTURA entry and print per-site lines, a total and a JSON summary."""
    results = []
    for name, urls in KULTURA.items():
        try:
            stats = crawl(name, urls, max_pages=15)
            print(f" {name:25} {stats['visited']:>3}p {stats['facts']:>5}f")
            results.append(stats)
        except Exception as exc:
            # One failing site must not abort the remaining crawls.
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for stats in results:
        total += stats.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    summary = {"kultura_count": len(results), "total_facts": total}
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
+73
View File
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""Lokalni news RSS PGZ."""
import sys, json, time, re
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, chunk_text, upsert_facts, DSN)
from html import unescape
import psycopg2
# RSS feeds to harvest, as (portal, feed URL) pairs. main() stores each feed's
# items under the source name "news_<portal>".
FEEDS = [
("novi_list", "https://www.novilist.hr/rss/rijeka.xml"),
("novi_list_pgz", "https://www.novilist.hr/rss/regija.xml"),
("rijeka_danas", "https://rijekadanas.com/feed/"),
("rijeka_in", "https://rijekain.hr/feed/"),
("primorske_novice","https://primorskenovice.hr/feed/"),
("kvarner_news", "https://www.kvarner.news/feed/"),
("oradio", "https://otvoreniradio.hr/rss/sve.xml"),
("rijeka_today", "https://www.rijekatoday.com/feed/"),
]
_RSS_ITEM_RE = re.compile(r"<item\b[^>]*>(.*?)</item>", re.S | re.I)
_RSS_CDATA_RE = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
_RSS_TAG_RE = re.compile(r"<[^>]+>")
_RSS_WS_RE = re.compile(r"\s+")


def _rss_field(item, tag):
    """Return the plain-text content of the first ``<tag>`` in *item*, or ``""``."""
    mt = re.search(f"<{tag}[^>]*>(.*?)</{tag}>", item, re.S | re.I)
    if not mt:
        return ""
    text = _RSS_CDATA_RE.sub(r"\1", mt.group(1))
    text = unescape(_RSS_TAG_RE.sub(" ", text))
    # Feeds that entity-escape their HTML (&lt;p&gt;...) only reveal the
    # markup after decoding, so strip and decode once more in that case.
    if _RSS_TAG_RE.search(text):
        text = unescape(_RSS_TAG_RE.sub(" ", text))
    return _RSS_WS_RE.sub(" ", text).strip()


def parse_rss(xml):
    """Extract the items of an RSS document with regular expressions.

    Args:
        xml: Feed source as a string.

    Returns:
        One dict per ``<item>`` (attributes on the tag are allowed) with the
        keys ``title``, ``link``, ``description`` and ``pubDate``. Values are
        plain text with CDATA unwrapped, markup removed and whitespace
        collapsed; a missing element yields ``""``.
    """
    fields = ("title", "link", "description", "pubDate")
    return [
        {field: _rss_field(m.group(1), field) for field in fields}
        for m in _RSS_ITEM_RE.finditer(xml)
    ]
def main():
    """Fetch every feed in FEEDS, store its items as facts and print a summary.

    Each item becomes one fact made of the title and the first 400 characters
    of the description, stored under source ``news_<portal>`` and category
    ``news_pgz_rss``. Feeds that cannot be fetched or yield no items are
    reported and skipped.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_inserted = 0
    try:
        for portal, url in FEEDS:
            xml, _status = fetch(url, timeout=15)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue
            items = parse_rss(xml)
            if not items:
                print(f" {portal:20} parse 0 items")
                continue
            ff = []
            for it in items:
                title = it.get("title", "")
                desc = it.get("description", "")
                if not title and not desc:
                    continue
                fact = f"{title} - {desc[:400]}".strip()
                if len(fact) < 30:
                    continue
                ff.append({"fact": fact, "url": it.get("link", ""), "title": title})
            n = upsert_facts(conn, ff, source_name=f"news_{portal}",
                             category="news_pgz_rss", confidence=0.84)
            total_inserted += n
            print(f" {portal:20} items={len(items):>3} inserted={n:>3}")
            time.sleep(1)  # politeness delay between feeds
    finally:
        # Release the connection even when a feed raises unexpectedly.
        conn.close()
    print(f"=== TOTAL inserted: {total_inserted} ===")
    print(json.dumps({"feeds": len(FEEDS), "inserted": total_inserted}))


if __name__ == "__main__":
    main()
+69
View File
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Obrazovanje PGZ — Sveuciliste + fakulteti + skole."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2
# Crawl seeds for the education harvester: maps a source name to the list of
# start URLs for that site. main() passes each pair to crawl(), which stores
# the key as the fact's source_name.
# NOTE(review): "ufri" and "ucitelji_ri" point at the same URL, so that site
# is crawled twice under two source names — confirm whether one of them was
# meant to be a different address.
EDU = {
"uniri": ["https://www.uniri.hr/"],
"ffri": ["https://www.ffri.uniri.hr/"],
"tfr": ["https://www.tehnickifakultet.uniri.hr/"],
"pfri": ["https://www.pfri.uniri.hr/"],
"med_fri": ["https://medri.uniri.hr/"],
"efri": ["https://www.efri.uniri.hr/"],
"pravniri": ["https://www.pravri.uniri.hr/"],
"ufri": ["https://www.ufri.uniri.hr/"],
"akademija_pri": ["https://www.apuri.uniri.hr/"],
"ucitelji_ri": ["https://www.ufri.uniri.hr/"],
"vss_ri": ["https://www.veleri.hr/"],
"rkc_pgz": ["https://www.rkcrijeka.hr/"],
}
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl of one site, storing its page text as facts.

    Starting from *urls*, fetches up to *max_pages* pages (failed fetches
    count towards the limit), stores the page title and text chunks through
    ``upsert_facts`` under category ``obrazovanje_pgz``, and follows links
    that stay on the host of the page they were found on.

    Args:
        name: Source name stored with every fact.
        urls: Iterable of start URLs.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the counts returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever enqueued: navigation links repeated on each page must not
    # be queued again and use up the queue cap.
    queued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            page_facts = []
            if title and len(title) > 8:
                page_facts.append({"fact": f"{name} - {title}", "url": url, "title": title})
            page_facts.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800) if len(c) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=name,
                                  category="obrazovanje_pgz", confidence=0.88)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            time.sleep(0.5)  # politeness delay between pages
    finally:
        # Release the connection even when a page raises unexpectedly.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every EDU entry and print per-site lines, a total and a JSON summary."""
    results = []
    for name, urls in EDU.items():
        try:
            stats = crawl(name, urls, max_pages=15)
            print(f" {name:25} {stats['visited']:>3}p {stats['facts']:>5}f")
            results.append(stats)
        except Exception as exc:
            # One failing site must not abort the remaining crawls.
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for stats in results:
        total += stats.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    summary = {"edu_count": len(results), "total_facts": total}
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
+70
View File
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""Servisne usluge PGZ — komunalije, transport."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2
# Crawl seeds for the public-services harvester: maps a source name to the
# list of start URLs for that site. main() passes each pair to crawl(), which
# stores the key as the fact's source_name.
# NOTE(review): the "hep_rijeka" URL path reads "elektrodalmacija" — confirm
# this is the intended page for the Rijeka area.
# NOTE(review): the key "kd_komunalc" looks like a misspelling of the host name
# "kd-komunalac"; it is stored as source_name, so renaming it would change
# stored data — verify before touching.
SERVIS = {
"kd_komunalc": ["https://www.kd-komunalac.hr/"],
"kd_kozala": ["https://www.kd-kozala.hr/"],
"rijekapromet": ["https://www.rijekapromet.hr/"],
"vodovod_pgz": ["https://www.kdvik-rijeka.hr/"],
"rgcc_plin": ["https://www.energo.hr/"],
"hep_rijeka": ["https://www.hep.hr/elektrodalmacija/"],
"rijeka_parking": ["https://www.rijekaplus.hr/"],
"ana_aerodrom": ["https://rijeka-airport.hr/"],
"rijeka_busplus": ["https://www.autotrans.hr/"],
"jadrolinija": ["https://www.jadrolinija.hr/"],
"kbc_rijeka": ["https://www.kbc-rijeka.hr/"],
"thalassotherapia": ["https://thalassotherapia-opatija.hr/"],
"klinika_opatija": ["https://www.opatija.medicus.hr/"],
}
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl of one site, storing its page text as facts.

    Starting from *urls*, fetches up to *max_pages* pages (failed fetches
    count towards the limit), stores the page title and text chunks through
    ``upsert_facts`` under category ``servisne_pgz``, and follows links that
    stay on the host of the page they were found on.

    Args:
        name: Source name stored with every fact.
        urls: Iterable of start URLs.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the counts returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever enqueued: navigation links repeated on each page must not
    # be queued again and use up the queue cap.
    queued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            page_facts = []
            if title and len(title) > 8:
                page_facts.append({"fact": f"{name} - {title}", "url": url, "title": title})
            page_facts.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800) if len(c) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=name,
                                  category="servisne_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            time.sleep(0.5)  # politeness delay between pages
    finally:
        # Release the connection even when a page raises unexpectedly.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every SERVIS entry and print per-site lines, a total and a JSON summary.

    Each site is crawled with an explicit limit of 12 pages.
    """
    results = []
    for name, urls in SERVIS.items():
        try:
            stats = crawl(name, urls, max_pages=12)
            print(f" {name:25} {stats['visited']:>3}p {stats['facts']:>5}f")
            results.append(stats)
        except Exception as exc:
            # One failing site must not abort the remaining crawls.
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for stats in results:
        total += stats.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    summary = {"servis_count": len(results), "total_facts": total}
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
+69
View File
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""TZ Kvarner + sve TZ PGZ."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2
# Crawl seeds for the tourism harvester: maps a source name to the list of
# start URLs for that site. main() passes each pair to crawl(), which stores
# the key as the fact's source_name.
TZ_SITES = {
"tz_kvarner": ["https://www.kvarner.hr/"],
"tz_rijeka": ["https://www.visitrijeka.hr/"],
"tz_opatija": ["https://www.visitopatija.com/"],
"tz_crikvenica": ["https://www.tz-crikvenica.hr/"],
"tz_krk": ["https://www.krk.hr/"],
"tz_rab": ["https://www.rab-visit.com/"],
"tz_cres": ["https://www.tzg-cres.hr/"],
"tz_losinj": ["https://www.visitlosinj.hr/"],
"tz_gorski_kotar": ["https://www.gorskikotar.hr/"],
"tz_baska": ["https://www.tz-baska.hr/"],
"tz_lovran": ["https://www.tz-lovran.hr/"],
"tz_kastav": ["https://www.tz-kastav.hr/"],
}
def crawl(name, urls, max_pages=20):
    """Breadth-first crawl of one site, storing its page text as facts.

    Starting from *urls*, fetches up to *max_pages* pages (failed fetches
    count towards the limit), stores the page title and text chunks through
    ``upsert_facts`` under category ``turizam_pgz``, and follows links that
    stay on the host of the page they were found on.

    Args:
        name: Source name stored with every fact.
        urls: Iterable of start URLs.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the counts returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever enqueued: navigation links repeated on each page must not
    # be queued again and use up the queue cap.
    queued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            page_facts = []
            if title and len(title) > 8:
                page_facts.append({"fact": f"{name} - {title}", "url": url, "title": title})
            page_facts.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800) if len(c) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=name,
                                  category="turizam_pgz", confidence=0.85)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 80:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            time.sleep(0.4)  # politeness delay between pages
    finally:
        # Release the connection even when a page raises unexpectedly.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every TZ_SITES entry and print per-site lines, a total and a JSON summary."""
    results = []
    for name, urls in TZ_SITES.items():
        try:
            stats = crawl(name, urls, max_pages=20)
            print(f" {name:25} {stats['visited']:>3}p {stats['facts']:>5}f")
            results.append(stats)
        except Exception as exc:
            # One failing site must not abort the remaining crawls.
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for stats in results:
        total += stats.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    summary = {"tz_count": len(results), "total_facts": total}
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
+80
View File
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""Wikipedia deep PGZ encyclopedia."""
import sys, json, time
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import chunk_text, upsert_facts, DSN, UA
from urllib.parse import urlencode, quote
import urllib.request
import psycopg2
# api.php endpoints of the Croatian and English Wikipedia; main() queries both
# for every title.
API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"
def wiki_extract(api, title, timeout=15):
    """Return the plain-text extract of the page *title* from the wiki at *api*.

    Sends an ``action=query&prop=extracts`` request (following redirects) and
    looks at the first page of the response.

    Returns:
        The extract string (``""`` if the page carries none), or ``None`` when
        the page id is ``"-1"``, the response lists no pages, or the request
        or decoding fails for any reason.
    """
    query = urlencode({
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    })
    try:
        request = urllib.request.Request(api + "?" + query, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            payload = json.loads(resp.read())
        pages = payload.get("query", {}).get("pages", {})
        first = next(iter(pages.items()), None)
        if first is None:
            return None
        page_id, page = first
        if page_id == "-1":
            return None
        return page.get("extract", "")
    except Exception:
        return None
# Wikipedia page titles to harvest, grouped by the category under which their
# facts are stored. main() looks every title up on both API_HR and API_EN.
# Some titles (e.g. "Susak", "Unije") appear under more than one category.
# NOTE(review): titles are ASCII-folded (e.g. "Mali_Losinj", "Cabar",
# "Opcina_Viskovo"); if the wikis only know the spellings with diacritics these
# lookups come back as missing and are silently skipped — verify against the API.
PAGES = {
"wiki_pgz_grad": ["Rijeka","Opatija","Crikvenica","Krk_(grad)","Kraljevica",
"Rab_(grad)","Cres_(grad)","Mali_Losinj","Delnice","Vrbovsko",
"Cabar","Bakar","Kastav","Novi_Vinodolski","Susak","Unije"],
"wiki_pgz_opcina": ["Opcina_Viskovo","Opcina_Klana","Opcina_Lovran","Opcina_Matulji",
"Opcina_Omisalj","Opcina_Punat","Opcina_Vrbnik","Opcina_Baska",
"Opcina_Dobrinj","Opcina_Jelenje","Opcina_Kostrena","Opcina_Cavle",
"Opcina_Lopar","Opcina_Brod_Moravice","Opcina_Mrkopalj",
"Opcina_Ravna_Gora","Opcina_Lokve","Opcina_Skrad","Opcina_Fuzine"],
"wiki_pgz_otok": ["Krk","Cres","Losinj","Rab","Susak","Unije","Ilovik","Ist",
"Goli_otok","Sveti_Grgur"],
"wiki_pgz_povijest": ["Vinodolski_zakonik","Frankopani","Krcki_knezovi",
"Liburnija","Liburni","Trsat","Tvrdjava_Trsat",
"Slobodna_Drzava_Rijeka","Rijecka_rezolucija"],
"wiki_pgz_kultura": ["Glagoljica","Bascanska_ploca","Rijecki_karneval",
"Halubajski_zvoncari","Hrvatsko_narodno_kazaliste_Ivana_pl._Zajca"],
"wiki_pgz_priroda": ["Ucka","Risnjak","Park_prirode_Ucka",
"Nacionalni_park_Risnjak","Velebit","Kvarnerski_zaljev"],
"wiki_pgz_gospodarstvo": ["Luka_Rijeka","Brodogradiliste_3._maj",
"Brodogradiliste_Viktor_Lenac","Rafinerija_nafte_Rijeka"],
"wiki_pgz_obrazovanje": ["Sveuciliste_u_Rijeci","Tehnicki_fakultet_u_Rijeci",
"Pomorski_fakultet_u_Rijeci","Filozofski_fakultet_u_Rijeci",
"Medicinski_fakultet_u_Rijeci"],
"wiki_pgz_osobe": ["Janica_Kostelic","Ivica_Kostelic","Janko_Polic_Kamov"],
}
def main():
    """Harvest every page in PAGES from both Wikipedias and print a summary.

    For each title the Croatian and the English API are queried. Extracts of
    at least 250 characters are split into chunks of up to 700 characters and
    stored under source ``wikipedia_pgz_<lang>`` with the PAGES key as
    category. Prints one line per category, then the totals and a JSON summary.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = 0
    found = 0
    try:
        for category, titles in PAGES.items():
            cnt = 0
            for title in titles:
                for api, lang in [(API_HR, "hr"), (API_EN, "en")]:
                    text = wiki_extract(api, title)
                    # Pause after every request, including misses, so runs of
                    # missing titles do not hit the API back-to-back.
                    time.sleep(0.4)
                    if not text or len(text) < 250:
                        continue
                    found += 1
                    page_url = f"https://{lang}.wikipedia.org/wiki/{quote(title)}"
                    facts = [{"fact": c, "url": page_url, "title": title}
                             for c in chunk_text(text, 700) if len(c) > 100]
                    n = upsert_facts(conn, facts, source_name=f"wikipedia_pgz_{lang}",
                                     category=category, confidence=0.88)
                    total += n
                    cnt += n
            print(f" {category:25} +{cnt:>5}f")
    finally:
        # Release the connection even when a lookup raises unexpectedly.
        conn.close()
    print(f"=== TOTAL pages={found} facts={total} ===")
    print(json.dumps({"pages_found": found, "total_facts": total}))


if __name__ == "__main__":
    main()