Bug hunt V7:

DB: - Aggressive je_klub=false flag for programs/treninzi/totals (>100K€ no klub_id) - 53 ne-klubovi flagged false (RSS Rijeka ukupni, Stručni rad, Potpora loptačkim, etc) Frontend (sport2.html): - Panel back button (← Natrag) + history stack - window._panelHistory + pushPanelState + panelBack functions - closePanel resets history
2026-05-05 14:56:53 +02:00
parent 1e611d59f1
commit 007825acee
214 changed files with 15117 additions and 565 deletions
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""Common scraper helpers."""
+import os, re, time, json, hashlib
+from urllib.parse import urljoin, urlparse, urlencode, quote
+import urllib.request
+from html import unescape
+import psycopg2
+from psycopg2.extras import execute_batch
+
+# Connection string for the knowledge database. Set RINET_DSN in the
+# environment to override it; the literal is only a backward-compatible
+# fallback.
+# SECURITY NOTE(review): the fallback embeds a plaintext password that is now
+# in version control -- rotate the credential and drop the literal.
+DSN = os.environ.get(
+    "RINET_DSN",
+    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
+)
+# User-Agent header sent with every outbound request made by the scrapers.
+UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
+
+
+def fetch(url, timeout=20, retries=3, binary=False):
+    """Download *url* and return ``(body, status)``.
+
+    Args:
+        url: Address to request; sent with the module's ``UA`` User-Agent.
+        timeout: Per-attempt socket timeout in seconds.
+        retries: Maximum number of attempts.
+        binary: Return raw ``bytes`` when true, otherwise decoded text.
+
+    Returns:
+        ``(body, status)`` on success. ``(None, code)`` when the server answers
+        with a non-retryable 4xx error, and ``(None, 0)`` when every attempt
+        failed for any other reason. This function never raises for network
+        errors.
+    """
+    import urllib.error
+
+    for attempt in range(retries):
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": UA})
+            with urllib.request.urlopen(req, timeout=timeout) as r:
+                data = r.read()
+                status = r.status
+                # Honour the charset declared by the server; many sites are
+                # not UTF-8 and would otherwise decode to replacement chars.
+                charset = r.headers.get_content_charset() or "utf-8"
+        except urllib.error.HTTPError as e:
+            # Client errors will not change on retry; 408/429 are transient.
+            if 400 <= e.code < 500 and e.code not in (408, 429):
+                return None, e.code
+        except Exception:
+            # Deliberate best-effort: any other failure is simply retried.
+            pass
+        else:
+            if binary:
+                return data, status
+            try:
+                return data.decode(charset, errors="replace"), status
+            except LookupError:
+                # Server announced a codec Python does not know.
+                return data.decode("utf-8", errors="replace"), status
+        # Back off linearly, but do not sleep after the final attempt.
+        if attempt < retries - 1:
+            time.sleep(2 * (attempt + 1))
+    return None, 0
+
+
+def extract_text(html):
+    """Return the text of *html* as one whitespace-normalised string.
+
+    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are removed
+    together with their content, every remaining tag becomes a space, HTML
+    entities are decoded and runs of whitespace collapse to a single space.
+    Falsy input yields ``""``.
+    """
+    if not html:
+        return ""
+    stripped = html
+    # Remove whole elements first, in the same order each time.
+    for tag in ("script", "style", "nav", "footer"):
+        stripped = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", stripped,
+                          flags=re.S | re.I)
+    plain = unescape(re.sub(r"<[^>]+>", " ", stripped))
+    return re.sub(r"\s+", " ", plain).strip()
+
+
+def extract_title(html):
+    """Return the whitespace-normalised content of the first ``<title>`` element.
+
+    The opening tag may carry attributes (``<title lang="hr">``) and the
+    content may span several lines. Entities are decoded. Returns ``""`` when
+    *html* is falsy or has no title element.
+    """
+    m = re.search(r"<title\b[^>]*>(.*?)</title>", html or "", re.I | re.S)
+    return re.sub(r"\s+", " ", unescape(m.group(1))).strip() if m else ""
+
+
+def chunk_text(text, max_len=800):
+    """Split *text* into chunks of at most *max_len* characters.
+
+    Text that already fits is returned as a single chunk (or ``[]`` when it is
+    empty). Longer text is cut preferably just after a sentence separator
+    (``". "``, ``"! "``, ``"? "``, newline, tried in that order) that lies in
+    the second half of the window; otherwise it is cut at *max_len*. Chunks
+    from the long path are stripped and those of 80 characters or fewer are
+    dropped.
+
+    Raises:
+        ValueError: if *text* needs splitting but *max_len* is not positive.
+    """
+    if len(text) <= max_len:
+        return [text] if text else []
+    if max_len <= 0:
+        # A non-positive window never advances `start`; fail fast instead of
+        # looping forever.
+        raise ValueError("max_len must be positive")
+    out = []
+    start = 0
+    while start < len(text):
+        end = min(start + max_len, len(text))
+        if end < len(text):
+            for sep in (". ", "! ", "? ", "\n"):
+                p = text.rfind(sep, start, end)
+                # Only accept a break in the second half of the window so
+                # chunks do not become needlessly short.
+                if p > start + max_len // 2:
+                    end = p + len(sep)
+                    break
+        out.append(text[start:end].strip())
+        start = end
+    return [c for c in out if len(c) > 80]
+
+
+def upsert_facts(conn, facts, source_name, category, confidence=0.85):
+    """Insert *facts* into ``dabi.knowledge`` and return how many rows were new.
+
+    Args:
+        conn: Open DB-API connection (the callers in this repo use psycopg2
+            with autocommit enabled; this function does not commit itself).
+        facts: Iterable of dicts with a ``"fact"`` string and optional
+            ``"url"`` / ``"title"`` keys.
+        source_name: Value stored in the ``source`` column.
+        category: Value stored in the ``category`` column.
+        confidence: Value stored in the ``confidence`` column.
+
+    Returns:
+        Number of rows actually inserted. Rows whose MD5 ``data_hash`` already
+        exists are skipped by ``ON CONFLICT DO NOTHING`` and not counted. If a
+        statement fails, the error is reported on stderr and the count reached
+        so far is returned.
+    """
+    if not facts:
+        return 0
+    import sys
+
+    rows = [
+        (f["fact"], source_name, category, confidence,
+         hashlib.md5(f["fact"].encode()).hexdigest(),  # dedup key, not security
+         json.dumps({"url": f.get("url", ""), "title": f.get("title", "")}))
+        for f in facts
+    ]
+    sql = ("INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) "
+           "VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING")
+    inserted = 0
+    try:
+        # The context manager closes the cursor on the error path as well.
+        with conn.cursor() as cur:
+            # Execute row by row: after a batched execution `rowcount` only
+            # describes the last statement, which made the old total wrong.
+            for row in rows:
+                cur.execute(sql, row)
+                inserted += cur.rowcount
+    except Exception as exc:
+        print(f"  upsert_facts({source_name}) failed: {exc}", file=sys.stderr)
+    return inserted
+
+
+# Quoted href attribute; case-insensitive and tolerant of spaces around "=".
+HREF_RE = re.compile(r"""href\s*=\s*["']([^"']+)["']""", re.I)
+
+
+def find_internal_links(html, base_url):
+    """Return the same-host http(s) links found in *html*.
+
+    Each href is entity-decoded, resolved against *base_url* and stripped of
+    its fragment. Links are de-duplicated and returned in document order, so
+    repeated runs crawl pages in the same order. Hrefs that cannot be parsed
+    are skipped instead of aborting the whole page.
+    """
+    if not html:
+        return []
+    base_host = urlparse(base_url).hostname or ""
+    seen = {}
+    for m in HREF_RE.finditer(html):
+        try:
+            u = urljoin(base_url, unescape(m.group(1).strip()))
+            parts = urlparse(u)
+            host = parts.hostname or ""
+        except ValueError:
+            # e.g. "http://[bad" -> "Invalid IPv6 URL"
+            continue
+        if parts.scheme in ("http", "https") and host == base_host:
+            # dict keeps first-seen order while de-duplicating
+            seen.setdefault(u.split("#")[0], None)
+    return list(seen)
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Gospodarstvo PGZ — luke, brodogradilista, komora."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Seed URLs per economy-related source. main() passes each key to crawl() as
+# `name`, where it is used as the fact source and as the prefix of the title
+# fact, so renaming a key changes what is stored.
+GOSPOD = {
+    "luka_rijeka":        ["https://www.lukarijeka.hr/"],
+    "brodogradiliste_3maj":["https://www.3maj.hr/"],
+    "viktor_lenac":       ["https://www.lenac.hr/"],
+    "ina_rafinerija":     ["https://www.ina.hr/"],
+    "rrif":               ["https://www.rrif.hr/"],
+    "hgk_rijeka":         ["https://www.hgk.hr/zk-rijeka"],
+    "porin":              ["https://www.porin.hr/"],
+    "tehnopolis":         ["https://www.tehnopolis.hr/"],
+    "step_ri":            ["https://step-ri.hr/"],
+    "luka_ploce":         ["https://www.luka-ploce.hr/"],
+    "rijeka_gateway":     ["https://www.rijeka-gateway.com/"],
+}
+
+
+def crawl(name, urls, max_pages=12):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    Args:
+        name: Source label; stored as the fact source and used as the prefix
+            of the per-page title fact.
+        urls: Seed URLs to start from.
+        max_pages: Upper bound on URLs taken from the queue. Failed fetches
+            count towards it.
+
+    Returns:
+        ``{"name": ..., "visited": <urls tried>, "facts": <rows inserted>}``.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    visited = set()
+    queue = list(urls)
+    # Everything ever enqueued, so the queue cap is not used up by duplicates.
+    queued = set(queue)
+    facts = 0
+    try:
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            ff.extend({"fact": c, "url": url, "title": title}
+                      for c in chunk_text(text, 800) if len(c) > 100)
+            facts += upsert_facts(conn, ff, source_name=name,
+                                   category="gospodarstvo_pgz", confidence=0.86)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                if len(queue) >= 40:
+                    break
+                if link in queued or (urlparse(link).hostname or "") != base:
+                    continue
+                queued.add(link)
+                queue.append(link)
+            time.sleep(0.5)  # politeness delay between pages
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every GOSPOD source, print one line per source and a JSON summary.
+
+    A source whose crawl raises is reported as FAIL and left out of the totals.
+    """
+    summaries = []
+    for source, seeds in GOSPOD.items():
+        try:
+            outcome = crawl(source, seeds, max_pages=12)
+            print(f"  {source:25} {outcome['visited']:>3}p  {outcome['facts']:>5}f")
+            summaries.append(outcome)
+        except Exception as exc:
+            print(f"  {source:25} FAIL: {str(exc)[:60]}")
+    fact_total = 0
+    for outcome in summaries:
+        fact_total += outcome.get("facts", 0)
+    print(f"=== TOTAL: {fact_total} ===")
+    report = {"gospod_count": len(summaries), "total_facts": fact_total}
+    print(json.dumps(report))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""JLS PGZ — 36 jedinica."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Root URL per local-government unit: 36 towns/municipalities plus the county
+# site under "PGZ_zupanija". main() passes each key to crawl() as `name`,
+# which lower-cases it into the stored source ("jls_pgz_<name>").
+JLS_PGZ = {
+    "Rijeka":          "https://www.rijeka.hr/",
+    "Opatija":         "https://www.opatija.hr/",
+    "Crikvenica":      "https://www.crikvenica.hr/",
+    "Krk":             "https://www.grad-krk.hr/",
+    "Kraljevica":      "https://www.kraljevica.hr/",
+    "Rab":             "https://www.rab.hr/",
+    "Cres":            "https://www.cres.hr/",
+    "Mali_Losinj":     "https://www.mali-losinj.hr/",
+    "Delnice":         "https://www.delnice.hr/",
+    "Vrbovsko":        "https://www.vrbovsko.hr/",
+    "Cabar":           "https://www.cabar.hr/",
+    "Bakar":           "https://www.bakar.hr/",
+    "Kastav":          "https://www.kastav.hr/",
+    "Novi_Vinodolski": "https://www.novi-vinodolski.hr/",
+    "Viskovo":         "https://www.opcina-viskovo.hr/",
+    "Klana":           "https://www.klana.hr/",
+    "Moscenicka_Draga":"https://www.moscenicka-draga.hr/",
+    "Lovran":          "https://www.opcinalovran.hr/",
+    "Matulji":         "https://www.matulji.hr/",
+    "Omisalj":         "https://www.omisalj.hr/",
+    "Punat":           "https://www.punat.hr/",
+    "Vrbnik":          "https://www.vrbnik.hr/",
+    "Baska":           "https://www.baska.hr/",
+    "Dobrinj":         "https://www.opcina-dobrinj.hr/",
+    "Malinska":        "https://www.malinska.hr/",
+    "Jelenje":         "https://www.jelenje.hr/",
+    "Kostrena":        "https://www.kostrena.hr/",
+    "Cavle":           "https://www.cavle.hr/",
+    "Lopar":           "https://www.opcina-lopar.hr/",
+    "Brod_Moravice":   "https://www.brod-moravice.hr/",
+    "Mrkopalj":        "https://www.mrkopalj.hr/",
+    "Ravna_Gora":      "https://www.ravnagora.hr/",
+    "Lokve":           "https://www.lokve.hr/",
+    "Skrad":           "https://www.skrad.hr/",
+    "Fuzine":          "https://www.fuzine.hr/",
+    "Vinodolska":      "https://www.vinodolska-opcina.hr/",
+    "PGZ_zupanija":    "https://www.pgz.hr/",
+}
+
+
+def crawl(name, root, max_pages=25):
+    """Breadth-first crawl of one municipality site, storing page text as facts.
+
+    Args:
+        name: Unit label; lower-cased into the stored source
+            (``jls_pgz_<name>``) and used as the prefix of the title fact.
+        root: Start URL. Only links on the same host as *root* are followed.
+        max_pages: Upper bound on URLs taken from the queue. Failed fetches
+            count towards it.
+
+    Returns:
+        ``{"name": ..., "visited": <urls tried>, "facts": <rows inserted>}``.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    visited = set()
+    queue = [root]
+    # Everything ever enqueued, so the queue cap is not used up by duplicates.
+    queued = {root}
+    facts = 0
+    base_host = urlparse(root).hostname or ""
+    try:
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            ff.extend({"fact": c, "url": url, "title": title}
+                      for c in chunk_text(text, 800) if len(c) > 100)
+            facts += upsert_facts(conn, ff, source_name=f"jls_pgz_{name.lower()}",
+                                   category="jls_pgz_official", confidence=0.90)
+            for link in find_internal_links(html, url):
+                if len(queue) >= 200:
+                    break
+                if link in queued or (urlparse(link).hostname or "") != base_host:
+                    continue
+                queued.add(link)
+                queue.append(link)
+            time.sleep(0.4)  # politeness delay between pages
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every JLS_PGZ site, print one line per site and a JSON summary.
+
+    A site whose crawl raises is reported as FAIL and left out of the totals.
+    """
+    completed = []
+    for unit, root_url in JLS_PGZ.items():
+        try:
+            stats = crawl(unit, root_url, max_pages=25)
+            completed.append(stats)
+            print(f"  {unit:25} {stats['visited']:>3}p  {stats['facts']:>5}f")
+        except Exception as exc:
+            print(f"  {unit:25} FAIL: {str(exc)[:60]}")
+    fact_total = 0
+    for stats in completed:
+        fact_total += stats.get("facts", 0)
+    print(f"=== TOTAL: {fact_total} ===")
+    report = {"jls_count": len(completed), "total_facts": fact_total}
+    print(json.dumps(report))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Kultura PGZ — muzeji, kazalista, knjiznice, festival."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Seed URLs per cultural institution or event. main() passes each key to
+# crawl() as `name`, where it is used as the fact source and as the prefix of
+# the title fact, so renaming a key changes what is stored.
+KULTURA = {
+    "muzej_pomorski":     ["https://ppmhp.hr/"],
+    "muzej_grada_rijeke": ["https://www.muzej-rijeka.hr/"],
+    "muzej_marine":       ["https://www.maritime-museum-rijeka.com/"],
+    "muzej_grada_krka":   ["https://www.gradkrk.hr/"],
+    "kazalist_zajca":     ["https://www.hnk-zajc.hr/"],
+    "knjiznica_rijeka":   ["https://gkri.hr/"],
+    "knjiznica_opatija":  ["https://www.gradskaknjiznica-opatija.hr/"],
+    "rijecki_karneval":   ["https://www.rijecki-karneval.hr/"],
+    "rijeka_ekc2020":     ["https://rijeka2020.eu/"],
+    "art_kino_rijeka":    ["https://art-kino.hr/"],
+    "filodrammatica":     ["https://www.filodrammatica.eu/"],
+    "drustvo_pisaca":     ["https://drustvohrvatskihknjizevnika.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=15):
+    """Breadth-first crawl of one cultural source, storing page text as facts.
+
+    Args:
+        name: Source label; stored as the fact source and used as the prefix
+            of the per-page title fact.
+        urls: Seed URLs to start from.
+        max_pages: Upper bound on URLs taken from the queue. Failed fetches
+            count towards it.
+
+    Returns:
+        ``{"name": ..., "visited": <urls tried>, "facts": <rows inserted>}``.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    visited = set()
+    queue = list(urls)
+    # Everything ever enqueued, so the queue cap is not used up by duplicates.
+    queued = set(queue)
+    facts = 0
+    try:
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            ff.extend({"fact": c, "url": url, "title": title}
+                      for c in chunk_text(text, 800) if len(c) > 100)
+            facts += upsert_facts(conn, ff, source_name=name,
+                                   category="kultura_pgz", confidence=0.86)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                if len(queue) >= 50:
+                    break
+                if link in queued or (urlparse(link).hostname or "") != base:
+                    continue
+                queued.add(link)
+                queue.append(link)
+            time.sleep(0.5)  # politeness delay between pages
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every KULTURA source, print one line per source and a JSON summary.
+
+    A source whose crawl raises is reported as FAIL and left out of the totals.
+    """
+    summaries = []
+    for source, seeds in KULTURA.items():
+        try:
+            outcome = crawl(source, seeds, max_pages=15)
+            print(f"  {source:25} {outcome['visited']:>3}p  {outcome['facts']:>5}f")
+            summaries.append(outcome)
+        except Exception as exc:
+            print(f"  {source:25} FAIL: {str(exc)[:60]}")
+    fact_total = 0
+    for outcome in summaries:
+        fact_total += outcome.get("facts", 0)
+    print(f"=== TOTAL: {fact_total} ===")
+    report = {"kultura_count": len(summaries), "total_facts": fact_total}
+    print(json.dumps(report))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+"""Lokalni news RSS PGZ."""
+import sys, json, time, re
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, chunk_text, upsert_facts, DSN)
+from html import unescape
+import psycopg2
+
+# (portal, feed URL) pairs polled by main(). The portal label is stored as
+# the fact source with a "news_" prefix, so renaming it changes what is stored.
+FEEDS = [
+    ("novi_list",     "https://www.novilist.hr/rss/rijeka.xml"),
+    ("novi_list_pgz", "https://www.novilist.hr/rss/regija.xml"),
+    ("rijeka_danas",  "https://rijekadanas.com/feed/"),
+    ("rijeka_in",     "https://rijekain.hr/feed/"),
+    ("primorske_novice","https://primorskenovice.hr/feed/"),
+    ("kvarner_news",  "https://www.kvarner.news/feed/"),
+    ("oradio",        "https://otvoreniradio.hr/rss/sve.xml"),
+    ("rijeka_today",  "https://www.rijekatoday.com/feed/"),
+]
+
+
+def parse_rss(xml):
+    """Extract items from an RSS document with regular expressions.
+
+    Returns a list of dicts with ``title``, ``link``, ``description`` and
+    ``pubDate`` keys; a missing element yields ``""``. CDATA wrappers are
+    removed, markup is stripped both before and after entity decoding (feeds
+    often ship entity-escaped HTML in descriptions), and whitespace is
+    collapsed. ``<item>`` tags carrying attributes are accepted.
+    """
+    def grab(item, tag):
+        mt = re.search(rf"<{tag}\b[^>]*>(.*?)</{tag}>", item, re.S | re.I)
+        if not mt:
+            return ""
+        t = re.sub(r"<!\[CDATA\[(.*?)\]\]>", r"\1", mt.group(1), flags=re.S)
+        t = re.sub(r"<[^>]+>", " ", t)
+        t = unescape(t)
+        # Second pass removes markup that was entity-escaped in the feed.
+        t = re.sub(r"<[^>]+>", " ", t)
+        return re.sub(r"\s+", " ", t).strip()
+
+    return [
+        {"title": grab(m.group(1), "title"), "link": grab(m.group(1), "link"),
+         "description": grab(m.group(1), "description"),
+         "pubDate": grab(m.group(1), "pubDate")}
+        for m in re.finditer(r"<item\b[^>]*>(.*?)</item>", xml or "", re.S | re.I)
+    ]
+
+
+def main():
+    """Poll every feed in FEEDS and store one fact per usable item.
+
+    A fact is ``"<title> - <first 400 chars of description>"``; items with
+    neither title nor description, or whose fact is shorter than 30
+    characters, are skipped. Prints one status line per feed and a final
+    JSON summary.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    total_inserted = 0
+    try:
+        for portal, url in FEEDS:
+            xml, _status = fetch(url, timeout=15)
+            if not xml:
+                print(f"  {portal:20} fetch FAIL")
+                continue
+            items = parse_rss(xml)
+            if not items:
+                print(f"  {portal:20} parse 0 items")
+                continue
+
+            ff = []
+            for it in items:
+                title = it.get("title", "")
+                desc = it.get("description", "")
+                if not title and not desc:
+                    continue
+                fact = f"{title} - {desc[:400]}".strip()
+                if len(fact) < 30:
+                    continue
+                ff.append({"fact": fact, "url": it.get("link", ""), "title": title})
+
+            n = upsert_facts(conn, ff, source_name=f"news_{portal}",
+                              category="news_pgz_rss", confidence=0.84)
+            total_inserted += n
+            print(f"  {portal:20} items={len(items):>3}  inserted={n:>3}")
+            time.sleep(1)  # politeness delay between feeds
+    finally:
+        # Close the connection even if a feed raises part-way through.
+        conn.close()
+    print(f"=== TOTAL inserted: {total_inserted} ===")
+    print(json.dumps({"feeds": len(FEEDS), "inserted": total_inserted}))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Obrazovanje PGZ — Sveuciliste + fakulteti + skole."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Seed URLs per education source. main() passes each key to crawl() as
+# `name`, where it is used as the fact source and as the prefix of the title
+# fact, so renaming a key changes what is stored.
+# NOTE(review): "ufri" and "ucitelji_ri" share the same seed URL, so the
+# second crawl re-fetches the same pages -- confirm the intended address.
+EDU = {
+    "uniri":          ["https://www.uniri.hr/"],
+    "ffri":           ["https://www.ffri.uniri.hr/"],
+    "tfr":            ["https://www.tehnickifakultet.uniri.hr/"],
+    "pfri":           ["https://www.pfri.uniri.hr/"],
+    "med_fri":        ["https://medri.uniri.hr/"],
+    "efri":           ["https://www.efri.uniri.hr/"],
+    "pravniri":       ["https://www.pravri.uniri.hr/"],
+    "ufri":           ["https://www.ufri.uniri.hr/"],
+    "akademija_pri":  ["https://www.apuri.uniri.hr/"],
+    "ucitelji_ri":    ["https://www.ufri.uniri.hr/"],
+    "vss_ri":         ["https://www.veleri.hr/"],
+    "rkc_pgz":        ["https://www.rkcrijeka.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=15):
+    """Breadth-first crawl of one education source, storing page text as facts.
+
+    Args:
+        name: Source label; stored as the fact source and used as the prefix
+            of the per-page title fact.
+        urls: Seed URLs to start from.
+        max_pages: Upper bound on URLs taken from the queue. Failed fetches
+            count towards it.
+
+    Returns:
+        ``{"name": ..., "visited": <urls tried>, "facts": <rows inserted>}``.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    visited = set()
+    queue = list(urls)
+    # Everything ever enqueued, so the queue cap is not used up by duplicates.
+    queued = set(queue)
+    facts = 0
+    try:
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            ff.extend({"fact": c, "url": url, "title": title}
+                      for c in chunk_text(text, 800) if len(c) > 100)
+            facts += upsert_facts(conn, ff, source_name=name,
+                                   category="obrazovanje_pgz", confidence=0.88)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                if len(queue) >= 40:
+                    break
+                if link in queued or (urlparse(link).hostname or "") != base:
+                    continue
+                queued.add(link)
+                queue.append(link)
+            time.sleep(0.5)  # politeness delay between pages
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every EDU source, print one line per source and a JSON summary.
+
+    A source whose crawl raises is reported as FAIL and left out of the totals.
+    """
+    summaries = []
+    for source, seeds in EDU.items():
+        try:
+            outcome = crawl(source, seeds, max_pages=15)
+            print(f"  {source:25} {outcome['visited']:>3}p  {outcome['facts']:>5}f")
+            summaries.append(outcome)
+        except Exception as exc:
+            print(f"  {source:25} FAIL: {str(exc)[:60]}")
+    fact_total = 0
+    for outcome in summaries:
+        fact_total += outcome.get("facts", 0)
+    print(f"=== TOTAL: {fact_total} ===")
+    report = {"edu_count": len(summaries), "total_facts": fact_total}
+    print(json.dumps(report))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Servisne usluge PGZ — komunalije, transport."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Seed URLs per public-service source (utilities, transport, health). main()
+# passes each key to crawl() as `name`, where it is used as the fact source,
+# so renaming a key (e.g. the "kd_komunalc" spelling) changes what is stored.
+# NOTE(review): "hep_rijeka" points at the /elektrodalmacija/ path -- confirm
+# this is the intended regional page.
+SERVIS = {
+    "kd_komunalc":        ["https://www.kd-komunalac.hr/"],
+    "kd_kozala":          ["https://www.kd-kozala.hr/"],
+    "rijekapromet":       ["https://www.rijekapromet.hr/"],
+    "vodovod_pgz":        ["https://www.kdvik-rijeka.hr/"],
+    "rgcc_plin":          ["https://www.energo.hr/"],
+    "hep_rijeka":         ["https://www.hep.hr/elektrodalmacija/"],
+    "rijeka_parking":     ["https://www.rijekaplus.hr/"],
+    "ana_aerodrom":       ["https://rijeka-airport.hr/"],
+    "rijeka_busplus":     ["https://www.autotrans.hr/"],
+    "jadrolinija":        ["https://www.jadrolinija.hr/"],
+    "kbc_rijeka":         ["https://www.kbc-rijeka.hr/"],
+    "thalassotherapia":   ["https://thalassotherapia-opatija.hr/"],
+    "klinika_opatija":    ["https://www.opatija.medicus.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=15):
+    """Breadth-first crawl of one service provider, storing page text as facts.
+
+    Args:
+        name: Source label; stored as the fact source and used as the prefix
+            of the per-page title fact.
+        urls: Seed URLs to start from.
+        max_pages: Upper bound on URLs taken from the queue. Failed fetches
+            count towards it.
+
+    Returns:
+        ``{"name": ..., "visited": <urls tried>, "facts": <rows inserted>}``.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    visited = set()
+    queue = list(urls)
+    # Everything ever enqueued, so the queue cap is not used up by duplicates.
+    queued = set(queue)
+    facts = 0
+    try:
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            ff.extend({"fact": c, "url": url, "title": title}
+                      for c in chunk_text(text, 800) if len(c) > 100)
+            facts += upsert_facts(conn, ff, source_name=name,
+                                   category="servisne_pgz", confidence=0.86)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                if len(queue) >= 40:
+                    break
+                if link in queued or (urlparse(link).hostname or "") != base:
+                    continue
+                queued.add(link)
+                queue.append(link)
+            time.sleep(0.5)  # politeness delay between pages
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every SERVIS source, print one line per source and a JSON summary.
+
+    Each crawl is capped at 12 pages here. A source whose crawl raises is
+    reported as FAIL and left out of the totals.
+    """
+    summaries = []
+    for source, seeds in SERVIS.items():
+        try:
+            outcome = crawl(source, seeds, max_pages=12)
+            print(f"  {source:25} {outcome['visited']:>3}p  {outcome['facts']:>5}f")
+            summaries.append(outcome)
+        except Exception as exc:
+            print(f"  {source:25} FAIL: {str(exc)[:60]}")
+    fact_total = 0
+    for outcome in summaries:
+        fact_total += outcome.get("facts", 0)
+    print(f"=== TOTAL: {fact_total} ===")
+    report = {"servis_count": len(summaries), "total_facts": fact_total}
+    print(json.dumps(report))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""TZ Kvarner + sve TZ PGZ."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Seed URLs per tourist-board site. main() passes each key to crawl() as
+# `name`, where it is used as the fact source and as the prefix of the title
+# fact, so renaming a key changes what is stored.
+TZ_SITES = {
+    "tz_kvarner":       ["https://www.kvarner.hr/"],
+    "tz_rijeka":        ["https://www.visitrijeka.hr/"],
+    "tz_opatija":       ["https://www.visitopatija.com/"],
+    "tz_crikvenica":    ["https://www.tz-crikvenica.hr/"],
+    "tz_krk":           ["https://www.krk.hr/"],
+    "tz_rab":           ["https://www.rab-visit.com/"],
+    "tz_cres":          ["https://www.tzg-cres.hr/"],
+    "tz_losinj":        ["https://www.visitlosinj.hr/"],
+    "tz_gorski_kotar":  ["https://www.gorskikotar.hr/"],
+    "tz_baska":         ["https://www.tz-baska.hr/"],
+    "tz_lovran":        ["https://www.tz-lovran.hr/"],
+    "tz_kastav":        ["https://www.tz-kastav.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=20):
+    """Breadth-first crawl of one tourist-board site, storing page text as facts.
+
+    Args:
+        name: Source label; stored as the fact source and used as the prefix
+            of the per-page title fact.
+        urls: Seed URLs to start from.
+        max_pages: Upper bound on URLs taken from the queue. Failed fetches
+            count towards it.
+
+    Returns:
+        ``{"name": ..., "visited": <urls tried>, "facts": <rows inserted>}``.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    visited = set()
+    queue = list(urls)
+    # Everything ever enqueued, so the queue cap is not used up by duplicates.
+    queued = set(queue)
+    facts = 0
+    try:
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            ff.extend({"fact": c, "url": url, "title": title}
+                      for c in chunk_text(text, 800) if len(c) > 100)
+            facts += upsert_facts(conn, ff, source_name=name,
+                                   category="turizam_pgz", confidence=0.85)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                if len(queue) >= 80:
+                    break
+                if link in queued or (urlparse(link).hostname or "") != base:
+                    continue
+                queued.add(link)
+                queue.append(link)
+            time.sleep(0.4)  # politeness delay between pages
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every TZ_SITES source, print one line per source and a JSON summary.
+
+    A source whose crawl raises is reported as FAIL and left out of the totals.
+    """
+    summaries = []
+    for source, seeds in TZ_SITES.items():
+        try:
+            outcome = crawl(source, seeds, max_pages=20)
+            print(f"  {source:25} {outcome['visited']:>3}p  {outcome['facts']:>5}f")
+            summaries.append(outcome)
+        except Exception as exc:
+            print(f"  {source:25} FAIL: {str(exc)[:60]}")
+    fact_total = 0
+    for outcome in summaries:
+        fact_total += outcome.get("facts", 0)
+    print(f"=== TOTAL: {fact_total} ===")
+    report = {"tz_count": len(summaries), "total_facts": fact_total}
+    print(json.dumps(report))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""Wikipedia deep PGZ encyclopedia."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import chunk_text, upsert_facts, DSN, UA
+from urllib.parse import urlencode, quote
+import urllib.request
+import psycopg2
+
+# Wikipedia API endpoints; main() queries both for every title.
+API_HR = "https://hr.wikipedia.org/w/api.php"
+API_EN = "https://en.wikipedia.org/w/api.php"
+
+
+def wiki_extract(api, title, timeout=15):
+    """Return the plain-text extract of *title* from the given Wikipedia API.
+
+    Only the first page in the response is considered. Returns ``None`` when
+    that page has id ``"-1"``, when the response lists no pages, or when
+    the request or JSON decoding fails for any reason; returns ``""`` when the
+    page has no ``extract`` field.
+    """
+    query = urlencode({
+        "action": "query",
+        "prop": "extracts",
+        "explaintext": "1",
+        "redirects": "1",
+        "format": "json",
+        "titles": title,
+    })
+    address = api + "?" + query
+    try:
+        request = urllib.request.Request(address, headers={"User-Agent": UA})
+        with urllib.request.urlopen(request, timeout=timeout) as resp:
+            payload = json.loads(resp.read())
+        pages = payload.get("query", {}).get("pages", {})
+        first = next(iter(pages.items()), None)
+        if first is None:
+            return None
+        page_id, page = first
+        return None if page_id == "-1" else page.get("extract", "")
+    except Exception:
+        return None
+
+
+# Page titles to harvest, grouped by the category stored with each fact.
+# Titles are written without diacritics.
+# NOTE(review): confirm the API resolves these ASCII spellings; misses are
+# skipped silently by main().
+# NOTE(review): "Susak" and "Unije" appear under both wiki_pgz_grad and
+# wiki_pgz_otok, so they are requested twice.
+PAGES = {
+    "wiki_pgz_grad": ["Rijeka","Opatija","Crikvenica","Krk_(grad)","Kraljevica",
+                       "Rab_(grad)","Cres_(grad)","Mali_Losinj","Delnice","Vrbovsko",
+                       "Cabar","Bakar","Kastav","Novi_Vinodolski","Susak","Unije"],
+    "wiki_pgz_opcina": ["Opcina_Viskovo","Opcina_Klana","Opcina_Lovran","Opcina_Matulji",
+                         "Opcina_Omisalj","Opcina_Punat","Opcina_Vrbnik","Opcina_Baska",
+                         "Opcina_Dobrinj","Opcina_Jelenje","Opcina_Kostrena","Opcina_Cavle",
+                         "Opcina_Lopar","Opcina_Brod_Moravice","Opcina_Mrkopalj",
+                         "Opcina_Ravna_Gora","Opcina_Lokve","Opcina_Skrad","Opcina_Fuzine"],
+    "wiki_pgz_otok": ["Krk","Cres","Losinj","Rab","Susak","Unije","Ilovik","Ist",
+                       "Goli_otok","Sveti_Grgur"],
+    "wiki_pgz_povijest": ["Vinodolski_zakonik","Frankopani","Krcki_knezovi",
+                           "Liburnija","Liburni","Trsat","Tvrdjava_Trsat",
+                           "Slobodna_Drzava_Rijeka","Rijecka_rezolucija"],
+    "wiki_pgz_kultura": ["Glagoljica","Bascanska_ploca","Rijecki_karneval",
+                          "Halubajski_zvoncari","Hrvatsko_narodno_kazaliste_Ivana_pl._Zajca"],
+    "wiki_pgz_priroda": ["Ucka","Risnjak","Park_prirode_Ucka",
+                          "Nacionalni_park_Risnjak","Velebit","Kvarnerski_zaljev"],
+    "wiki_pgz_gospodarstvo": ["Luka_Rijeka","Brodogradiliste_3._maj",
+                                "Brodogradiliste_Viktor_Lenac","Rafinerija_nafte_Rijeka"],
+    "wiki_pgz_obrazovanje": ["Sveuciliste_u_Rijeci","Tehnicki_fakultet_u_Rijeci",
+                              "Pomorski_fakultet_u_Rijeci","Filozofski_fakultet_u_Rijeci",
+                              "Medicinski_fakultet_u_Rijeci"],
+    "wiki_pgz_osobe": ["Janica_Kostelic","Ivica_Kostelic","Janko_Polic_Kamov"],
+}
+
+
+def main():
+    """Harvest every title in PAGES from both Wikipedia APIs and store chunks.
+
+    Each extract of at least 250 characters is split into chunks of up to 700
+    characters; chunks longer than 100 characters are stored as facts under
+    the page's category. Prints one line per category and a JSON summary.
+    """
+    conn = psycopg2.connect(DSN)
+    conn.autocommit = True
+    total = 0
+    found = 0
+    try:
+        for category, titles in PAGES.items():
+            cnt = 0
+            for title in titles:
+                for api, lang in [(API_HR, "hr"), (API_EN, "en")]:
+                    text = wiki_extract(api, title)
+                    # Throttle after every request, misses included, so
+                    # failed lookups do not hit the API back to back.
+                    time.sleep(0.4)
+                    if not text or len(text) < 250:
+                        continue
+                    found += 1
+                    facts = [{"fact": c, "url": f"https://{lang}.wikipedia.org/wiki/{quote(title)}", "title": title}
+                             for c in chunk_text(text, 700) if len(c) > 100]
+                    n = upsert_facts(conn, facts, source_name=f"wikipedia_pgz_{lang}",
+                                      category=category, confidence=0.88)
+                    total += n
+                    cnt += n
+            print(f"  {category:25} +{cnt:>5}f")
+    finally:
+        # Close the connection even if a page raises part-way through.
+        conn.close()
+    print(f"=== TOTAL pages={found} facts={total} ===")
+    print(json.dumps({"pages_found": found, "total_facts": total}))
+
+
+if __name__ == "__main__":
+    main()