Crisis V7 MEGA: sufinanciranje_sport + panel + CRM auth

DB: - pgz_sport.sufinanciranje_sport.je_klub flag (RSS programi/totals false) - pgz_sport.sufinanciranje_sport.klub_id matched Endpoints: - /v2/potpore/by-year: samo_klubovi=True default + davatelj filter Frontend: - sport2.html PANEL FORCE HIDE CSS (right:-100vw default) - crm_v2.html: redirect to /login only on actual 401, not on page load
2026-05-05 15:02:47 +02:00
parent 007825acee
commit f07fdad919
18 changed files with 1235 additions and 65 deletions
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""data.gov.hr — Open Data PGZ."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import upsert_facts, DSN, UA
+import urllib.request
+import psycopg2
+
+# Base URL of the data.gov.hr action API; search() appends the action name to it.
+API = "https://data.gov.hr/api/3/action"
+
+
+def search(query, rows=50):
+    """Query the data.gov.hr ``package_search`` action and return its hits.
+
+    Args:
+        query: Free-text search term; URL-quoted before being sent.
+        rows: Value of the ``rows`` query parameter.
+
+    Returns:
+        The list found under ``result.results`` in the JSON response, or an
+        empty list if the request or decoding raises (the error is printed).
+    """
+    # urllib.parse is imported further down the module, after this definition.
+    endpoint = f"{API}/package_search?q={urllib.parse.quote(query)}&rows={rows}"
+    hits = []
+    try:
+        request = urllib.request.Request(endpoint, headers={"User-Agent": UA})
+        with urllib.request.urlopen(request, timeout=30) as resp:
+            payload = json.loads(resp.read())
+        hits = payload.get("result", {}).get("results", [])
+    except Exception as e:
+        print(f"search err: {e}")
+    return hits
+
+import urllib.parse
+
+
+def _package_to_fact(pkg):
+    """Build the fact dict for one dataset record, or None if it is too short."""
+    # Optional fields can arrive as JSON null: the key is present, so
+    # ``.get(key, default)`` would hand back None and slicing / ``.get`` on it
+    # would raise. ``or`` covers both the missing and the null case.
+    title = pkg.get("title") or ""
+    notes = (pkg.get("notes") or "")[:600]
+    org = (pkg.get("organization") or {}).get("title") or ""
+    tags = ", ".join(t.get("name") or "" for t in pkg.get("tags") or [])
+
+    fact = f"[OpenData] {title} | Org: {org} | {notes} | Tags: {tags}"[:1200]
+    if len(fact) <= 50:
+        return None
+    return {
+        "fact": fact,
+        "url": f"https://data.gov.hr/dataset/{pkg.get('name', '')}",
+        "title": title,
+    }
+
+
+def main():
+    """Search data.gov.hr for each regional keyword and store the hits as facts.
+
+    Every query is sent through ``search``; packages already seen under an
+    earlier query (same ``id``) are skipped. The facts of one query are written
+    with a single ``upsert_facts`` call, followed by a one-second pause. A
+    per-query line, a total line and a JSON summary are printed.
+    """
+    queries = [
+        "Primorsko-goranska", "Rijeka", "Opatija", "Crikvenica", "Krk",
+        "Cres", "Lošinj", "Rab", "Delnice", "Bakar", "Kvarner",
+    ]
+
+    total_inserted = 0
+    seen = set()
+
+    conn = psycopg2.connect(DSN)
+    # try/finally so the connection is released even if a query blows up.
+    try:
+        conn.autocommit = True
+        for q in queries:
+            results = search(q, rows=50)
+            ff = []
+            for pkg in results:
+                pkg_id = pkg.get("id", "")
+                if pkg_id in seen:
+                    continue
+                seen.add(pkg_id)
+
+                fact = _package_to_fact(pkg)
+                if fact is not None:
+                    ff.append(fact)
+
+            n = upsert_facts(conn, ff, source_name="data_gov_hr_pgz",
+                             category="opendata_pgz", confidence=0.85)
+            total_inserted += n
+            print(f"  query='{q}' -> {len(results)} results, {n} new facts")
+            time.sleep(1)  # pause between search requests
+    finally:
+        conn.close()
+
+    print(f"=== TOTAL: {total_inserted} ===")
+    print(json.dumps({"queries": len(queries), "total_facts": total_inserted}))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Public events i festivali PGZ."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+EVENTS = {
+    "rijeka_karneval":    ["https://www.rijecki-karneval.hr/", "https://rijekakarneval.hr/"],
+    "ljeto_kvarner":      ["https://www.kvarner-ljeto.hr/"],
+    "rijeka_film_fest":   ["https://www.riff.hr/"],
+    "vinski_festival_op": ["https://opatijawine.hr/"],
+    "festival_culture":   ["https://www.festival-of-cultures.hr/"],
+    "muzicki_kvarner":    ["https://www.kvarnermusic.hr/"],
+    "porto_etno":         ["https://www.portoetno.eu/"],
+    "ri_rock":            ["https://rockkonferencija.com/"],
+    "fjeshta_lovran":     ["https://www.fjeshta.hr/"],
+    "njanje_zvoncari":    ["https://halubajskizvoncari.hr/"],
+    "skoljka_festival":   ["https://www.kostrena.hr/"],
+    "sumski_film":        ["https://www.lifftrijeka.hr/"],
+    "zicfest":            ["https://www.zicfest.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=12):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 30 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="events_pgz", confidence=0.85)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 30:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every EVENTS source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in EVENTS.items():
+        try:
+            r = crawl(name, seeds, max_pages=10)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"events_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Gorski kotar deep — Risnjak, Delnice, Cabar, Lokve."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+GORSKI = {
+    "tz_gorski_kotar":    ["https://www.gorskikotar.hr/", "https://www.tz-gorskikotar.hr/"],
+    "np_risnjak":         ["https://www.np-risnjak.hr/"],
+    "delnice_info":       ["https://delnice.com/"],
+    "cabar_info":         ["https://www.cabar.hr/"],
+    "lokvarsko_jezero":   ["https://www.lokve.hr/"],
+    "fuzine_info":        ["https://www.fuzine.hr/"],
+    "ravnagora_blog":     ["https://www.ravnagora.hr/"],
+    "skrad_info":         ["https://www.skrad.hr/"],
+    "vrbovsko_info":      ["https://www.vrbovsko.hr/"],
+    "spilja_lokvarka":    ["https://www.spilja-lokvarka.com/"],
+    "skijanje_platak":    ["https://www.platak.hr/"],
+    "snowboard_klub":     ["https://www.snowboardklub-rijeka.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=15):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 40 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="gorski_kotar_pgz", confidence=0.85)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 40:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every GORSKI source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in GORSKI.items():
+        try:
+            r = crawl(name, seeds, max_pages=12)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"gorski_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""Media deep crawl — full pages of local portals."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl() (which stores
+# it as "media_<label>"), value is the list of start URLs for that source.
+MEDIA = {
+    "novilist_rijeka":    ["https://www.novilist.hr/rijeka/", "https://www.novilist.hr/regija/"],
+    "rijekadanas_full":   ["https://rijekadanas.com/category/rijeka/", "https://rijekadanas.com/category/pgz/"],
+    "rijekain_full":      ["https://rijekain.hr/category/rijeka/"],
+    "primorske_full":     ["https://primorskenovice.hr/"],
+    "rkc_blog":           ["https://www.rkcrijeka.hr/blog/"],
+    "rijeka2020_arhiva":  ["https://rijeka2020.eu/category/news/"],
+    "kulturpunkt_ri":     ["https://www.kulturpunkt.hr/tag/rijeka"],
+    "5portala_hr_pgz":    ["https://www.5portala.hr/regije/primorsko-goranska/"],
+}
+
+
+def crawl(name, urls, max_pages=20):
+    """Breadth-first crawl of one media portal, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 300 characters of extracted text yields
+    one title fact (titles longer than 12 characters) plus one fact per text
+    chunk longer than 120 characters, written through ``upsert_facts`` under
+    the source name ``media_<name>``. Links on the same host are appended
+    while the queue holds under 60 items.
+
+    Args:
+        name: Source label; used in ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 300:
+                continue
+            ff = []
+            if title and len(title) > 12:
+                ff.append({"fact": f"[{name}] {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 120:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=f"media_{name}",
+                                  category="media_pgz_deep", confidence=0.82)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 60:
+                    queue.append(link)
+            time.sleep(0.6)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every MEDIA source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in MEDIA.items():
+        try:
+            r = crawl(name, seeds, max_pages=18)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"media_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""Naselja PGZ — sela, zaseoci, otocna mjesta."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import chunk_text, upsert_facts, DSN, UA
+from urllib.parse import urlencode, quote
+import urllib.request
+import psycopg2
+
+# hr.wikipedia API endpoint queried by wiki_cat_members() and wiki_extract().
+API_HR = "https://hr.wikipedia.org/w/api.php"
+
+
+def wiki_cat_members(cat, limit=200):
+    """Return the page titles listed in one hr.wikipedia category.
+
+    Args:
+        cat: Category title, sent as ``cmtitle``.
+        limit: Value sent as ``cmlimit``. Only this one batch is requested
+            (no continuation), so a larger category is cut off here.
+
+    Returns:
+        List of member titles; an empty list if the request or parsing
+        fails (the error is reported on stderr).
+    """
+    params = {"action":"query","list":"categorymembers","cmtitle":cat,
+              "cmlimit":str(limit),"format":"json"}
+    url = API_HR + "?" + urlencode(params)
+    try:
+        req = urllib.request.Request(url, headers={"User-Agent": UA})
+        with urllib.request.urlopen(req, timeout=20) as r:
+            d = json.loads(r.read())
+        return [m["title"] for m in d.get("query", {}).get("categorymembers", [])]
+    except Exception as e:
+        # Still best effort, but say why: otherwise a failed request looks
+        # exactly like an empty category. stderr keeps stdout output unchanged.
+        print(f"wiki_cat_members err ({cat}): {e}", file=sys.stderr)
+        return []
+
+
+def wiki_extract(title, timeout=15):
+    """Fetch the plain-text extract of one hr.wikipedia article.
+
+    Args:
+        title: Article title, sent as ``titles`` (with ``redirects=1``).
+        timeout: Timeout in seconds passed to ``urlopen``.
+
+    Returns:
+        The ``extract`` text of the first page in the response ("" when that
+        page carries no extract), or None when the page is listed under id
+        "-1" (presumably a missing page), the response lists no pages, or
+        the request fails (the error is reported on stderr).
+    """
+    params = {"action":"query","prop":"extracts","explaintext":"1",
+              "redirects":"1","format":"json","titles":title}
+    url = API_HR + "?" + urlencode(params)
+    try:
+        req = urllib.request.Request(url, headers={"User-Agent": UA})
+        with urllib.request.urlopen(req, timeout=timeout) as r:
+            d = json.loads(r.read())
+        # Only the first entry is used: a single title was requested.
+        for pid, p in d.get("query", {}).get("pages", {}).items():
+            if pid == "-1":
+                return None
+            return p.get("extract", "")
+    except Exception as e:
+        # Best effort, but leave a trace; stderr keeps stdout output unchanged.
+        print(f"wiki_extract err ({title}): {e}", file=sys.stderr)
+    return None
+
+
+# hr.wikipedia category titles; main() lists the members of each one and
+# harvests their article text.
+CATEGORIES = [
+    "Kategorija:Naselja_u_Primorsko-goranskoj_županiji",
+    "Kategorija:Naselja_u_Hrvatskoj_(otok_Krk)",
+    "Kategorija:Naselja_u_Hrvatskoj_(otok_Cres)",
+    "Kategorija:Naselja_u_Hrvatskoj_(otok_Lošinj)",
+    "Kategorija:Naselja_u_Hrvatskoj_(otok_Rab)",
+    "Kategorija:Gorski_kotar",
+]
+
+
+def main():
+    """Harvest hr.wikipedia article text for every category in CATEGORIES.
+
+    For each category the member titles are listed, titles already handled
+    under an earlier category are skipped, and each remaining article's
+    extract (when at least 200 characters long) is split into chunks and
+    stored through ``upsert_facts``. Prints one line per category, a totals
+    line and a JSON summary.
+    """
+    total = 0
+    pages = 0
+    seen = set()
+
+    conn = psycopg2.connect(DSN)
+    # try/finally so the connection is released even if a page blows up.
+    try:
+        conn.autocommit = True
+        for cat in CATEGORIES:
+            members = wiki_cat_members(cat, limit=200)
+            print(f"  {cat[:50]:50} {len(members):>3} members")
+
+            for title in members:
+                if title in seen:
+                    continue
+                seen.add(title)
+
+                text = wiki_extract(title)
+                # Throttle after every API request, not only after usable
+                # pages: a run of missing or short articles would otherwise
+                # hit the API back to back with no delay at all.
+                time.sleep(0.3)
+                if not text or len(text) < 200:
+                    continue
+                pages += 1
+
+                page_url = f"https://hr.wikipedia.org/wiki/{quote(title)}"
+                facts = [{"fact": c, "url": page_url, "title": title}
+                         for c in chunk_text(text, 600) if len(c) > 100]
+                n = upsert_facts(conn, facts, source_name="wikipedia_pgz_naselja",
+                                 category="naselja_pgz", confidence=0.86)
+                total += n
+    finally:
+        conn.close()
+
+    print(f"=== TOTAL pages={pages} facts={total} ===")
+    print(json.dumps({"pages": pages, "total_facts": total}))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""Real estate + housing PGZ."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+REAL = {
+    "rijeka_najam":       ["https://www.rijeka.hr/javnu-najam/"],
+    "stanovanje_pgz":     ["https://www.dom-rijeka.hr/"],
+    "katastar_pgz":       ["https://geoportal.dgu.hr/"],
+    "uprava_imovine":     ["https://www.rijeka.hr/imovinsko-pravna/"],
+    "rijeka_arhitekt":    ["https://www.rijeka.hr/arhitektonska/"],
+    "drzavna_imovina":    ["https://www.drzavnaimovina.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=8):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 25 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="nekretnine_pgz", confidence=0.83)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 25:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every REAL source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in REAL.items():
+        try:
+            r = crawl(name, seeds, max_pages=8)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"real_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Otoci PGZ deep — Krk, Cres, Losinj, Rab portali."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+OTOCI = {
+    "krk_info":           ["https://www.krk.com/"],
+    "krkonline":          ["https://www.krkonline.com/"],
+    "krkinfo_news":       ["https://www.krk-info.com/"],
+    "cres_info":          ["https://www.cres.info/"],
+    "cres_lapis":         ["https://lapis.cres.hr/"],
+    "losinj_info":        ["https://www.losinj.info/"],
+    "losinj_centar":      ["https://www.muzejmalilosinj.hr/"],
+    "rab_info":           ["https://www.rab.com/"],
+    "rab_news":           ["https://rabnews.com/"],
+    "susak_info":         ["https://www.susakisland.com/"],
+    "ilovik_info":        ["https://www.ilovik.eu/"],
+}
+
+
+def crawl(name, urls, max_pages=12):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 30 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="otoci_pgz", confidence=0.85)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 30:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every OTOCI source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in OTOCI.items():
+        try:
+            r = crawl(name, seeds, max_pages=12)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"otoci_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Povijesni izvori PGZ — Liburnija, arhivi."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+HISTORY = {
+    "drzavni_arhiv_ri":   ["https://www.dari.hr/"],
+    "arhiv_pazin":        ["https://www.dapa.hr/"],
+    "muzej_glagoljice":   ["https://glagoljica.hr/"],
+    "glagoljaska_alea":   ["https://www.aleja-glagoljasa.hr/"],
+    "frankopani":         ["https://www.frankopani.eu/"],
+    "trsatske_legende":   ["https://www.trsat-svetiste.com/povijest/"],
+    "rijeka_povijest":    ["https://rijeka-history.eu/"],
+    "stare_rijeke":       ["https://www.stararijeka.com/"],
+    "kvarner_arhiv":      ["https://www.kvarnerheritage.eu/"],
+    "muzeji_pgz_arhiv":   ["https://www.muzeji-pgz.hr/"],
+    "razno_pomorski":     ["https://www.kpu.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=12):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 30 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="povijest_pgz", confidence=0.86)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 30:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every HISTORY source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in HISTORY.items():
+        try:
+            r = crawl(name, seeds, max_pages=10)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"hist_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""Lokalni propisi PGZ — sluzbene novine, statuti."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+PROPISI = {
+    "sluzbene_novine_pgz": ["https://www.sn.pgz.hr/"],
+    "sluzbene_glasnik_rijeka": ["https://www.sluzbene-novine.com/"],
+    "sn_opatija":          ["https://www.opatija.hr/sluzbene-novine"],
+    "pgz_dokumenti":       ["https://www.pgz.hr/dokumenti"],
+    "pgz_skupstina":       ["https://www.pgz.hr/skupstina"],
+    "rijeka_grad_v_savjet":["https://www.rijeka.hr/gradska-uprava/"],
+    "pgz_javnatime":       ["https://www.pgz.hr/javna-nabava/"],
+    "rijeka_javna_nabava": ["https://www.rijeka.hr/javna-nabava/"],
+    "narodne_novine_pgz":  ["https://narodne-novine.nn.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=15):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 60 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="propisi_pgz", confidence=0.88)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 60:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every PROPISI source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in PROPISI.items():
+        try:
+            r = crawl(name, seeds, max_pages=15)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"propisi_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Sport infrastruktura PGZ — dvorane, baze, skole."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+INFRA = {
+    "dvorana_zamet":      ["https://www.dvoranazamet.hr/"],
+    "stadion_kantrida":   ["https://www.kantrida.hr/"],
+    "stadion_rujevica":   ["https://www.nk-rijeka.hr/stadion-rujevica/"],
+    "ck_kantrida_pliv":   ["https://kantridapool.hr/"],
+    "ri_sport_centar":    ["https://www.ri-sport.hr/"],
+    "delta_jumbo":        ["https://www.deltajumbo.hr/"],
+    "skolski_sport":      ["https://www.hsss.hr/"],
+    "platak_skijanje":    ["https://www.platak.hr/"],
+    "rec_velebit":        ["https://www.velebit.hr/"],
+    "platak_ski_klub":    ["https://www.skiclub-platak.hr/"],
+    "rijeka_marina":      ["https://www.aci-marinas.com/"],
+}
+
+
+def crawl(name, urls, max_pages=15):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 40 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="sport_infra_pgz", confidence=0.85)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 40:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every INFRA source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in INFRA.items():
+        try:
+            r = crawl(name, seeds, max_pages=12)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"infra_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""Sport klubovi PGZ — direktno s web stranica."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+# Crawl seeds: key is the source label main() hands to crawl(), value is the
+# list of start URLs for that source.
+KLUBOVI = {
+    "hnk_rijeka":         ["https://www.nk-rijeka.hr/"],
+    "kk_kvarner":         ["https://www.kk-kvarner.hr/"],
+    "rk_zamet":           ["https://www.rk-zamet.hr/"],
+    "vk_primorje":        ["https://www.vkprimorje.hr/"],
+    "ok_rijeka":          ["https://www.ok-rijeka.hr/"],
+    "haok_mladost":       ["https://www.haok-mladost.hr/"],
+    "abc_rijeka":         ["https://www.abc-rijeka.hr/"],
+    "rugby_rijeka":       ["https://www.rugbyrijeka.hr/"],
+    "pliva_klub_primorje":["https://www.primorje-aquarius.hr/"],
+    "judo_kvarner":       ["https://www.judokvarner.hr/"],
+    "kuglacki_savez_pgz": ["https://www.kuglacki-savez-pgz.hr/"],
+    "tenis_kvarner":      ["https://www.tk-kvarner.hr/"],
+    "atletika_rijeka":    ["https://www.akrijeka.hr/"],
+    "biciklisticki":      ["https://www.bk-rijeka.hr/"],
+    "stoljecesporta":     ["https://stoljecesporta.com/"],
+}
+
+
+def crawl(name, urls, max_pages=12):
+    """Breadth-first crawl of one source, storing page text as facts.
+
+    URLs are taken from a FIFO queue seeded with ``urls``. Each page that
+    returns status 200 and at least 200 characters of extracted text yields
+    one title fact (titles longer than 8 characters) plus one fact per text
+    chunk longer than 100 characters, written through ``upsert_facts``.
+    Links on the same host are appended while the queue holds under 30 items.
+
+    Args:
+        name: Source label; used as ``source_name`` and in the title fact.
+        urls: Seed URLs that start the queue.
+        max_pages: Upper bound on URLs taken from the queue; failed and
+            too-short pages count towards it.
+
+    Returns:
+        Dict with ``name``, ``visited`` (number of URLs attempted) and
+        ``facts`` (sum of the values returned by ``upsert_facts``).
+    """
+    conn = psycopg2.connect(DSN)
+    # main() catches per-source exceptions and keeps going, so the connection
+    # has to be closed on the error path too or it leaks for the whole run.
+    try:
+        conn.autocommit = True
+        visited = set()
+        queue = list(urls)
+        facts = 0
+        while queue and len(visited) < max_pages:
+            url = queue.pop(0)
+            if url in visited:
+                continue
+            visited.add(url)
+            html, status = fetch(url, timeout=15)
+            if not html or status != 200:
+                continue
+            title = extract_title(html)
+            text = extract_text(html)
+            if not text or len(text) < 200:
+                continue
+            ff = []
+            if title and len(title) > 8:
+                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+            for c in chunk_text(text, 800):
+                if len(c) > 100:
+                    ff.append({"fact": c, "url": url, "title": title})
+            facts += upsert_facts(conn, ff, source_name=name,
+                                  category="sport_klub_pgz", confidence=0.88)
+            base = urlparse(url).hostname
+            for link in find_internal_links(html, url):
+                same_host = (urlparse(link).hostname or "") == base
+                if link not in visited and same_host and len(queue) < 30:
+                    queue.append(link)
+            time.sleep(0.5)  # pause after each processed page
+    finally:
+        conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    """Crawl every KLUBOVI source in turn and print per-source and overall tallies.
+
+    A source whose crawl raises is reported as FAIL and left out of the
+    totals; the last line printed is a JSON summary.
+    """
+    reports = []
+    for name, seeds in KLUBOVI.items():
+        try:
+            r = crawl(name, seeds, max_pages=10)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            reports.append(r)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next source.
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = 0
+    for report in reports:
+        total += report.get("facts", 0)
+    print(f"=== TOTAL: {total} ===")
+    summary = {"klub_count": len(reports), "total_facts": total}
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Crkve i vjerske institucije PGZ."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+CRKVA = {  # source name -> list of seed URLs; main() passes each pair to crawl()
+    "rijecka_nadbiskupija":["https://www.ri-nadbiskupija.hr/"],
+    "krcka_biskupija":    ["https://www.biskupija-krk.hr/"],
+    "isusovci_rijeka":    ["https://isusovci.hr/"],
+    "trsat_svetiste":     ["https://trsat-svetiste.com/"],
+    "katedrala_rijeka":   ["https://katedrala-rijeka.hr/"],
+    "samostan_kosljun":   ["https://www.kosljun.hr/"],
+    "samostan_glavotok":  ["https://www.glavotok.hr/"],
+    "katedrala_krk":      ["https://www.biskupija-krk.hr/katedrala/"],  # NOTE(review): same hostname as "krcka_biskupija"; crawl() follows any same-host link, so both sources may harvest overlapping pages - confirm intended
+    "crkva_opatija":      ["https://www.zupa-opatija.hr/"],
+    "rijecka_eparhija":   ["https://www.eparhija-zagrebackoljubljanska.com/"],
+}
+
+
+def crawl(name, urls, max_pages=10):
+    conn = psycopg2.connect(DSN); conn.autocommit = True
+    visited = set(); queue = list(urls); facts = 0
+    while queue and len(visited) < max_pages:
+        url = queue.pop(0)
+        if url in visited: continue
+        visited.add(url)
+        html, status = fetch(url, timeout=15)
+        if not html or status != 200: continue
+        title = extract_title(html); text = extract_text(html)
+        if not text or len(text) < 200: continue
+        ff = []
+        if title and len(title) > 8:
+            ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+        for c in chunk_text(text, 800):
+            if len(c) > 100:
+                ff.append({"fact": c, "url": url, "title": title})
+        facts += upsert_facts(conn, ff, source_name=name,
+                               category="vjera_pgz", confidence=0.84)
+        base = urlparse(url).hostname
+        for link in find_internal_links(html, url):
+            if link not in visited and (urlparse(link).hostname or "") == base and len(queue) < 25:
+                queue.append(link)
+        time.sleep(0.5)
+    conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    results = []
+    for name, urls in CRKVA.items():
+        try:
+            r = crawl(name, urls, max_pages=10)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            results.append(r)
+        except Exception as e:
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = sum(r.get("facts", 0) for r in results)
+    print(f"=== TOTAL: {total} ===")
+    print(json.dumps({"crkva_count": len(results), "total_facts": total}))
+
+
+if __name__ == "__main__":  # run the harvest only when executed as a script
+    main()
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Zdravstvo + udruge PGZ."""
+import sys, json, time
+sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
+from _common import (fetch, extract_text, extract_title, chunk_text,
+                      upsert_facts, find_internal_links, DSN)
+from urllib.parse import urlparse
+import psycopg2
+
+ZDRAVSTVO = {  # source name -> list of seed URLs; main() passes each pair to crawl()
+    "kbc_rijeka":         ["https://www.kbc-rijeka.hr/"],
+    "thalassotherapia":   ["https://thalassotherapia-opatija.hr/"],
+    "klinika_lovran":     ["https://www.tnz-lovran.hr/"],
+    "klinika_crikvenica": ["https://www.thalassotherapia-crikvenica.hr/"],
+    "dom_zdravlja_pgz":   ["https://www.dom-zdravlja-pgz.hr/"],
+    "zavod_javno_zdravlje":["https://www.zzjzpgz.hr/"],
+    "crveni_kriz_pgz":    ["https://www.crveni-kriz-rijeka.hr/"],
+    "hzzo_rijeka":        ["https://hzzo.hr/"],  # NOTE(review): seed is the bare site root with no Rijeka-specific path - confirm this is the intended scope
+    "savjetovaliste_ri":  ["https://www.zzjzpgz.hr/savjetovaliste/"],  # NOTE(review): same hostname as "zavod_javno_zdravlje"; crawl() follows any same-host link, so both sources may harvest overlapping pages - confirm intended
+    "deinstitucionalizacija":["https://www.cczg-rijeka.hr/"],
+    "centar_socijalne":   ["https://www.czss.rijeka.hr/"],
+}
+
+
+def crawl(name, urls, max_pages=12):
+    conn = psycopg2.connect(DSN); conn.autocommit = True
+    visited = set(); queue = list(urls); facts = 0
+    while queue and len(visited) < max_pages:
+        url = queue.pop(0)
+        if url in visited: continue
+        visited.add(url)
+        html, status = fetch(url, timeout=15)
+        if not html or status != 200: continue
+        title = extract_title(html); text = extract_text(html)
+        if not text or len(text) < 200: continue
+        ff = []
+        if title and len(title) > 8:
+            ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
+        for c in chunk_text(text, 800):
+            if len(c) > 100:
+                ff.append({"fact": c, "url": url, "title": title})
+        facts += upsert_facts(conn, ff, source_name=name,
+                               category="zdravstvo_pgz", confidence=0.86)
+        base = urlparse(url).hostname
+        for link in find_internal_links(html, url):
+            if link not in visited and (urlparse(link).hostname or "") == base and len(queue) < 40:
+                queue.append(link)
+        time.sleep(0.5)
+    conn.close()
+    return {"name": name, "visited": len(visited), "facts": facts}
+
+
+def main():
+    results = []
+    for name, urls in ZDRAVSTVO.items():
+        try:
+            r = crawl(name, urls, max_pages=10)
+            print(f"  {name:25} {r['visited']:>3}p  {r['facts']:>5}f")
+            results.append(r)
+        except Exception as e:
+            print(f"  {name:25} FAIL: {str(e)[:60]}")
+    total = sum(r.get("facts", 0) for r in results)
+    print(f"=== TOTAL: {total} ===")
+    print(json.dumps({"zdr_count": len(results), "total_facts": total}))
+
+
+if __name__ == "__main__":  # run the harvest only when executed as a script
+    main()