diff --git a/pgz_sport_api.py b/pgz_sport_api.py index 39c765b..b4f7b7f 100644 --- a/pgz_sport_api.py +++ b/pgz_sport_api.py @@ -674,10 +674,18 @@ def get_savez(savez_id: int, authorization: Optional[str] = Header(None)): return {**savez, "klubovi": klubovi, "statistika": statistika, "manifestacije": manifestacije} # ==================== KLUBOVI ==================== +# ───────────────────────────────────────────────────────────────────── +# Endpoint: GET /api/klubovi +# Author: Damir Radulić (dradulic@outlook.com / damir@rinet.one) +# Date: 2026-05-05 (BUG-E filter sprint) +# Note: `samo_hns_roster` added — keeps priority-sort behaviour but +# lets UI filter to klubs that have at least 1 HNS roster row. +# ───────────────────────────────────────────────────────────────────── @app.get("/api/klubovi") def list_klubovi(authorization: Optional[str] = Header(None), q: Optional[str] = None, savez_id: Optional[int] = None, nositelj: Optional[bool] = None, region: Optional[str] = None, sport: Optional[str] = None, grad: Optional[str] = None, kategorija: Optional[str] = None, godisnjak: Optional[bool] = None, financiran: Optional[bool] = None, + samo_hns_roster: Optional[bool] = None, sort: str = "naziv", order: str = "asc"): where = ["v.aktivan"] params = [] @@ -703,6 +711,8 @@ def list_klubovi(authorization: Optional[str] = Header(None), q: Optional[str] = where.append("(k.godisnjak_godine IS NULL OR array_length(k.godisnjak_godine,1) IS NULL)") if kategorija and kategorija.strip().lower() == "priority": where.append("(COALESCE(k.pgz_sufinanciran,false) OR (k.godisnjak_godine IS NOT NULL AND array_length(k.godisnjak_godine,1) > 0))") + if samo_hns_roster: + where.append("EXISTS (SELECT 1 FROM pgz_sport.hns_klub_roster r WHERE r.klub_id = k.id)") sort_col = {"naziv": "v.klub", "savez": "v.savez", "broj_clanova": "v.broj_clanova", "razina": "v.razina", "region": "v.region", "grad": "v.grad", "sport": "v.sport"}.get(sort, "v.klub") order_sql = "DESC" if order.lower() == 
@router.get("/potpore/by-year")
def potpore_by_year(godina: int = None, q: str = "", samo_klubovi: bool = True, davatelj: str = None):
    """Return co-financing (sufinanciranje) rows for a single year.

    Args:
        godina: Year to report; falls back to the current year when omitted
            (or falsy).
        q: Optional substring to look for in ``korisnik`` (case-insensitive).
            It is matched literally: LIKE metacharacters are escaped.
        samo_klubovi: When true, keep only rows whose ``je_klub`` is NULL or
            true.
        davatelj: Optional funder filter, ``"rijeka"`` or ``"pgz"``
            (case-insensitive); any other value applies no filter.

    Returns:
        Dict with ``godina``, ``count``, ``total`` and ``results``. ``total``
        is summed over the returned rows only, i.e. at most 500 of them.
    """
    import datetime
    yr = godina or datetime.date.today().year

    if q:
        # Escape LIKE metacharacters so user text such as "50%" or "nk_x" is
        # searched literally instead of acting as a wildcard (backslash is
        # PostgreSQL's default LIKE escape character).
        escaped = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        like = f"%{escaped}%"
    else:
        like = "%"

    # Only constant SQL fragments are appended here; every user-supplied
    # value travels through ``params``.
    where = ["godina = %s", "LOWER(COALESCE(korisnik,'')) LIKE LOWER(%s)"]
    params = [yr, like]

    if samo_klubovi:
        where.append("(je_klub IS NULL OR je_klub = true)")

    # Normalise so "Rijeka" or " pgz " behave like the documented values.
    source = (davatelj or "").strip().lower()
    if source == 'rijeka':
        # '%%' is a literal '%' once the driver substitutes parameters.
        where.append("izvor ILIKE '%%rijeka.hr%%'")
    elif source == 'pgz':
        where.append("izvor ILIKE '%%sport-pgz%%'")

    sql = f"""
        SELECT id, korisnik, sport, iznos_eur, vrsta, napomena, izvor, source_url, godina, klub_id, je_klub
        FROM pgz_sport.sufinanciranje_sport
        WHERE {' AND '.join(where)}
        ORDER BY iznos_eur DESC NULLS LAST LIMIT 500
    """
    rows = db_query(sql, params)
    total = sum(float(r.get('iznos_eur') or 0) for r in rows)
    return {"godina": yr, "count": len(rows), "total": total, "results": rows}
def search(query, rows=50):
    """Query the data.gov.hr ``package_search`` action.

    Args:
        query: Free-text search term; URL-quoted before being sent.
        rows: Maximum number of datasets requested.

    Returns:
        The list found under ``result.results`` in the JSON response, or
        ``[]`` when the request or decoding fails (the error is printed, not
        raised).
    """
    # Function-scope import: the module-level ``import urllib.parse`` was
    # placed after this function, below its first use.
    import urllib.parse

    url = f"{API}/package_search?q={urllib.parse.quote(query)}&rows={rows}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r:
            d = json.loads(r.read())
        return d.get("result", {}).get("results", [])
    except Exception as e:  # best-effort harvester: report and carry on
        print(f"search err: {e}")
        return []


def package_to_fact(pkg):
    """Build the fact record for one dataset dict.

    Fields that are missing *or present with a null value* are treated as
    empty. ``dict.get(key, default)`` only covers the missing case, so a
    null ``notes``/``organization``/``tags`` previously raised
    ``TypeError``/``AttributeError`` and aborted the whole run.

    Args:
        pkg: One entry of the ``package_search`` result list.

    Returns:
        ``{"fact", "url", "title"}`` dict, or ``None`` when the assembled
        fact text is 50 characters or shorter.
    """
    title = pkg.get("title") or ""
    notes = (pkg.get("notes") or "")[:600]
    org = (pkg.get("organization") or {}).get("title") or ""
    tags = ", ".join((t.get("name") or "") for t in (pkg.get("tags") or []))

    fact = f"[OpenData] {title} | Org: {org} | {notes} | Tags: {tags}"[:1200]
    if len(fact) <= 50:
        return None
    return {
        "fact": fact,
        "url": f"https://data.gov.hr/dataset/{pkg.get('name', '')}",
        "title": title,
    }


def main():
    """Search data.gov.hr for each regional keyword and store dataset facts.

    Datasets are de-duplicated across queries by their ``id``. Prints one
    line per query, a total line, and a JSON summary.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    queries = [
        "Primorsko-goranska", "Rijeka", "Opatija", "Crikvenica", "Krk",
        "Cres", "Lošinj", "Rab", "Delnice", "Bakar", "Kvarner",
    ]

    total_inserted = 0
    seen = set()

    try:
        for q in queries:
            results = search(q, rows=50)
            ff = []
            for pkg in results:
                pkg_id = pkg.get("id", "")
                if pkg_id in seen:
                    continue
                seen.add(pkg_id)

                record = package_to_fact(pkg)
                if record is not None:
                    ff.append(record)

            n = upsert_facts(conn, ff, source_name="data_gov_hr_pgz",
                             category="opendata_pgz", confidence=0.85)
            total_inserted += n
            print(f" query='{q}' -> {len(results)} results, {n} new facts")
            time.sleep(1)
    finally:
        # Release the connection even when a query or upsert raises.
        conn.close()

    print(f"=== TOTAL: {total_inserted} ===")
    print(json.dumps({"queries": len(queries), "total_facts": total_inserted}))
def crawl(name, urls, max_pages=12):
    """Breadth-first crawl of one event site, storing page text as facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued. Checking only ``visited`` let the same menu link
    # found on several pages fill the capped queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="events_pgz", confidence=0.85)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # main() swallows crawl errors, so without this a failing source
        # would leave its connection open.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def main():
    """Crawl every GORSKI source, printing a line per source and a summary.

    A source whose crawl raises is reported with a truncated error message
    and left out of the totals.
    """
    completed = []
    for source, seeds in GORSKI.items():
        try:
            outcome = crawl(source, seeds, max_pages=12)
            print(f" {source:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            completed.append(outcome)
        except Exception as exc:
            print(f" {source:25} FAIL: {str(exc)[:60]}")

    fact_total = 0
    for outcome in completed:
        fact_total += outcome.get("facts", 0)

    summary = {"gorski_count": len(completed), "total_facts": fact_total}
    print(f"=== TOTAL: {fact_total} ===")
    print(json.dumps(summary))
def crawl(name, urls, max_pages=20):
    """Breadth-first crawl of one local news portal, storing text as facts.

    Args:
        name: Portal key; facts are stored under ``media_<name>`` and the
            title fact is prefixed with ``[<name>]``.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; prevents duplicate links from exhausting the
    # queue cap before distinct pages get a slot.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 300:
                continue
            ff = []
            if title and len(title) > 12:
                ff.append({"fact": f"[{name}] {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 120)
            facts += upsert_facts(conn, ff, source_name=f"media_{name}",
                                  category="media_pgz_deep", confidence=0.82)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 60:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.6)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def wiki_cat_members(cat, limit=200):
    """Return page titles that belong to a Croatian-Wikipedia category.

    Args:
        cat: Category title passed as ``cmtitle``.
        limit: Maximum number of titles to return. Values above the API's
            per-request cap are honoured by following the ``continue``
            token across requests instead of being truncated.

    Returns:
        List of member titles (at most ``limit``). On a request or decode
        error the titles collected so far are returned, which is ``[]`` when
        the first request fails, and the error is printed.
    """
    base = {"action": "query", "list": "categorymembers", "cmtitle": cat,
            "format": "json"}
    titles = []
    cont = {}
    while len(titles) < limit:
        # 500 is the documented per-request maximum for ordinary clients.
        page = dict(base, cmlimit=str(min(limit - len(titles), 500)), **cont)
        url = API_HR + "?" + urlencode(page)
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                d = json.loads(r.read())
            members = d.get("query", {}).get("categorymembers", [])
            titles.extend(m["title"] for m in members)
            cont = d.get("continue") or {}
        except Exception as e:  # best-effort: report instead of hiding it
            print(f"wiki_cat_members err ({cat}): {e}")
            break
        if not cont or not members:
            break
    return titles[:limit]
def main():
    """Harvest Wikipedia extracts for settlements in the listed categories.

    Titles are de-duplicated across categories. Extracts shorter than 200
    characters are skipped. Prints a line per category, a total line, and a
    JSON summary.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = 0
    pages = 0
    seen = set()

    try:
        for cat in CATEGORIES:
            members = wiki_cat_members(cat, limit=200)
            print(f" {cat[:50]:50} {len(members):>3} members")

            for title in members:
                if title in seen:
                    continue
                seen.add(title)

                text = wiki_extract(title)
                # Throttle after every API call. Previously the pause was
                # skipped for short or missing pages, so runs of such titles
                # hit the API back-to-back.
                time.sleep(0.3)
                if not text or len(text) < 200:
                    continue
                pages += 1

                facts = [{"fact": c,
                          "url": f"https://hr.wikipedia.org/wiki/{quote(title)}",
                          "title": title}
                         for c in chunk_text(text, 600) if len(c) > 100]
                n = upsert_facts(conn, facts, source_name="wikipedia_pgz_naselja",
                                 category="naselja_pgz", confidence=0.86)
                total += n
    finally:
        # Release the connection even when an upsert raises mid-run.
        conn.close()

    print(f"=== TOTAL pages={pages} facts={total} ===")
    print(json.dumps({"pages": pages, "total_facts": total}))
def crawl(name, urls, max_pages=8):
    """Breadth-first crawl of one real-estate/housing site into facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; stops repeated links from filling the capped
    # queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="nekretnine_pgz", confidence=0.83)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 25:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def crawl(name, urls, max_pages=12):
    """Breadth-first crawl of one island portal, storing page text as facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; stops repeated links from filling the capped
    # queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="otoci_pgz", confidence=0.85)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def crawl(name, urls, max_pages=12):
    """Breadth-first crawl of one history/archive site into facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; stops repeated links from filling the capped
    # queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="povijest_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl of one local-regulations site into facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; stops repeated links from filling the capped
    # queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="propisi_pgz", confidence=0.88)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 60:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl of one sports-infrastructure site into facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; stops repeated links from filling the capped
    # queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="sport_infra_pgz", confidence=0.85)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
def crawl(name, urls, max_pages=12):
    """Breadth-first crawl of one sports-club site, storing text as facts.

    Args:
        name: Source key; used as ``source_name`` and as the prefix of the
            per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the values returned by ``upsert_facts``).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Every URL ever queued; stops repeated links from filling the capped
    # queue with duplicates.
    enqueued = set(queue)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:  # duplicate seed
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="sport_klub_pgz", confidence=0.88)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:  # queue cap reached
                    break
                if link in enqueued or (urlparse(link).hostname or "") != base:
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.5)
    finally:
        # Close the connection even if fetching or storing raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
["https://www.kosljun.hr/"], + "samostan_glavotok": ["https://www.glavotok.hr/"], + "katedrala_krk": ["https://www.biskupija-krk.hr/katedrala/"], + "crkva_opatija": ["https://www.zupa-opatija.hr/"], + "rijecka_eparhija": ["https://www.eparhija-zagrebackoljubljanska.com/"], +} + + +def crawl(name, urls, max_pages=10): + conn = psycopg2.connect(DSN); conn.autocommit = True + visited = set(); queue = list(urls); facts = 0 + while queue and len(visited) < max_pages: + url = queue.pop(0) + if url in visited: continue + visited.add(url) + html, status = fetch(url, timeout=15) + if not html or status != 200: continue + title = extract_title(html); text = extract_text(html) + if not text or len(text) < 200: continue + ff = [] + if title and len(title) > 8: + ff.append({"fact": f"{name} - {title}", "url": url, "title": title}) + for c in chunk_text(text, 800): + if len(c) > 100: + ff.append({"fact": c, "url": url, "title": title}) + facts += upsert_facts(conn, ff, source_name=name, + category="vjera_pgz", confidence=0.84) + base = urlparse(url).hostname + for link in find_internal_links(html, url): + if link not in visited and (urlparse(link).hostname or "") == base and len(queue) < 25: + queue.append(link) + time.sleep(0.5) + conn.close() + return {"name": name, "visited": len(visited), "facts": facts} + + +def main(): + results = [] + for name, urls in CRKVA.items(): + try: + r = crawl(name, urls, max_pages=10) + print(f" {name:25} {r['visited']:>3}p {r['facts']:>5}f") + results.append(r) + except Exception as e: + print(f" {name:25} FAIL: {str(e)[:60]}") + total = sum(r.get("facts", 0) for r in results) + print(f"=== TOTAL: {total} ===") + print(json.dumps({"crkva_count": len(results), "total_facts": total})) + + +if __name__ == "__main__": + main() diff --git a/scrapers/harvesters/zdravstvo_pgz.py b/scrapers/harvesters/zdravstvo_pgz.py new file mode 100644 index 0000000..0959851 --- /dev/null +++ b/scrapers/harvesters/zdravstvo_pgz.py @@ -0,0 +1,68 @@ +#!/usr/bin/env 
python3 +"""Zdravstvo + udruge PGZ.""" +import sys, json, time +sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters") +from _common import (fetch, extract_text, extract_title, chunk_text, + upsert_facts, find_internal_links, DSN) +from urllib.parse import urlparse +import psycopg2 + +ZDRAVSTVO = { + "kbc_rijeka": ["https://www.kbc-rijeka.hr/"], + "thalassotherapia": ["https://thalassotherapia-opatija.hr/"], + "klinika_lovran": ["https://www.tnz-lovran.hr/"], + "klinika_crikvenica": ["https://www.thalassotherapia-crikvenica.hr/"], + "dom_zdravlja_pgz": ["https://www.dom-zdravlja-pgz.hr/"], + "zavod_javno_zdravlje":["https://www.zzjzpgz.hr/"], + "crveni_kriz_pgz": ["https://www.crveni-kriz-rijeka.hr/"], + "hzzo_rijeka": ["https://hzzo.hr/"], + "savjetovaliste_ri": ["https://www.zzjzpgz.hr/savjetovaliste/"], + "deinstitucionalizacija":["https://www.cczg-rijeka.hr/"], + "centar_socijalne": ["https://www.czss.rijeka.hr/"], +} + + +def crawl(name, urls, max_pages=12): + conn = psycopg2.connect(DSN); conn.autocommit = True + visited = set(); queue = list(urls); facts = 0 + while queue and len(visited) < max_pages: + url = queue.pop(0) + if url in visited: continue + visited.add(url) + html, status = fetch(url, timeout=15) + if not html or status != 200: continue + title = extract_title(html); text = extract_text(html) + if not text or len(text) < 200: continue + ff = [] + if title and len(title) > 8: + ff.append({"fact": f"{name} - {title}", "url": url, "title": title}) + for c in chunk_text(text, 800): + if len(c) > 100: + ff.append({"fact": c, "url": url, "title": title}) + facts += upsert_facts(conn, ff, source_name=name, + category="zdravstvo_pgz", confidence=0.86) + base = urlparse(url).hostname + for link in find_internal_links(html, url): + if link not in visited and (urlparse(link).hostname or "") == base and len(queue) < 40: + queue.append(link) + time.sleep(0.5) + conn.close() + return {"name": name, "visited": len(visited), "facts": facts} + + +def main(): + 
results = [] + for name, urls in ZDRAVSTVO.items(): + try: + r = crawl(name, urls, max_pages=10) + print(f" {name:25} {r['visited']:>3}p {r['facts']:>5}f") + results.append(r) + except Exception as e: + print(f" {name:25} FAIL: {str(e)[:60]}") + total = sum(r.get("facts", 0) for r in results) + print(f"=== TOTAL: {total} ===") + print(json.dumps({"zdr_count": len(results), "total_facts": total})) + + +if __name__ == "__main__": + main() diff --git a/static/crm_v2.html b/static/crm_v2.html index 2181861..bb0b704 100644 --- a/static/crm_v2.html +++ b/static/crm_v2.html @@ -487,7 +487,18 @@ footer { height:36px; background:var(--bg2); border-top:1px solid var(--rim);
@@ -406,6 +423,83 @@ const _state = {section:'dashboard', viewSavezi:'card', viewKlubovi:'card', view const _sort = {savezi:null, klubovi:null, sportasi:null, objekti:null, manifestacije:null, financije:null}; let _proracunChart=null, _financijeChart=null; +// ════════════════════════════════════════════════════════════════════ +// BUG-E (2026-05-05) — explicit filter-bar state per section +// Author: Damir Radulić (dradulic@outlook.com / damir@rinet.one) +// Defaults match constitution: financirani=true + u-godišnjaku=true. +// User can uncheck either checkbox to broaden the result set. +// ════════════════════════════════════════════════════════════════════ +const _filters = { + klubovi: { financirani: true, godisnjak: true, hns_roster: false, total: 0 }, + sportasi: { priority: true, hns_profil: false, godina_od: null, godina_do: null, total: 0 }, + savezi: { financirani: true, total: 0 } +}; +function _filtersDefaults(sec){ + if(sec==='klubovi') return { financirani:true, godisnjak:true, hns_roster:false }; + if(sec==='sportasi') return { priority:true, hns_profil:false, godina_od:null, godina_do:null }; + if(sec==='savezi') return { financirani:true }; + return {}; +} +function _filtersReset(sec){ + Object.assign(_filters[sec], _filtersDefaults(sec)); + _filtersApply(sec); +} +function _filtersApply(sec){ + if(sec==='klubovi') { _cache.klubovi = null; loadKlubovi(); } + if(sec==='sportasi') { _cache.clanovi = null; loadSportasi(); } + if(sec==='savezi') { _cache.savezi = null; loadSavezi(); } +} +function _filtersBar(sec){ + // Returns HTML for the BUG-E filter-bar above the existing toolbar. 
+ const f = _filters[sec] || {}; + const cnt = 'Prikazano: ' + + (f.shown||0) + ' od ' + (f.total||0) + ''; + if(sec==='klubovi'){ + return ` + `; + } + if(sec==='sportasi'){ + return ` + `; + } + if(sec==='savezi'){ + return ` + `; + } + return ''; +} +function _filtersUpdateCount(sec, shown){ + _filters[sec].shown = shown; + const el = document.getElementById('bugE-cnt-'+sec); + if(el) el.textContent = 'Prikazano: '+shown+' od '+(_filters[sec].total||shown); +} +window._filters = _filters; +window._filtersReset = _filtersReset; +window._filtersApply = _filtersApply; + // === PGŽ priority filter (SUB6) — global helper, works across Klubovi/Savezi/Sportaši === window._pgz_filter_priority = window._pgz_filter_priority || false; window.togglePGZFilter = function(section){ @@ -1240,13 +1334,16 @@ async function loadSavezi(){ const root = $('#pg-savezi'); if(!_cache.savezi){ root.innerHTML = '