#!/usr/bin/env python3
"""Harvest text facts from the JLS PGZ websites — 36 units plus the county site.

Each site listed in ``JLS_PGZ`` is crawled breadth-first from its root URL,
the page text is split into chunks, and the chunks are stored through the
shared ``upsert_facts`` helper from ``_common``.
"""
import json
import sys
import time
from collections import deque
from urllib.parse import urlparse

# The shared harvester helpers live outside the package path, so the directory
# has to be put on sys.path before ``_common`` can be imported.
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
                     upsert_facts, find_internal_links, DSN)
import psycopg2

# Site label -> root URL. The label is used in printed output and, lower-cased,
# as the suffix of the ``source_name`` passed to ``upsert_facts``.
JLS_PGZ = {
    "Rijeka": "https://www.rijeka.hr/",
    "Opatija": "https://www.opatija.hr/",
    "Crikvenica": "https://www.crikvenica.hr/",
    "Krk": "https://www.grad-krk.hr/",
    "Kraljevica": "https://www.kraljevica.hr/",
    "Rab": "https://www.rab.hr/",
    "Cres": "https://www.cres.hr/",
    "Mali_Losinj": "https://www.mali-losinj.hr/",
    "Delnice": "https://www.delnice.hr/",
    "Vrbovsko": "https://www.vrbovsko.hr/",
    "Cabar": "https://www.cabar.hr/",
    "Bakar": "https://www.bakar.hr/",
    "Kastav": "https://www.kastav.hr/",
    "Novi_Vinodolski": "https://www.novi-vinodolski.hr/",
    "Viskovo": "https://www.opcina-viskovo.hr/",
    "Klana": "https://www.klana.hr/",
    "Moscenicka_Draga": "https://www.moscenicka-draga.hr/",
    "Lovran": "https://www.opcinalovran.hr/",
    "Matulji": "https://www.matulji.hr/",
    "Omisalj": "https://www.omisalj.hr/",
    "Punat": "https://www.punat.hr/",
    "Vrbnik": "https://www.vrbnik.hr/",
    "Baska": "https://www.baska.hr/",
    "Dobrinj": "https://www.opcina-dobrinj.hr/",
    "Malinska": "https://www.malinska.hr/",
    "Jelenje": "https://www.jelenje.hr/",
    "Kostrena": "https://www.kostrena.hr/",
    "Cavle": "https://www.cavle.hr/",
    "Lopar": "https://www.opcina-lopar.hr/",
    "Brod_Moravice": "https://www.brod-moravice.hr/",
    "Mrkopalj": "https://www.mrkopalj.hr/",
    "Ravna_Gora": "https://www.ravnagora.hr/",
    "Lokve": "https://www.lokve.hr/",
    "Skrad": "https://www.skrad.hr/",
    "Fuzine": "https://www.fuzine.hr/",
    "Vinodolska": "https://www.vinodolska-opcina.hr/",
    "PGZ_zupanija": "https://www.pgz.hr/",
}


def crawl(name, root, max_pages=25):
    """Crawl one site breadth-first and store its text as facts.

    Args:
        name: Site label; used in the title fact and in ``source_name``.
        root: URL the crawl starts from. Only links whose hostname equals
            the hostname of ``root`` are followed.
        max_pages: Upper bound on the number of URLs attempted. URLs whose
            fetch fails or whose text is too short still count.

    Returns:
        A dict with ``name``, ``visited`` (number of URLs attempted) and
        ``facts`` (sum of the values returned by ``upsert_facts``).

    Raises:
        Whatever ``psycopg2.connect`` or the ``_common`` helpers raise; the
        database connection is closed before the exception propagates.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        visited = set()
        queue = deque([root])
        facts = 0
        base_host = urlparse(root).hostname or ""

        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                # The same link may have been queued more than once.
                continue
            visited.add(url)

            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue

            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                # Too little text to be worth storing.
                continue

            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(
                conn,
                ff,
                source_name=f"jls_pgz_{name.lower()}",
                category="jls_pgz_official",
                confidence=0.90,
            )

            for link in find_internal_links(html, url):
                same_host = (urlparse(link).hostname or "") == base_host
                # Cap the frontier at 200 pending URLs.
                if link not in visited and same_host and len(queue) < 200:
                    queue.append(link)

            # Pause between successfully processed pages.
            time.sleep(0.4)
    finally:
        # Close even when a helper raises: main() catches the error and keeps
        # going, so an unclosed connection would otherwise leak per failed site.
        conn.close()

    return {"name": name, "visited": len(visited), "facts": facts}


def main():
    """Crawl every site in ``JLS_PGZ`` and print per-site and total counts.

    A failure on one site is reported on stdout and does not stop the run.
    The last line printed is a JSON summary with ``jls_count`` (sites that
    finished without an exception) and ``total_facts``.
    """
    results = []
    for name, url in JLS_PGZ.items():
        try:
            r = crawl(name, url, max_pages=25)
            results.append(r)
            print(f" {name:25} {r['visited']:>3}p {r['facts']:>5}f")
        except Exception as e:
            # Best-effort batch: report the (truncated) error and move on.
            print(f" {name:25} FAIL: {str(e)[:60]}")
    total = sum(r.get("facts", 0) for r in results)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"jls_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()