#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: pgz_savezi_deep.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/pgz_savezi_deep.py
# Svrha: Deep crawl glavnih sportskih saveza za PGŽ klubove
# - HNS (nogomet) — hns-cff.hr, prvahnl.hr
# - HKS (košarka) — hks.hr, abaliga.com
# - HRS (rukomet) — hrs.hr
# - HOS (odbojka) — hos.hr
# - HBS (boćanje) — hbs.hr
# - HVS (vaterpolo) — hvs.hr
# Sve klubove + utakmice + rezultate koji su u PGŽ
# ═══════════════════════════════════════════════════════════════════
"""Multi-savez deep scrape for PGŽ clubs."""
import os, sys, re, time, hashlib, logging, json
from urllib.parse import urljoin, urlparse
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
# Configure logging at INFO level; every record carries the "[savezi]" tag.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [savezi] %(message)s")
log = logging.getLogger("savezi")
# PostgreSQL connection string. The password is read from the DB_PASSWORD
# environment variable at import time, so importing this module without that
# variable set raises KeyError.
# NOTE(review): the password is interpolated unquoted — presumably it never
# contains spaces or quotes; confirm, otherwise the DSN will not parse.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# User-Agent header value sent with every HTTP request made by fetch().
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
# Place names used as a relevance filter: a page is kept when its text
# contains any of these strings (plain, case-sensitive substring match).
# NOTE(review): "Costrena" sits next to "Kostrena" — presumably a deliberate
# spelling variant; confirm before removing.
PGZ_TOWNS = [
    "Rijeka",
    "Opatija",
    "Crikvenica",
    "Krk",
    "Cres",
    "Mali Lošinj",
    "Rab",
    "Delnice",
    "Vrbovsko",
    "Čabar",
    "Bakar",
    "Kraljevica",
    "Kastav",
    "Viškovo",
    "Klana",
    "Mošćenička Draga",
    "Lovran",
    "Matulji",
    "Omišalj",
    "Punat",
    "Vrbnik",
    "Baška",
    "Dobrinj",
    "Malinska",
    "Jelenje",
    "Costrena",
    "Kostrena",
    "Čavle",
    "Lopar",
    "Brod Moravice",
    "Mrkopalj",
    "Ravna Gora",
    "Lokve",
    "Skrad",
    "Fužine",
    "Novi Vinodolski",
    "Vinodol",
]
# Seed URLs per federation. The key becomes part of the stored `source`
# value ("savezi_<key>"); the list holds the pages each crawl starts from.
ROOTS = {
    "hns_nogomet": [
        "https://hns-cff.hr/",
        "https://prvahnl.hr/",
        "https://hns-cff.hr/klubovi/",
    ],
    "hks_kosarka": [
        "https://hks.hr/",
        "https://hks.hr/klubovi/",
    ],
    "hrs_rukomet": [
        "https://hrs.hr/",
        "https://hrs.hr/klubovi/",
    ],
    "hos_odbojka": [
        "https://hos.hr/",
        "https://hos.hr/klubovi/",
    ],
    "hbs_bocanje": [
        "https://hbs.hr/",
        "https://hbs.hr/klubovi/",
    ],
    "hvs_vaterpolo": [
        "https://hvs.hr/",
        "https://hvs.hr/klubovi/",
    ],
    "hps_plivanje": ["https://hps.hr/"],
    "haof_atletika": ["https://haaf.hr/"],
    "hgsf_gimnastika": ["https://hgsf.hr/"],
}
def _decode_body(raw, declared=None):
    """Decode response bytes using the declared, sniffed, or default charset."""
    charset = declared
    if not charset:
        # No charset in the Content-Type header: look for a <meta> declaration
        # near the top of the document.
        m = re.search(rb'<meta[^>]+charset\s*=\s*["\']?\s*([\w.:-]+)', raw[:4096], re.I)
        if m:
            charset = m.group(1).decode("ascii", errors="ignore")
    try:
        return raw.decode(charset or "utf-8", errors="replace")
    except LookupError:
        # Unknown codec name declared by the server/page: fall back to UTF-8.
        return raw.decode("utf-8", errors="replace")


def fetch(url, timeout=20, retries=2):
    """Download *url* and return ``(text, status)``.

    The body is decoded with the charset from the Content-Type header, else a
    ``<meta charset>`` declaration, else UTF-8; undecodable bytes are replaced.
    Honouring the declared charset matters because the relevance filter looks
    for names with Croatian diacritics, which a forced UTF-8 decode of a
    Latin-2 / cp1250 page would destroy.

    Args:
        url: Address to request; sent with the module-level ``UA`` header.
        timeout: Per-attempt timeout in seconds.
        retries: Maximum number of attempts.

    Returns:
        ``(text, status)`` on success. ``(None, code)`` for an HTTP 4xx
        response other than 408/429, which is not retried. ``(None, 0)`` once
        all attempts have failed. This function never raises.
    """
    last_error = None
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
                status = r.status
                declared = r.headers.get_content_charset()
        except urllib.error.HTTPError as e:
            # urllib.error is loaded as a side effect of importing urllib.request.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                # Permanent client error: retrying only wastes time.
                log.warning("fetch %s: HTTP %s", url, e.code)
                return None, e.code
            last_error = e
        except Exception as e:  # best-effort crawler: never propagate
            last_error = e
        else:
            return _decode_body(raw, declared), status
        # Linear back-off, but only when another attempt will follow.
        if attempt + 1 < retries:
            time.sleep(2 * (attempt + 1))
    log.warning("fetch %s failed: %s", url, last_error)
    return None, 0
def extract_text(html):
    """Return the visible text of an HTML document as one normalised line.

    Script blocks, style blocks and HTML comments are dropped together with
    their contents, remaining tags are removed, entities are unescaped and
    whitespace runs collapse to single spaces.

    Args:
        html: Raw HTML markup; ``None`` or an empty string is accepted.

    Returns:
        The stripped plain text, or ``""`` when there is none.
    """
    h = html or ""
    # Remove non-content elements including their bodies. The earlier patterns
    # were empty strings, which made these substitutions no-ops and let
    # JavaScript/CSS source leak into the extracted text.
    h = re.sub(r"<script\b[^>]*>.*?</script\s*>", " ", h, flags=re.S | re.I)
    h = re.sub(r"<style\b[^>]*>.*?</style\s*>", " ", h, flags=re.S | re.I)
    h = re.sub(r"<!--.*?-->", " ", h, flags=re.S)
    # Replace each remaining tag with a space so adjacent words stay separated.
    t = re.sub(r"<[^>]+>", " ", h)
    return re.sub(r"\s+", " ", unescape(t)).strip()
def is_pgz_relevant(text):
    """Tell whether *text* mentions a PGŽ place name or the county itself.

    Matching is a plain, case-sensitive substring test against every entry of
    PGZ_TOWNS plus the literals "Primorsko-goranska" and "PGŽ".
    """
    for town in PGZ_TOWNS:
        if town in text:
            return True
    if "Primorsko-goranska" in text:
        return True
    return "PGŽ" in text
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned whole (an empty string gives ``[]``).
    Longer text is cut at the last sentence separator found in the second
    half of each window, or hard at *max_len* when there is none. Pieces are
    stripped, and pieces of 80 characters or fewer are discarded.
    """
    total = len(text)
    if total <= max_len:
        if text:
            return [text]
        return []

    separators = (". ", "! ", "? ", "\n")
    half = max_len // 2
    pieces = []
    pos = 0
    while pos < total:
        stop = min(pos + max_len, total)
        if stop < total:
            # Prefer a sentence boundary, but only past the window midpoint
            # so pieces do not become too short.
            for sep in separators:
                hit = text.rfind(sep, pos, stop)
                if hit > pos + half:
                    stop = hit + len(sep)
                    break
        pieces.append(text[pos:stop].strip())
        pos = stop
    return [piece for piece in pieces if len(piece) > 80]
def upsert(conn, facts, savez_key):
    """Insert *facts* into ``dabi.knowledge`` and return how many rows were new.

    Each fact is keyed by the MD5 of its text (``data_hash``); rows whose hash
    already exists are skipped by ``ON CONFLICT DO NOTHING``.

    Rows are executed one by one and ``cursor.rowcount`` is summed. After
    ``execute_batch`` the rowcount reflects only the last statement sent, so
    the previous batch call under-reported the number of inserted facts.

    Args:
        conn: Open DB-API connection. The caller in this file enables
            autocommit; nothing is committed here.
        facts: Iterable of dicts with a ``"fact"`` key and optional ``"url"``
            and ``"confidence"`` keys.
        savez_key: Federation key, stored as ``source = "savezi_<key>"``.

    Returns:
        Number of rows actually inserted. On a database error the failure is
        logged and the count reached so far is returned.
    """
    if not facts:
        return 0
    sql = ("INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)\n"
           "VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING")
    source = f"savezi_{savez_key}"
    rows = [
        (
            f["fact"],
            source,
            "pgz_sport_savezi",
            f.get("confidence", 0.82),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    inserted = 0
    cur = conn.cursor()
    try:
        for row in rows:
            cur.execute(sql, row)
            # rowcount is 1 for a new row, 0 for a conflict; guard against -1.
            inserted += max(cur.rowcount, 0)
    except Exception as e:
        log.error(f"upsert: {e}")
    finally:
        # Release the cursor on the error path as well.
        cur.close()
    return inserted
def crawl_savez(savez_key, urls, max_per=80):
    """Breadth-first crawl of one federation's site, storing PGŽ-relevant text.

    Starting from *urls*, pages are fetched in queue order. Links on the exact
    same hostname as the page they appear on are queued (so "www." variants
    count as a different host); at most 200 URLs are pending at any time.
    Pages whose text passes ``is_pgz_relevant`` are chunked and written
    through ``upsert``.

    Args:
        savez_key: Federation key used for logging and as the fact source.
        urls: Seed URLs.
        max_per: Maximum number of URLs to visit.

    Returns:
        Dict with keys ``savez``, ``visited``, ``pgz_relevant`` and ``facts``.

    Raises:
        Whatever ``psycopg2.connect`` raises when the database is unreachable.
    """
    log.info(f"=== {savez_key} ===")
    conn = psycopg2.connect(DSN)
    visited = set()
    total_facts = 0
    pgz_relevant = 0
    try:
        conn.autocommit = True
        queue = list(urls)
        # Every URL ever queued, so a link is never queued twice and
        # duplicates cannot exhaust the 200-slot queue.
        queued = set(queue)
        while queue and len(visited) < max_per:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, _ = fetch(url, timeout=15)
            # Politeness delay after every request, not only after pages that
            # turned out to be relevant.
            time.sleep(0.4)
            if not html:
                continue
            text = extract_text(html)
            if not text or len(text) < 100:
                continue
            # Add subpages
            base_host = urlparse(url).hostname or ""
            for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
                if len(queue) >= 200:
                    break
                try:
                    # Strip the fragment before deduplicating.
                    u = urljoin(url, m.group(1)).split("#")[0]
                    host = urlparse(u).hostname or ""
                except ValueError:
                    # A malformed href must not abort the whole crawl.
                    continue
                if host != base_host or u in queued:
                    continue
                queued.add(u)
                queue.append(u)
            # Only ingest PGŽ-relevant content
            if not is_pgz_relevant(text):
                continue
            pgz_relevant += 1
            facts = [{"fact": c, "url": url, "confidence": 0.82}
                     for c in chunk(text, 800) if len(c) > 100]
            total_facts += upsert(conn, facts, savez_key)
    finally:
        # Close the connection even when the crawl raises.
        conn.close()
    log.info(f" {savez_key}: visited={len(visited)} pgz_relevant={pgz_relevant} facts={total_facts}")
    return {"savez": savez_key, "visited": len(visited),
            "pgz_relevant": pgz_relevant, "facts": total_facts}
def main():
    """Crawl every federation in ROOTS and print a JSON summary to stdout.

    A failing federation is logged and recorded as an ``error`` entry so the
    remaining federations still run.
    """
    summary = []
    for savez_name, seed_urls in ROOTS.items():
        try:
            outcome = crawl_savez(savez_name, seed_urls, max_per=60)
        except Exception as e:
            log.error(f"{savez_name} fail: {e}")
            outcome = {"savez": savez_name, "error": str(e)}
        summary.append(outcome)
    # Error entries have no "facts" key and count as zero.
    grand_total = sum(entry.get("facts", 0) for entry in summary)
    print(json.dumps({"summary": summary, "total_facts": grand_total}))


# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__":
    main()