1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
170 lines
6.5 KiB
Python
170 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
# File: pgz_savezi_deep.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/pgz_savezi_deep.py
# Purpose: Deep crawl of the main national sports federations ("savezi")
#          for PGŽ clubs
#   - HNS (football)   — hns-cff.hr, prvahnl.hr
#   - HKS (basketball) — hks.hr, abaliga.com
#   - HRS (handball)   — hrs.hr
#   - HOS (volleyball) — hos.hr
#   - HBS (bocce)      — hbs.hr
#   - HVS (water polo) — hvs.hr
#   All clubs + matches + results that are in PGŽ
# NOTE: the ROOTS table in this file also seeds hps.hr, haaf.hr and hgsf.hr,
#       and does not seed abaliga.com.
# ═══════════════════════════════════════════════════════════════════
|
|
"""Multi-savez deep scrape for PGŽ clubs."""
|
|
import os, sys, re, time, hashlib, logging, json
|
|
from urllib.parse import urljoin, urlparse
|
|
import urllib.request
|
|
from html import unescape
|
|
import psycopg2
|
|
from psycopg2.extras import execute_batch
|
|
|
|
# Process-wide logging setup: INFO level, timestamped lines tagged "[savezi]".
logging.basicConfig(
    format="%(asctime)s [savezi] %(message)s",
    level=logging.INFO,
)

# Logger shared by every function in this module.
log = logging.getLogger("savezi")
|
|
|
|
# PostgreSQL connection string. It can be overridden through the PGZ_SPORT_DSN
# environment variable so that credentials do not have to live in the source.
# SECURITY NOTE(review): the fallback below still embeds a database password in
# the file. Move it to the environment / a secrets store, rotate it, and then
# remove the fallback.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every HTTP request made by fetch().
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
|
|
|
|
# Place names in PGŽ (Primorsko-goranska županija). is_pgz_relevant() treats a
# page as relevant when any of these appears in its text as a substring.
PGZ_TOWNS = [
    "Rijeka",
    "Opatija",
    "Crikvenica",
    "Krk",
    "Cres",
    "Mali Lošinj",
    "Rab",
    "Delnice",
    "Vrbovsko",
    "Čabar",
    "Bakar",
    "Kraljevica",
    "Kastav",
    "Viškovo",
    "Klana",
    "Mošćenička Draga",
    "Lovran",
    "Matulji",
    "Omišalj",
    "Punat",
    "Vrbnik",
    "Baška",
    "Dobrinj",
    "Malinska",
    "Jelenje",
    "Costrena",
    "Kostrena",
    "Čavle",
    "Lopar",
    "Brod Moravice",
    "Mrkopalj",
    "Ravna Gora",
    "Lokve",
    "Skrad",
    "Fužine",
    "Novi Vinodolski",
    "Vinodol",
]
|
|
|
|
# Seed URLs per federation. The key is passed to crawl_savez() as `savez_key`
# and ends up in the stored `source` value ("savezi_<key>"), so keys must stay
# stable. Each list is the initial crawl queue for that federation.
ROOTS = {
    "hns_nogomet": [
        "https://hns-cff.hr/",
        "https://prvahnl.hr/",
        "https://hns-cff.hr/klubovi/",
    ],
    "hks_kosarka": [
        "https://hks.hr/",
        "https://hks.hr/klubovi/",
    ],
    "hrs_rukomet": [
        "https://hrs.hr/",
        "https://hrs.hr/klubovi/",
    ],
    "hos_odbojka": [
        "https://hos.hr/",
        "https://hos.hr/klubovi/",
    ],
    "hbs_bocanje": [
        "https://hbs.hr/",
        "https://hbs.hr/klubovi/",
    ],
    "hvs_vaterpolo": [
        "https://hvs.hr/",
        "https://hvs.hr/klubovi/",
    ],
    "hps_plivanje": [
        "https://hps.hr/",
    ],
    "haof_atletika": [
        "https://haaf.hr/",
    ],
    "hgsf_gimnastika": [
        "https://hgsf.hr/",
    ],
}
|
|
|
|
|
|
def fetch(url, timeout=20, retries=2):
    """Download *url* and return ``(text, status)``.

    The request is sent with the module's ``UA`` User-Agent. The body is
    decoded with the charset declared in the response's Content-Type header,
    falling back to UTF-8 when none is declared or the declared name is
    unknown; undecodable bytes are replaced rather than raising.

    Args:
        url: Absolute URL to download.
        timeout: Per-attempt timeout in seconds passed to ``urlopen``.
        retries: Total number of attempts. ``0`` makes no request at all.

    Returns:
        ``(text, status)`` on success, or ``(None, 0)`` when every attempt
        failed. This function never raises for network/HTTP errors.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                raw = resp.read()
                status = resp.status
                charset = resp.headers.get_content_charset() or "utf-8"
            try:
                return raw.decode(charset, errors="replace"), status
            except LookupError:
                # The server advertised a codec Python does not know.
                return raw.decode("utf-8", errors="replace"), status
        except Exception as e:
            # Deliberately broad: the crawler is best-effort and one bad URL
            # must not abort the run. The failure is logged, not hidden.
            log.warning(f"fetch {url} failed (attempt {attempt}/{retries}): {e}")
            if attempt < retries:
                # Linear back-off (2s, 4s, ...) only *between* attempts; there
                # is nothing to wait for after the last one.
                time.sleep(2 * attempt)
    return None, 0
|
|
|
|
|
|
def extract_text(html):
    """Reduce an HTML document to plain, whitespace-normalised text.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, every remaining tag is replaced by a space, HTML entities are
    decoded, and runs of whitespace are collapsed to a single space.

    Args:
        html: Raw HTML markup; ``None`` or ``""`` is treated as empty.

    Returns:
        The visible text as a single stripped line (possibly ``""``).
    """
    markup = html or ""
    # Scripts first, then styles: drop the element including its body.
    for tag in ("script", "style"):
        markup = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", markup, flags=re.S | re.I)
    # Replace tags with a space so adjacent text nodes do not run together.
    tagless = re.sub(r"<[^>]+>", " ", markup)
    decoded = unescape(tagless)
    collapsed = re.sub(r"\s+", " ", decoded)
    return collapsed.strip()
|
|
|
|
|
|
def is_pgz_relevant(text):
    """Tell whether *text* mentions PGŽ.

    The check is a case-sensitive substring search: it succeeds when any
    entry of ``PGZ_TOWNS`` occurs in *text*, or when *text* contains
    "Primorsko-goranska" or "PGŽ".

    Returns:
        ``True`` if any of those markers is present, otherwise ``False``.
    """
    for town in PGZ_TOWNS:
        if town in text:
            return True
    return "Primorsko-goranska" in text or "PGŽ" in text
|
|
|
|
|
|
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned unchanged as a single piece. Longer
    text is cut greedily; each cut is moved back to the end of a sentence
    separator (". ", "! ", "? ", newline — tried in that order) when one is
    found in the second half of the window. Pieces are stripped, and pieces
    of 80 characters or fewer produced by splitting are discarded.

    The splitting rules are intentionally unchanged from the previous
    implementation: upsert() hashes each piece to deduplicate rows, so
    different boundaries would re-insert already stored content.

    Args:
        text: Text to split; ``None`` or ``""`` yields ``[]``.
        max_len: Maximum piece length in characters; must be >= 1.

    Returns:
        List of text pieces, in document order.

    Raises:
        ValueError: If *max_len* is < 1 for non-empty text (previously this
            looped forever).
    """
    if not text:
        return []
    if max_len < 1:
        raise ValueError(f"max_len must be >= 1, got {max_len!r}")
    if len(text) <= max_len:
        return [text]

    separators = (". ", "! ", "? ", "\n")
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Only accept a sentence break past the middle of the window so
            # that pieces do not become needlessly short.
            min_break = start + max_len // 2
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos > min_break:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]
|
|
|
|
|
|
def upsert(conn, facts, savez_key):
    """Insert *facts* into ``dabi.knowledge`` and return how many were new.

    Each fact is stored with source ``savezi_<savez_key>``, category
    ``pgz_sport_savezi`` and the MD5 of its text as ``data_hash``. Rows whose
    ``data_hash`` already exists are skipped (``ON CONFLICT DO NOTHING``).

    The count comes from ``RETURNING`` rows collected over all pages. The
    previous ``execute_batch`` + ``cursor.rowcount`` combination did not
    report the total for the whole batch.

    Args:
        conn: Open psycopg2 connection. This function does not commit; the
            caller is expected to manage that (crawl_savez uses autocommit).
        facts: Iterable of dicts with key ``"fact"`` and optional ``"url"``
            and ``"confidence"`` (default 0.82).
        savez_key: Federation key used to build the ``source`` value.

    Returns:
        Number of rows actually inserted; 0 when *facts* is empty or the
        database reports an error (the error is logged).
    """
    if not facts:
        return 0

    # Function-scope import keeps this block self-contained; psycopg2 is
    # already a dependency of the module.
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            f"savezi_{savez_key}",
            "pgz_sport_savezi",
            f.get("confidence", 0.82),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"""
    try:
        # The context manager closes the cursor on success and on error.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur,
                sql,
                rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
        return len(inserted)
    except psycopg2.Error as e:
        log.error(f"upsert: {e}")
        return 0
|
|
|
|
|
|
def _same_host_links(html, page_url):
    """Return absolute, fragment-free http(s) links in *html* on *page_url*'s host."""
    # Static assets and binary downloads yield no useful text, but every
    # fetched URL counts against the crawl budget, so they are dropped here.
    skip_ext = (
        ".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".gif", ".svg",
        ".webp", ".pdf", ".zip", ".doc", ".docx", ".xls", ".xlsx",
        ".mp3", ".mp4", ".woff", ".woff2", ".ttf",
    )
    base_host = urlparse(page_url).hostname or ""
    links = []
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
        try:
            target = urljoin(page_url, m.group(1)).split("#")[0]
            parts = urlparse(target)
            host = parts.hostname or ""
        except ValueError:
            # Malformed href (e.g. broken IPv6 literal): skip it instead of
            # aborting the whole crawl.
            continue
        if parts.scheme not in ("http", "https") or host != base_host:
            continue
        if parts.path.lower().endswith(skip_ext):
            continue
        links.append(target)
    return links


def crawl_savez(savez_key, urls, max_per=80):
    """Breadth-first crawl of one federation's site, storing PGŽ-relevant text.

    Starting from *urls*, pages are fetched in queue order. Same-host links
    found on each page are appended to the queue (capped at 200 pending
    entries, each URL queued at most once). Pages whose text passes
    is_pgz_relevant() are split with chunk() and written through upsert().

    Args:
        savez_key: Federation key; used for logging and passed to upsert().
        urls: Seed URLs for the crawl.
        max_per: Maximum number of distinct URLs to visit.

    Returns:
        Dict with keys ``savez``, ``visited``, ``pgz_relevant`` and ``facts``
        (number of rows upsert() reported as inserted).

    Raises:
        Whatever ``psycopg2.connect`` raises if the database is unreachable.
    """
    from collections import deque

    log.info(f"=== {savez_key} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    visited = set()
    queue = deque(urls)
    queued = set(queue)  # every URL ever enqueued, to avoid duplicates
    total_facts = 0
    pgz_relevant = 0

    try:
        while queue and len(visited) < max_per:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)

            html, _ = fetch(url, timeout=15)
            # Politeness delay after *every* request. Previously it sat at the
            # end of the loop body and was skipped by each `continue`.
            time.sleep(0.4)
            if not html:
                continue

            text = extract_text(html)
            if len(text) < 100:
                continue

            # Enqueue sub-pages before the relevance check so that irrelevant
            # index pages still lead to relevant club pages.
            for link in _same_host_links(html, url):
                if len(queue) >= 200:
                    break
                if link not in queued:
                    queued.add(link)
                    queue.append(link)

            # Only ingest PGŽ-relevant content.
            if not is_pgz_relevant(text):
                continue
            pgz_relevant += 1

            facts = [
                {"fact": c, "url": url, "confidence": 0.82}
                for c in chunk(text, 800)
                if len(c) > 100
            ]
            total_facts += upsert(conn, facts, savez_key)
    finally:
        # Release the connection even if a page blows up mid-crawl.
        conn.close()

    log.info(f"  {savez_key}: visited={len(visited)} pgz_relevant={pgz_relevant} facts={total_facts}")
    return {
        "savez": savez_key,
        "visited": len(visited),
        "pgz_relevant": pgz_relevant,
        "facts": total_facts,
    }
|
|
|
|
|
|
def main():
    """Crawl every federation in ROOTS and print a JSON summary to stdout.

    Each federation is crawled with a budget of 60 pages. A failure in one
    federation is logged and recorded as ``{"savez": ..., "error": ...}``
    so the remaining federations still run. The printed object has the
    per-federation entries under ``summary`` and the overall number of
    inserted facts under ``total_facts``.
    """
    results = []
    for savez, urls in ROOTS.items():
        try:
            outcome = crawl_savez(savez, urls, max_per=60)
        except Exception as e:
            log.error(f"{savez} fail: {e}")
            outcome = {"savez": savez, "error": str(e)}
        results.append(outcome)

    # Error entries carry no "facts" key and therefore count as zero.
    total_facts = 0
    for entry in results:
        total_facts += entry.get("facts", 0)

    report = {"summary": results, "total_facts": total_facts}
    print(json.dumps(report))


if __name__ == "__main__":
    main()
|