#!/usr/bin/env python3
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
import os, re, sys, time, json, html, traceback, datetime as dt
import urllib.request, urllib.error
from urllib.parse import urljoin
import psycopg2
# PostgreSQL connection settings, unpacked as keyword arguments into
# psycopg2.connect(). Every value can be overridden with an HBS_DB_*
# environment variable; the literals are fallbacks that keep the previous
# behaviour when no variable is set.
# NOTE(review): a real password is committed as the fallback below. Rotate it,
# supply it through HBS_DB_PASSWORD, and then remove the literal.
DB = dict(
    host=os.environ.get("HBS_DB_HOST", "localhost"),
    port=int(os.environ.get("HBS_DB_PORT", "5432")),
    dbname=os.environ.get("HBS_DB_NAME", "rinet_v3"),
    user=os.environ.get("HBS_DB_USER", "rinet"),
    password=os.environ.get("HBS_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
# Root of the scraped site; relative URLs found in pages are joined onto it.
BASE = "https://hrvatski-bocarski-savez.hr"
# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
# Base pause in seconds; fetch() sleeps for twice this value between retries.
DELAY = 1.2
# File that log() appends every message to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
def log(msg):
    """Print a timestamped message and append it to the log file.

    The line is always written to stdout (flushed immediately). Writing to
    LOG_FP is best-effort: if the file cannot be opened or written, the
    failure is ignored so that logging never interrupts the scraper.

    Args:
        msg: Text (or any object usable in an f-string) to record.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so non-ASCII text in a message is written regardless
        # of the process locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        # Only I/O and encoding failures are ignored; a bare ``except`` would
        # also hide KeyboardInterrupt/SystemExit and real programming errors.
        pass
def db():
    """Open a new database connection from the DB settings, in autocommit mode.

    Returns:
        The psycopg2 connection object, with ``autocommit`` set to True.
    """
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def fetch(url, retries=2):
    """Download *url* and return its body as text, or None on failure.

    Up to ``retries + 1`` attempts are made. HTTP 404 and 410 return None
    immediately without logging. Any other error is retried after a pause
    of ``DELAY * 2`` seconds; on the last attempt it is logged and None is
    returned.

    Args:
        url: Address to request.
        retries: Number of extra attempts after the first one.

    Returns:
        The response decoded as UTF-8 (undecodable bytes replaced), or None.
    """
    for attempt in range(retries + 1):
        out_of_attempts = attempt == retries
        try:
            request = urllib.request.Request(
                url,
                headers={"User-Agent": UA, "Accept-Language": "hr,en"},
            )
            with urllib.request.urlopen(request, timeout=20) as response:
                raw = response.read()
            return raw.decode("utf-8", errors="replace")
        except urllib.error.HTTPError as http_err:
            # Missing or permanently removed pages are not worth retrying.
            if http_err.code in (404, 410):
                return None
            if out_of_attempts:
                log(f"HTTP {http_err.code} {url}")
                return None
        except Exception as err:
            if out_of_attempts:
                log(f"FETCH err {err} {url}")
                return None
        # Reached only after a failed, non-final attempt.
        time.sleep(DELAY * 2)
    return None
# === KLUB PARSER ===
def parse_klub(h, slug):
if not h: return None
# Naziv - drugi h3 (prvi je uvijek "Fédération Croate de Boules")
h3s = re.findall(r'
]*>\s*([^<]+?)\s*
', h)
naziv = None
for cand in h3s:
cand = html.unescape(cand.strip())
if cand and 'Fédération' not in cand and 'sponzor' not in cand.lower() and len(cand) < 80:
naziv = cand; break
if not naziv: return None
# Logo
m = re.search(r'
]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
logo = urljoin(BASE, m.group(1)) if m else None
info = {}
# Bullets: Županija, Liga, Adresa, Sportske grane, Osoba za kontakt, E-mail, Tel/mob, OIB
for m in re.finditer(r'([^<]+?):\s*([^<\n]+?)(?:<|\n)', h):
key = m.group(1).strip().lower()
val = re.sub(r'<[^>]+>', '', m.group(2).strip())
val = html.unescape(val)
if 'županija' in key: info['zupanija'] = val
elif 'liga' in key: info['liga'] = val
elif 'adresa' in key: info['adresa'] = val
elif 'sportske grane' in key or 'sportska grana' in key: info['sportske_grane'] = val
elif 'osoba za kontakt' in key: info['kontakt_osoba'] = val
elif 'tel' in key: info['telefon'] = val
elif 'oib' in key: info['oib'] = val
# Igrači - pattern: N. E-XX-YY, Ime, YYYY.
igraci = []
for m in re.finditer(
r']*>\s*\d+\.\s*([A-Z][\d\-]+),\s*([^<]+),\s*(\d{4})\.?',
h
):
igraci.append({
"slug": m.group(1),
"iskaznica": m.group(2).strip(),
"ime_prezime": html.unescape(m.group(3).strip()),
"godina_rodenja": int(m.group(4))
})
# Voditelji ekipe (treneri) - tab #popis_voditelja_ekipe
voditelji = []
vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:|