#!/usr/bin/env python3
"""HBS (Hrvatski boćarski savez) scraper for PGŽ clubs.

This part of the module holds the configuration constants and three small
helpers: ``log`` (stdout plus best-effort file logging), ``db`` (autocommit
PostgreSQL connection) and ``fetch`` (HTTP GET with retries).
"""
import os
import re
import sys
import time
import json
import html
import traceback
import datetime as dt
import urllib.request
import urllib.error
from urllib.parse import urljoin

import psycopg2

# Keyword arguments handed unchanged to psycopg2.connect().
# The original call passed ``port`` twice (6432 and 5432); Python rejects a
# repeated keyword argument at compile time, so exactly one value can remain.
# NOTE(review): 6432 is kept on the assumption that it is the intended port —
# confirm against the deployment and switch to 5432 if that is the right one.
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ["DB_PASSWORD"],  # raises KeyError at import if unset
)
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
DELAY = 1.2  # seconds; fetch() waits DELAY * 2 between attempts
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"


def log(msg):
    """Print a timestamped message and append it to LOG_FP.

    Writing to the log file is best-effort: if the file cannot be opened or
    written, the message is still printed and the error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so non-ASCII text is written the same way regardless
        # of the process locale; errors="replace" keeps this from raising on
        # unencodable characters.
        with open(LOG_FP, "a", encoding="utf-8", errors="replace") as f:
            f.write(line + "\n")
    except OSError:
        # Missing directory, permissions, full disk, ...: logging to the file
        # must never abort the caller.
        pass


def db():
    """Open a new psycopg2 connection built from DB, with autocommit on."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn


def fetch(url, retries=2):
    """GET ``url`` and return the body decoded as UTF-8, or None on failure.

    Args:
        url: Absolute URL to request.
        retries: Number of additional attempts after the first one.

    Returns:
        The response text (undecodable bytes are replaced), or None when the
        server answers 404/410, or when every attempt failed. Only the
        failure of the final attempt is logged.
    """
    for attempt in range(retries + 1):
        is_last = attempt == retries
        try:
            req = urllib.request.Request(
                url, headers={"User-Agent": UA, "Accept-Language": "hr,en"}
            )
            with urllib.request.urlopen(req, timeout=20) as r:
                return r.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            # "Not found" / "gone" will not change on retry.
            if e.code in (404, 410):
                return None
            if is_last:
                log(f"HTTP {e.code} {url}")
                return None
        except Exception as e:
            if is_last:
                log(f"FETCH err {e} {url}")
                return None
        # Back off before the next attempt.
        time.sleep(DELAY * 2)
    # Only reachable when ``retries`` is negative and no attempt was made.
    return None


# NOTE(review): the line below is the original, still-collapsed beginning of
# parse_klub, kept verbatim because the definition continues on the following
# lines of the file; it needs to be re-flowed together with them.
# === KLUB PARSER === def parse_klub(h, slug): if not h: return None # Naziv - drugi h3 (prvi je uvijek "Fédération Croate de Boules") h3s = re.findall(r']*>\s*([^<]+?)\s*', h) naziv = None for cand in h3s: cand = html.unescape(cand.strip()) if cand and 'Fédération' not in cand and 'sponzor' not in cand.lower() and len(cand) < 80: naziv = cand; break if not naziv: return None # Logo m = re.search(r']+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h) logo = urljoin(BASE, m.group(1)) if m else None info = {} # Bullets: Županija, Liga, Adresa, Sportske grane, Osoba za kontakt, E-mail, Tel/mob, OIB for m in re.finditer(r'([^<]+?):\s*([^<\n]+?)(?:<|\n)', h): key = m.group(1).strip().lower() val = re.sub(r'<[^>]+>', '', 
m.group(2).strip()) val = html.unescape(val) if 'županija' in key: info['zupanija'] = val elif 'liga' in key: info['liga'] = val elif 'adresa' in key: info['adresa'] = val elif 'sportske grane' in key or 'sportska grana' in key: info['sportske_grane'] = val elif 'osoba za kontakt' in key: info['kontakt_osoba'] = val elif 'tel' in key: info['telefon'] = val elif 'oib' in key: info['oib'] = val # Igrači - pattern:
  • N. E-XX-YY, Ime, YYYY.
  • igraci = [] for m in re.finditer( r'
  • ]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*([^<]+),\s*(\d{4})\.?', h ): igraci.append({ "slug": m.group(1), "iskaznica": m.group(2).strip(), "ime_prezime": html.unescape(m.group(3).strip()), "godina_rodenja": int(m.group(4)) }) # Voditelji ekipe (treneri) - tab #popis_voditelja_ekipe voditelji = [] vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:|