#!/usr/bin/env python3
"""Scraper for HBS (Hrvatski boćarski savez) - a bot for PGŽ clubs."""
import datetime as dt
import html
import json
import os
import re
import sys
import time
import traceback
import urllib.error
import urllib.request
from urllib.parse import urljoin

import psycopg2
from dotenv import load_dotenv

# auto-added by patch_scrapers_with_dotenv.sh
# Runs before DB is built below, because DB reads DB_PASSWORD from the
# environment at import time (presumably that variable lives in this file --
# TODO confirm).
load_dotenv('/opt/rinet-gpu/.env.master')

# Keyword arguments passed verbatim to psycopg2.connect().
# NOTE(review): the original passed `port` twice (6432 and 5432), which is a
# SyntaxError ("keyword argument repeated"). 6432 is kept here; switch to 5432
# if that was the intended port.
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ["DB_PASSWORD"],
)
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
DELAY = 1.2  # seconds; fetch() waits DELAY * 2 between retry attempts
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"


def log(msg):
    """Print a timestamped message and append it to LOG_FP.

    Writing to the log file is best-effort: if the file cannot be opened or
    written, the message is still printed and the failure is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        # A missing or unwritable log file must not stop the scraper.
        pass


def db():
    """Return a new psycopg2 connection built from DB, with autocommit on."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    return conn


def fetch(url, retries=2):
    """Download *url* and return its body decoded as UTF-8, or None.

    Args:
        url: Address to request.
        retries: Number of additional attempts after the first failure.

    Returns:
        The response text (undecodable bytes are replaced), or None when the
        server answers 404/410 or when every attempt has failed.
    """
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(
                url,
                headers={"User-Agent": UA, "Accept-Language": "hr,en"},
            )
            with urllib.request.urlopen(req, timeout=20) as r:
                return r.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            # 404/410 are returned as None without retrying or logging.
            if e.code in (404, 410):
                return None
            if attempt == retries:
                log(f"HTTP {e.code} {url}")
                return None
        except Exception as e:
            # Any other failure is retried; only the last one is logged.
            if attempt == retries:
                log(f"FETCH err {e} {url}")
                return None
        # Reached only after a failed, non-final attempt: wait, then retry.
        time.sleep(DELAY * 2)
    return None


# === KLUB PARSER ===
# NOTE(review): the visible source ends partway through parse_klub(). Its
# visible prefix is preserved verbatim on the next line so nothing is guessed
# or lost; restore the full definition from the original file.
# def parse_klub(h, slug): if not h: return None # Naziv - drugi h3 (prvi je uvijek "Fédération Croate de Boules") h3s = re.findall(r'