#!/usr/bin/env python3 from dotenv import load_dotenv load_dotenv('/opt/rinet-gpu/.env.master') # auto-added by patch_scrapers_with_dotenv.sh """HVS Riznica - parse direct HTML, no Playwright. Extract champions per season.""" import os import re, requests, psycopg2 DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"]) conn = psycopg2.connect(**DB); conn.autocommit = True cu = conn.cursor() CATEGORIES = [ ("Prvenstvo Hrvatske - muškarci", "prvenstvo-hrvatske-muskarci"), ("Prvenstvo Hrvatske - žene", "prvenstvo-hrvatske-zene"), ("Kup Hrvatske - muškarci", "kup-hrvatske-muskarci"), ("Kup Hrvatske - žene", "kup-hrvatske-zene"), ("Trofej Toni Nardelli", "trofej-toni-nardelli"), ("Vaterpolist godine", "vaterpolist"), ("Vaterpolistica godine", "vaterpolistica"), ] UA = "RiNET-Civic/1.0 (https://rinet.one)" HDR = {"User-Agent": UA} inserted_total = 0 all_winners = [] for label, slug in CATEGORIES: url = f"https://hvs.hr/riznica/{slug}/" print(f"\n=== {label} ({slug}) ===", flush=True) try: r = requests.get(url, headers=HDR, timeout=15) if r.status_code != 200: print(f" HTTP {r.status_code}"); continue html = r.text # Extract slides - each riznica__slide block # Pattern: extract slide blocks with championship name + year # Each slide has competition name OR medal name (for Vaterpolist categories) slides_re = re.compile( r'