#!/usr/bin/env python3 """HVS Riznica - parse direct HTML, no Playwright. Extract champions per season.""" import os import re, requests, psycopg2 DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"]) conn = psycopg2.connect(**DB); conn.autocommit = True cu = conn.cursor() CATEGORIES = [ ("Prvenstvo Hrvatske - muškarci", "prvenstvo-hrvatske-muskarci"), ("Prvenstvo Hrvatske - žene", "prvenstvo-hrvatske-zene"), ("Kup Hrvatske - muškarci", "kup-hrvatske-muskarci"), ("Kup Hrvatske - žene", "kup-hrvatske-zene"), ("Trofej Toni Nardelli", "trofej-toni-nardelli"), ("Vaterpolist godine", "vaterpolist"), ("Vaterpolistica godine", "vaterpolistica"), ] UA = "RiNET-Civic/1.0 (https://rinet.one)" HDR = {"User-Agent": UA} inserted_total = 0 all_winners = [] for label, slug in CATEGORIES: url = f"https://hvs.hr/riznica/{slug}/" print(f"\n=== {label} ({slug}) ===", flush=True) try: r = requests.get(url, headers=HDR, timeout=15) if r.status_code != 200: print(f" HTTP {r.status_code}"); continue html = r.text # Extract slides - each riznica__slide block # Pattern: extract slide blocks with championship name + year # Each slide has competition name OR medal name (for Vaterpolist categories) slides_re = re.compile( r'
]*>(.+?)(?=
for current visible # Actually each slide contains its own competition details # Better approach: extract all competition name spans in order comp_names = re.findall(r'

]*>\s*([^<]+?)\s*<', html) # And champion images for Vaterpolist (medal__name) medal_names = re.findall(r'

]*>\s*([^<]+?)\s*<', html) # And years years = re.findall(r'

]*>\s*([^<]+?)\s*<', html) # Or in name spans years_alt = re.findall(r'

]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<', html) # Also single-year format (Trofej, Vaterpolist often have just single year) years_single = re.findall(r'

]*>\s*([0-9]{4}\.)\s*<', html) all_years = list(set(years + years_alt + years_single)) all_years_sorted = sorted([y.strip() for y in all_years if y.strip()]) print(f" Champions: {len(comp_names)}, Medal names: {len(medal_names)}, Years: {len(all_years_sorted)}", flush=True) if comp_names: print(f" Sample champ: {comp_names[:3]}") if medal_names: print(f" Sample medal: {medal_names[:3]}") if all_years_sorted: print(f" Years range: {all_years_sorted[0]} → {all_years_sorted[-1]}") # The names may be aligned with years sequentially # Use whichever list has names - prefer comp_names (championships) else medal_names (individual awards) names = comp_names if comp_names else medal_names names = [n.strip() for n in names if n.strip()] # The champions in HTML order represent the seasons in display order # Map them to years - assume index alignment with sorted years if same length # Otherwise, the page shows multiple slides — same name may repeat # For each non-empty name, create entry # Best guess: names list and years list are PARALLEL (same length, in order on page) # Pages show all-time history, so years_alt (with format) is most reliable # Smart: if len(names) matches len(all_years_sorted), pair them # Else create entries with name+year separately and link by index # Even better: each "slide" block contains 1 name + 1 year contextually nearby # Find pairs by extracting full slides and matching internal patterns slide_pat = re.compile( r'(?:

]*>\s*([^<]+?)\s*<|

]*>\s*([^<]+?)\s*<)' r'.*?(?:

]*>\s*([0-9./]+)|

]*>\s*([0-9./]+))', re.DOTALL ) # That regex too complex - use simpler split approach # Split by slide divs all_records = [] slide_blocks = re.split(r'
', html) for blk in slide_blocks[1:]: # Find name (champion or medal_name) name_m = re.search(r'class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<', blk) if not name_m: name_m = re.search(r'class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<', blk) year_m = re.search(r'class="riznica__competitions__current"[^>]*>\s*([0-9./]+)', blk) if not year_m: year_m = re.search(r'class="riznica__competitions__name(?:[^"]*)"[^>]*>\s*([0-9./]+?)\s*<', blk) # Image img_m = re.search(r' 0: inserted_total += 1 except Exception as e: pass all_winners.append({"category": label, "count": len(all_records), "records": all_records}) except Exception as e: print(f" EXC: {e}") print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===") # Audit log cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details) VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""", (f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}',)) # Top winners cu.execute("""SELECT klub_naziv, count(*) AS naslova FROM pgz_sport.klub_sezona WHERE source='hvs_riznica' GROUP BY klub_naziv ORDER BY count(*) DESC LIMIT 15""") print("\n=== TOP HVS prvaci/medalisti ===") for r in cu.fetchall(): print(f" {r[1]:>3}× {r[0]}") # PGŽ-relevant cu.execute("""SELECT k.naziv, count(ks.*) AS naslova FROM pgz_sport.klub_sezona ks JOIN pgz_sport.klubovi k ON k.id = ks.klub_id WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL GROUP BY k.naziv ORDER BY count(*) DESC""") print("\n=== PGŽ klubovi sa HVS naslovima ===") for r in cu.fetchall(): print(f" {r[1]:>3}× {r[0]}") conn.close()