pgz-sport/scrapers/hvs_riznica.py

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""HVS Riznica - parse direct HTML, no Playwright. Extract champions per season."""
import os
import re, requests, psycopg2

# Connection parameters for psycopg2. The password is taken from the
# DB_PASSWORD environment variable; a missing variable raises KeyError here.
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': os.environ["DB_PASSWORD"],
}

# One shared connection and cursor for the whole script. Autocommit is on,
# so each statement is committed as soon as it executes.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cu = conn.cursor()

# HTTP identity sent with every request to hvs.hr.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

# (label, slug) pairs, in scrape order. The label is stored as the
# competition name; the slug is the last path segment of the riznica URL.
CATEGORIES = list({
    "Prvenstvo Hrvatske - muškarci": "prvenstvo-hrvatske-muskarci",
    "Prvenstvo Hrvatske - žene": "prvenstvo-hrvatske-zene",
    "Kup Hrvatske - muškarci": "kup-hrvatske-muskarci",
    "Kup Hrvatske - žene": "kup-hrvatske-zene",
    "Trofej Toni Nardelli": "trofej-toni-nardelli",
    "Vaterpolist godine": "vaterpolist",
    "Vaterpolistica godine": "vaterpolistica",
}.items())

# --- Page-level patterns (used only for the diagnostic counters) -------------
# Compiled once here instead of on every category iteration.
_SLIDE_BLOCK_RE = re.compile(
    r'<div class="riznica__slide[^"]*"[^>]*>(.+?)(?=<div class="riznica__slide|<section|<footer)',
    re.DOTALL,
)
_COMP_NAME_H2_RE = re.compile(r'<h2 class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<')
_MEDAL_NAME_H2_RE = re.compile(r'<h2 class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<')
_YEAR_CURRENT_H2_RE = re.compile(r'<h2 class="riznica__competitions__current"[^>]*>\s*([^<]+?)\s*<')
_YEAR_SEASON_H2_RE = re.compile(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<')
_YEAR_SINGLE_H2_RE = re.compile(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\.)\s*<')

# --- Per-slide patterns (used to build the records that get inserted) --------
_SLIDE_SPLIT_RE = re.compile(r'<div class="riznica__slide(?:\s+is-visible)?">')
_SLIDE_COMP_NAME_RE = re.compile(r'class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<')
_SLIDE_MEDAL_NAME_RE = re.compile(r'class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<')
_SLIDE_YEAR_CURRENT_RE = re.compile(r'class="riznica__competitions__current"[^>]*>\s*([0-9./]+)')
_SLIDE_YEAR_NAME_RE = re.compile(r'class="riznica__competitions__name(?:[^"]*)"[^>]*>\s*([0-9./]+?)\s*<')
_SLIDE_IMG_RE = re.compile(r'<img src="([^"]+)"')


def _print_page_diagnostics(page_html):
    """Print page-wide match counts and samples; has no effect on what is stored."""
    slides = _SLIDE_BLOCK_RE.findall(page_html)
    print(f"  Slides: {len(slides)}", flush=True)

    comp_names = _COMP_NAME_H2_RE.findall(page_html)
    medal_names = _MEDAL_NAME_H2_RE.findall(page_html)
    year_hits = (
        _YEAR_CURRENT_H2_RE.findall(page_html)
        + _YEAR_SEASON_H2_RE.findall(page_html)
        + _YEAR_SINGLE_H2_RE.findall(page_html)
    )
    # De-duplicate, drop blanks, and sort so the printed range is stable.
    all_years_sorted = sorted(y.strip() for y in set(year_hits) if y.strip())

    print(f"  Champions: {len(comp_names)}, Medal names: {len(medal_names)}, Years: {len(all_years_sorted)}", flush=True)
    if comp_names: print(f"  Sample champ: {comp_names[:3]}")
    if medal_names: print(f"  Sample medal: {medal_names[:3]}")
    if all_years_sorted: print(f"  Years range: {all_years_sorted[0]} → {all_years_sorted[-1]}")


def _parse_slide_records(page_html):
    """Return one ``{"name", "year", "img"}`` dict per slide that has a name and a year.

    The page is split on the slide ``<div>`` opener; the text before the first
    slide is discarded. Within a slide the competition name is preferred over
    the medal name, and the "current" year heading over the generic one.
    Slides whose name or year is empty after stripping are skipped: an empty
    name would make the club lookup pattern ``%%``, which matches every club.
    """
    records = []
    for blk in _SLIDE_SPLIT_RE.split(page_html)[1:]:
        name_m = _SLIDE_COMP_NAME_RE.search(blk) or _SLIDE_MEDAL_NAME_RE.search(blk)
        year_m = _SLIDE_YEAR_CURRENT_RE.search(blk) or _SLIDE_YEAR_NAME_RE.search(blk)
        if not (name_m and year_m):
            continue

        name = name_m.group(1).strip()
        year = year_m.group(1).strip()
        if not name or not year:
            continue

        img_m = _SLIDE_IMG_RE.search(blk)
        records.append({
            "name": name,
            "year": year,
            "img": img_m.group(1) if img_m else None,
        })
    return records


# Running totals for the whole run; read again by the summary code below.
inserted_total = 0
all_winners = []
for label, slug in CATEGORIES:
    url = f"https://hvs.hr/riznica/{slug}/"
    print(f"\n=== {label} ({slug}) ===", flush=True)
    try:
        r = requests.get(url, headers=HDR, timeout=15)
        if r.status_code != 200:
            print(f"  HTTP {r.status_code}"); continue
        html = r.text

        _print_page_diagnostics(html)

        all_records = _parse_slide_records(html)
        print(f"  Parsed records: {len(all_records)}", flush=True)
        for rec in all_records[:3]:
            print(f"    {rec['year']}: {rec['name']}")

        # Insert into DB
        for rec in all_records:
            year = rec['year']
            champ = rec['name']

            # Resolve the winner to a known club, if any: substring or exact
            # case-insensitive match, preferring active clubs, then water polo
            # clubs, then the lowest id. No match leaves klub_id as NULL.
            cu.execute("""SELECT id FROM pgz_sport.klubovi
                          WHERE LOWER(naziv) LIKE LOWER(%s) OR LOWER(naziv) = LOWER(%s)
                          ORDER BY
                            CASE WHEN aktivan THEN 0 ELSE 1 END,
                            CASE WHEN sport='vaterpolo' THEN 0 ELSE 1 END,
                            id ASC LIMIT 1""",
                       (f'%{champ}%', champ))
            kid_row = cu.fetchone()
            klub_id = kid_row[0] if kid_row else None

            try:
                cu.execute("""INSERT INTO pgz_sport.klub_sezona
                              (klub_id, klub_naziv, sezona, natjecanje, plasiranje, trofej, source, source_url)
                              VALUES (%s, %s, %s, %s, 1, %s, 'hvs_riznica', %s)
                              ON CONFLICT DO NOTHING""",
                           (klub_id, champ, year, label,
                            f'1. mjesto - {label} {year}', url))
                # rowcount is 0 when ON CONFLICT skipped an existing row.
                if cu.rowcount > 0:
                    inserted_total += 1
            except psycopg2.Error as e:
                # Best effort: keep going with the next record, but report the
                # failure instead of silently discarding it.
                print(f"  INSERT failed for {year} / {champ}: {e}", flush=True)

        all_winners.append({"category": label, "count": len(all_records), "records": all_records})

    except Exception as e:
        # Per-category boundary: a network or parsing failure must not stop
        # the remaining categories from being scraped.
        print(f"  EXC: {e}")

# Final tally of rows actually inserted across all categories.
print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===")

# Audit log: record this run in audit_feed, with the insert count and the
# number of categories scraped as a small JSON document cast to jsonb.
audit_details = f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}'
cu.execute("""INSERT INTO pgz_sport.audit_feed
              (table_name, action, source, source_url, details)
              VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""",
           (audit_details,))

# Top winners: the 15 names with the most rows from this source, matched to a
# club or not.
top_winners_sql = """SELECT klub_naziv, count(*) AS naslova
              FROM pgz_sport.klub_sezona
              WHERE source='hvs_riznica'
              GROUP BY klub_naziv
              ORDER BY count(*) DESC LIMIT 15"""
cu.execute(top_winners_sql)
print("\n=== TOP HVS prvaci/medalisti ===")
for klub_naziv, naslova in cu.fetchall():
    print(f"  {naslova:>3}× {klub_naziv}")

# PGŽ-relevant: only rows that were linked to a club in pgz_sport.klubovi,
# grouped by that club's own name.
linked_clubs_sql = """SELECT k.naziv, count(ks.*) AS naslova
              FROM pgz_sport.klub_sezona ks
              JOIN pgz_sport.klubovi k ON k.id = ks.klub_id
              WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL
              GROUP BY k.naziv
              ORDER BY count(*) DESC"""
cu.execute(linked_clubs_sql)
print("\n=== PGŽ klubovi sa HVS naslovima ===")
for naziv, naslova in cu.fetchall():
    print(f"  {naslova:>3}× {naziv}")

conn.close()