Files
pgz-sport/scrapers/hvs_riznica.py
T

183 lines
8.5 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""HVS Riznica - parse direct HTML, no Playwright. Extract champions per season."""
# The docstring above has to be the first statement of the module; placed
# after the dotenv lines it was only a discarded string expression.

# Load the shared .env file before anything reads os.environ below
# (presumably this is where DB_PASSWORD comes from - confirm against the file).
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
import os
import re

import psycopg2
import requests

# Connection settings for the local PostgreSQL database.
# Raises KeyError at start-up if DB_PASSWORD is not set in the environment.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
          user='rinet', password=os.environ["DB_PASSWORD"])

# One connection and one cursor are shared by the whole script.
conn = psycopg2.connect(**DB)
# Autocommit: every statement is committed on its own, so a failed INSERT
# does not leave the connection in an aborted transaction.
conn.autocommit = True
cu = conn.cursor()
# Pages to scrape, as (label, slug) pairs. The slug is appended to
# https://hvs.hr/riznica/ to build the page URL; the label is printed and
# written to the database together with every record found on that page.
CATEGORIES = [
    ("Prvenstvo Hrvatske - muškarci", "prvenstvo-hrvatske-muskarci"),
    ("Prvenstvo Hrvatske - žene", "prvenstvo-hrvatske-zene"),
    ("Kup Hrvatske - muškarci", "kup-hrvatske-muskarci"),
    ("Kup Hrvatske - žene", "kup-hrvatske-zene"),
    ("Trofej Toni Nardelli", "trofej-toni-nardelli"),
    ("Vaterpolist godine", "vaterpolist"),
    ("Vaterpolistica godine", "vaterpolistica"),
]

# HTTP headers sent with every page request.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

# Running totals filled in by the scrape loop:
# number of rows actually inserted, and one summary dict per category.
inserted_total = 0
all_winners = []
# --- Scrape every category page and store one row per (season, winner) ------
#
# For each (label, slug) in CATEGORIES the page is downloaded, split into
# "riznica__slide" blocks, and every block that yields both a name and a
# year becomes one row in pgz_sport.klub_sezona.

# All patterns are compiled once here instead of on every loop iteration.

# Page-wide patterns, used only for the diagnostic counts printed per page.
_SLIDE_RE = re.compile(
    r'<div class="riznica__slide[^"]*"[^>]*>(.+?)(?=<div class="riznica__slide|<section|<footer)',
    re.DOTALL,
)
_PAGE_COMP_NAME_RE = re.compile(r'<h2 class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<')
_PAGE_MEDAL_NAME_RE = re.compile(r'<h2 class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<')
_PAGE_YEAR_CURRENT_RE = re.compile(r'<h2 class="riznica__competitions__current"[^>]*>\s*([^<]+?)\s*<')
_PAGE_YEAR_RANGE_RE = re.compile(
    r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<'
)
_PAGE_YEAR_SINGLE_RE = re.compile(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\.)\s*<')

# Per-slide patterns, used to build the records that are actually stored.
_SLIDE_SPLIT_RE = re.compile(r'<div class="riznica__slide(?:\s+is-visible)?">')
_BLOCK_COMP_NAME_RE = re.compile(r'class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<')
_BLOCK_MEDAL_NAME_RE = re.compile(r'class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<')
_BLOCK_YEAR_CURRENT_RE = re.compile(r'class="riznica__competitions__current"[^>]*>\s*([0-9./]+)')
_BLOCK_YEAR_NAME_RE = re.compile(r'class="riznica__competitions__name(?:[^"]*)"[^>]*>\s*([0-9./]+?)\s*<')
_BLOCK_IMG_RE = re.compile(r'<img src="([^"]+)"')

# Looks up a club whose name contains (or equals) the scraped name,
# preferring active clubs, then water-polo clubs, then the lowest id.
_FIND_CLUB_SQL = """SELECT id FROM pgz_sport.klubovi
WHERE LOWER(naziv) LIKE LOWER(%s) OR LOWER(naziv) = LOWER(%s)
ORDER BY
CASE WHEN aktivan THEN 0 ELSE 1 END,
CASE WHEN sport='vaterpolo' THEN 0 ELSE 1 END,
id ASC LIMIT 1"""

# Rows that already exist are skipped by ON CONFLICT DO NOTHING.
_INSERT_SEASON_SQL = """INSERT INTO pgz_sport.klub_sezona
(klub_id, klub_naziv, sezona, natjecanje, plasiranje, trofej, source, source_url)
VALUES (%s, %s, %s, %s, 1, %s, 'hvs_riznica', %s)
ON CONFLICT DO NOTHING"""


def _print_page_diagnostics(page):
    """Print page-wide counts of slides, names and years found in *page*.

    Purely informational: nothing computed here is stored or returned.

    Args:
        page: HTML text of one category page.
    """
    print(f" Slides: {len(_SLIDE_RE.findall(page))}", flush=True)
    comp_names = _PAGE_COMP_NAME_RE.findall(page)
    medal_names = _PAGE_MEDAL_NAME_RE.findall(page)
    raw_years = (
        _PAGE_YEAR_CURRENT_RE.findall(page)
        + _PAGE_YEAR_RANGE_RE.findall(page)
        + _PAGE_YEAR_SINGLE_RE.findall(page)
    )
    # De-duplicate, drop blanks, sort lexicographically.
    years = sorted({y.strip() for y in raw_years} - {""})
    print(
        f" Champions: {len(comp_names)}, Medal names: {len(medal_names)}, Years: {len(years)}",
        flush=True,
    )
    if comp_names:
        print(f" Sample champ: {comp_names[:3]}")
    if medal_names:
        print(f" Sample medal: {medal_names[:3]}")
    if years:
        # The two values used to be printed back to back with no separator.
        print(f" Years range: {years[0]} - {years[-1]}")


def _parse_slide_records(page):
    """Extract one record per slide block of *page*.

    The page is split on the slide ``<div>`` opening tag; the text before the
    first slide is ignored. Within a block the competition name is preferred
    over the medal name, and the "current" year heading over the
    "competitions name" heading. Blocks lacking a name or a year are skipped.

    Args:
        page: HTML text of one category page.

    Returns:
        A list of dicts with keys ``"name"``, ``"year"`` and ``"img"``
        (``img`` is the first ``<img src>`` in the block, or ``None``).
    """
    records = []
    for block in _SLIDE_SPLIT_RE.split(page)[1:]:
        name_m = _BLOCK_COMP_NAME_RE.search(block) or _BLOCK_MEDAL_NAME_RE.search(block)
        year_m = _BLOCK_YEAR_CURRENT_RE.search(block) or _BLOCK_YEAR_NAME_RE.search(block)
        if not (name_m and year_m):
            continue
        name = name_m.group(1).strip()
        if not name:
            # A whitespace-only heading would turn the club lookup into
            # LIKE '%%', which matches every club; never store such a row.
            continue
        img_m = _BLOCK_IMG_RE.search(block)
        records.append({
            "name": name,
            "year": year_m.group(1).strip(),
            "img": img_m.group(1) if img_m else None,
        })
    return records


for label, slug in CATEGORIES:
    url = f"https://hvs.hr/riznica/{slug}/"
    print(f"\n=== {label} ({slug}) ===", flush=True)
    # Per-category boundary: whatever goes wrong on one page (network error,
    # failed club lookup, ...) is reported and the next category still runs.
    try:
        resp = requests.get(url, headers=HDR, timeout=15)
        if resp.status_code != 200:
            print(f" HTTP {resp.status_code}")
            continue
        page = resp.text

        _print_page_diagnostics(page)
        all_records = _parse_slide_records(page)
        print(f" Parsed records: {len(all_records)}", flush=True)
        for rec in all_records[:3]:
            print(f" {rec['year']}: {rec['name']}")

        for rec in all_records:
            year = rec["year"]
            champ = rec["name"]

            # Resolve the scraped name to a club id; None when nothing matches.
            cu.execute(_FIND_CLUB_SQL, (f"%{champ}%", champ))
            kid_row = cu.fetchone()
            klub_id = kid_row[0] if kid_row else None

            try:
                cu.execute(
                    _INSERT_SEASON_SQL,
                    (klub_id, champ, year, label, f"1. mjesto - {label} {year}", url),
                )
            except psycopg2.Error as exc:
                # Still best-effort (one bad row must not stop the run), but
                # the failure is now reported instead of silently dropped.
                print(f" DB insert failed ({year}: {champ}): {exc}", flush=True)
            else:
                # rowcount is 0 when ON CONFLICT skipped an existing row.
                if cu.rowcount > 0:
                    inserted_total += 1

        all_winners.append({"category": label, "count": len(all_records), "records": all_records})
    except Exception as e:
        print(f" EXC: {e}")
# --- Run summary, audit row and title reports -------------------------------
print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===")

# Record this run in the audit feed; the details payload is a small JSON
# object holding the number of inserted rows and the number of categories.
audit_details = f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}'
cu.execute("""INSERT INTO pgz_sport.audit_feed
(table_name, action, source, source_url, details)
VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""",
           (audit_details,))


def _report_title_counts(query, heading):
    """Run *query* on the shared cursor and print its (name, count) rows.

    The query is executed first, then *heading* is printed, then one line
    per result row with the count right-aligned to width 3.
    """
    cu.execute(query)
    print(heading)
    for naziv, naslova in cu.fetchall():
        print(f" {naslova:>3}× {naziv}")


# Top 15 names by number of rows stored from this source.
_report_title_counts("""SELECT klub_naziv, count(*) AS naslova
FROM pgz_sport.klub_sezona
WHERE source='hvs_riznica'
GROUP BY klub_naziv
ORDER BY count(*) DESC LIMIT 15""", "\n=== TOP HVS prvaci/medalisti ===")

# Same count, restricted to rows whose klub_id was resolved to a known club.
_report_title_counts("""SELECT k.naziv, count(ks.*) AS naslova
FROM pgz_sport.klub_sezona ks
JOIN pgz_sport.klubovi k ON k.id = ks.klub_id
WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL
GROUP BY k.naziv
ORDER BY count(*) DESC""", "\n=== PGŽ klubovi sa HVS naslovima ===")

conn.close()