feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+179
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HVS Riznica - parse direct HTML, no Playwright. Extract champions per season."""
|
||||
import os
|
||||
import re, requests, psycopg2
|
||||
|
||||
# PostgreSQL connection settings. The password is read from the DB_PASSWORD
# environment variable, so a missing variable raises KeyError right here.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

conn = psycopg2.connect(**DB)
conn.autocommit = True  # no explicit commit() calls anywhere in the script
cu = conn.cursor()

# (display label, URL slug) pairs, one per riznica page to scrape.
CATEGORIES = [
    ("Prvenstvo Hrvatske - muškarci", "prvenstvo-hrvatske-muskarci"),
    ("Prvenstvo Hrvatske - žene", "prvenstvo-hrvatske-zene"),
    ("Kup Hrvatske - muškarci", "kup-hrvatske-muskarci"),
    ("Kup Hrvatske - žene", "kup-hrvatske-zene"),
    ("Trofej Toni Nardelli", "trofej-toni-nardelli"),
    ("Vaterpolist godine", "vaterpolist"),
    ("Vaterpolistica godine", "vaterpolistica"),
]

# Identify the scraper to the remote site on every request.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

# Running totals filled in by the scrape loop.
inserted_total = 0
all_winners = []
|
||||
# ---------------------------------------------------------------------------
# Scrape every riznica category page and store one row per (season, winner).
#
# Per category: fetch the page, print a few diagnostic counts, split the HTML
# into slide blocks, pull a name + year (+ optional image) from each block,
# resolve the name to a club id when possible, and insert the result into
# pgz_sport.klub_sezona.  A failure in one category is reported and does not
# stop the remaining categories.
# ---------------------------------------------------------------------------

# All patterns are compiled once here instead of on every loop iteration.

# Page-wide patterns: used only for the diagnostic counts printed per page.
_PAGE_SLIDE_RE = re.compile(
    r'<div class="riznica__slide[^"]*"[^>]*>(.+?)(?=<div class="riznica__slide|<section|<footer)',
    re.DOTALL,
)
_PAGE_COMP_NAME_RE = re.compile(
    r'<h2 class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<')
_PAGE_MEDAL_NAME_RE = re.compile(
    r'<h2 class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<')
_PAGE_YEAR_CURRENT_RE = re.compile(
    r'<h2 class="riznica__competitions__current"[^>]*>\s*([^<]+?)\s*<')
_PAGE_YEAR_SEASON_RE = re.compile(
    r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<')
_PAGE_YEAR_SINGLE_RE = re.compile(
    r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\.)\s*<')

# Per-slide patterns: used to build the records that get stored.
_SLIDE_SPLIT_RE = re.compile(r'<div class="riznica__slide(?:\s+is-visible)?">')
_SLIDE_NAME_RES = (
    re.compile(r'class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<'),
    re.compile(r'class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<'),
)
_SLIDE_YEAR_RES = (
    re.compile(r'class="riznica__competitions__current"[^>]*>\s*([0-9./]+)'),
    re.compile(r'class="riznica__competitions__name(?:[^"]*)"[^>]*>\s*([0-9./]+?)\s*<'),
)
_SLIDE_IMG_RE = re.compile(r'<img src="([^"]+)"')


def _first_capture(patterns, text):
    """Return the stripped group(1) of the first pattern giving non-blank text.

    Patterns are tried in order; a match whose capture is only whitespace is
    ignored so that a later pattern can still supply a value.  Returns None
    when no pattern produces usable text.
    """
    for pattern in patterns:
        match = pattern.search(text)
        if match:
            value = match.group(1).strip()
            if value:
                return value
    return None


def _print_page_diagnostics(html):
    """Print slide/name/year counts and small samples for one fetched page."""
    print(f" Slides: {len(_PAGE_SLIDE_RE.findall(html))}", flush=True)

    comp_names = _PAGE_COMP_NAME_RE.findall(html)
    medal_names = _PAGE_MEDAL_NAME_RE.findall(html)
    year_hits = (
        _PAGE_YEAR_CURRENT_RE.findall(html)
        + _PAGE_YEAR_SEASON_RE.findall(html)
        + _PAGE_YEAR_SINGLE_RE.findall(html)
    )
    # De-duplicate first, then strip and drop blanks (same order as before).
    all_years_sorted = sorted(y.strip() for y in set(year_hits) if y.strip())

    print(f" Champions: {len(comp_names)}, Medal names: {len(medal_names)}, "
          f"Years: {len(all_years_sorted)}", flush=True)
    if comp_names:
        print(f" Sample champ: {comp_names[:3]}")
    if medal_names:
        print(f" Sample medal: {medal_names[:3]}")
    if all_years_sorted:
        print(f" Years range: {all_years_sorted[0]} → {all_years_sorted[-1]}")


def _parse_slide_records(html):
    """Split *html* into slide blocks and return one dict per usable slide.

    Each dict has the keys "name", "year" and "img" (image URL or None).
    Slides without a non-blank name or without a year are skipped: a blank
    name would otherwise turn the club lookup into LIKE '%%', which matches
    every club and attributes the title to an arbitrary one.
    """
    records = []
    # Element 0 of the split is the markup before the first slide.
    for block in _SLIDE_SPLIT_RE.split(html)[1:]:
        name = _first_capture(_SLIDE_NAME_RES, block)
        year = _first_capture(_SLIDE_YEAR_RES, block)
        if name is None or year is None:
            continue
        img_match = _SLIDE_IMG_RE.search(block)
        records.append({
            "name": name,
            "year": year,
            "img": img_match.group(1) if img_match else None,
        })
    return records


def _escape_like(text):
    """Escape backslash, % and _ so *text* is matched literally by LIKE.

    Backslash is PostgreSQL's default LIKE escape character.
    """
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _find_klub_id(cursor, champ):
    """Return the id of the best-matching club for *champ*, or None.

    Matching is a case-insensitive substring (or exact) comparison on
    klubovi.naziv; active clubs and sport='vaterpolo' rows sort first.
    """
    cursor.execute("""SELECT id FROM pgz_sport.klubovi
        WHERE LOWER(naziv) LIKE LOWER(%s) OR LOWER(naziv) = LOWER(%s)
        ORDER BY
        CASE WHEN aktivan THEN 0 ELSE 1 END,
        CASE WHEN sport='vaterpolo' THEN 0 ELSE 1 END,
        id ASC LIMIT 1""",
                   (f'%{_escape_like(champ)}%', champ))
    row = cursor.fetchone()
    return row[0] if row else None


def _insert_title(cursor, klub_id, champ, year, label, url):
    """Insert one winner row; return True only when a new row was written.

    A conflicting row is left untouched (ON CONFLICT DO NOTHING).  A failed
    insert is reported and treated as "not inserted" instead of being
    swallowed silently.
    """
    try:
        cursor.execute("""INSERT INTO pgz_sport.klub_sezona
            (klub_id, klub_naziv, sezona, natjecanje, plasiranje, trofej, source, source_url)
            VALUES (%s, %s, %s, %s, 1, %s, 'hvs_riznica', %s)
            ON CONFLICT DO NOTHING""",
                       (klub_id, champ, year, label,
                        f'1. mjesto - {label} {year}', url))
    except Exception as e:
        print(f" INSERT failed for {year} / {champ}: {e}")
        return False
    return cursor.rowcount > 0


for label, slug in CATEGORIES:
    url = f"https://hvs.hr/riznica/{slug}/"
    print(f"\n=== {label} ({slug}) ===", flush=True)
    try:
        r = requests.get(url, headers=HDR, timeout=15)
        if r.status_code != 200:
            print(f" HTTP {r.status_code}")
            continue
        html = r.text

        _print_page_diagnostics(html)

        all_records = _parse_slide_records(html)
        print(f" Parsed records: {len(all_records)}", flush=True)
        for rec in all_records[:3]:
            print(f" {rec['year']}: {rec['name']}")

        # Insert into DB
        for rec in all_records:
            year = rec['year']
            champ = rec['name']
            # The lookup stays outside the insert's error handling: if it
            # fails, the category-level handler below reports it.
            klub_id = _find_klub_id(cu, champ)
            if _insert_title(cu, klub_id, champ, year, label, url):
                inserted_total += 1

        all_winners.append({"category": label, "count": len(all_records), "records": all_records})

    except Exception as e:
        # Category boundary: report and carry on with the next category.
        print(f" EXC: {e}")
|
||||
|
||||
def _print_ranking(cursor, heading, query):
    """Run *query*, print *heading*, then one "<count>× <name>" line per row.

    The query must return (name, count) rows.  The heading is printed only
    after the query has executed.
    """
    cursor.execute(query)
    print(heading)
    for club_name, titles in cursor.fetchall():
        print(f" {titles:>3}× {club_name}")


print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===")

# Audit log: record how many rows this run inserted and how many categories
# were configured.
audit_details = f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}'
cu.execute("""INSERT INTO pgz_sport.audit_feed
    (table_name, action, source, source_url, details)
    VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""",
           (audit_details,))

# Top winners: the 15 names with the most rows from this source.
_print_ranking(
    cu,
    "\n=== TOP HVS prvaci/medalisti ===",
    """SELECT klub_naziv, count(*) AS naslova
    FROM pgz_sport.klub_sezona
    WHERE source='hvs_riznica'
    GROUP BY klub_naziv
    ORDER BY count(*) DESC LIMIT 15""",
)

# PGŽ-relevant: only rows that were linked to a known club in klubovi.
_print_ranking(
    cu,
    "\n=== PGŽ klubovi sa HVS naslovima ===",
    """SELECT k.naziv, count(ks.*) AS naslova
    FROM pgz_sport.klub_sezona ks
    JOIN pgz_sport.klubovi k ON k.id = ks.klub_id
    WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL
    GROUP BY k.naziv
    ORDER BY count(*) DESC""",
)

conn.close()
|
||||
Reference in New Issue
Block a user