feat: /api/v2/analiza/* endpoints - sport analytics backend

This commit is contained in:
Damir Radulic
2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
+162
View File
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
import os
import psycopg2, requests, re, html as h, time
from datetime import datetime
# Keyword arguments for psycopg2.connect(). The password comes from the
# DB_PASSWORD environment variable; a missing variable raises KeyError here.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# One shared HTTP session so every request carries the scraper's User-Agent.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
sess = requests.Session()
sess.headers["User-Agent"] = UA
# League standings pages to scrape. Each entry of LIGE is a tuple
# (url, naziv, sezona, razina); the URL's ``sezona`` query value is the
# season label with "/" replaced by "-".
_HBS_LIGA_URL = "https://hrvatski-bocarski-savez.hr/lige/{slug}/?sezona={sezona}"
LIGE = [
    (_HBS_LIGA_URL.format(slug=slug, sezona=sezona.replace("/", "-")), naziv, sezona, razina)
    for slug, naziv, sezona, razina in (
        ("i-hbl", "I HBL", "2025/2026", "1.HBL"),
        ("ii-hbl-sjever", "II HBL sjever", "2025/2026", "2.HBL"),
        ("ii-hbl-jug", "II HBL jug", "2025/2026", "2.HBL"),
        ("iii-hbl-istra-primorje", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
        ("iii-hbl-zagreb-slavonija", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
        ("iii-hbl-srednja-dalmacija", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
        ("iii-hbl-dubrovnik-neretva", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
        ("hrvatska-zenska-bocarska-liga", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
        ("juniorska-liga", "Juniorska liga", "2025/2026", "Juniorska"),
        ("kadetska-liga", "Kadetska liga", "2025/2026", "Kadetska"),
        ("hrvatska-raffa-liga", "Hrvatska Raffa liga", "2026", "HRL"),
        ("hrvatska-petanque-liga", "Hrvatska Petanque liga", "2026", "HPL"),
    )
]
# Lower-case name fragments used to mark relevance: a competition is flagged
# as PGŽ-relevant when any club name in its table contains one of them.
# Matching is done against the lower-cased scraped name, so diacritics matter;
# "čavle" is listed next to the ASCII spelling "cavle" for that reason.
PGZ_KEYWORDS = [
    "pula", "istra", "poreč", "rijeka", "kastav", "krimeja", "podhum", "kukuljanovo", "zagon",
    "gornji kraj", "ladvić", "sveti rok", "klana", "sveti jakov", "jadranovo", "krk", "cres",
    "lošinj", "opatija", "lovran", "brod-moravice", "brod moravice", "skrad", "mošćenice",
    "mrkopalj", "fužine", "hreljin", "bakar", "kostrena", "cavle", "čavle", "drenova", "srdoči",
    "vargon", "sušak", "novi vinodolski", "crikvenica", "selce", "grižane", "baška",
    "punat", "omišalj", "malinska", "vrbnik", "mali lošinj", "ščelo", "lokve",
    "delnice", "šparta", "mošćenička draga", "kraljevica", "trsat", "plase", "matulji",
]
def fetch(url):
    """Download *url* through the shared session and return the body text.

    Returns the response text when the server answers with HTTP 200.
    Returns None for any other status code and for request-level failures
    (timeout, connection error, invalid URL, ...), so callers can simply
    skip the page.
    """
    try:
        r = sess.get(url, timeout=20)
        return r.text if r.status_code == 200 else None
    except requests.RequestException:
        # Best-effort scrape: a failed download is reported as "no body".
        # Only requests' own errors are swallowed, so Ctrl-C and genuine
        # programming errors still surface.
        return None
# Normalises the goal-difference cell before int(): drops "+" and ","
# and maps the typographic minus (U+2212) and en dash (U+2013) to "-".
_RAZL_TRANSLATION = str.maketrans({"+": None, ",": None, "\u2212": "-", "\u2013": "-"})


def parse_table(html_text):
    """Parse an HBS league standings table out of a page's HTML.

    Every ``<tr>`` in *html_text* is considered. The first one is treated as
    the header and must contain a cell with "oredak" or "Poz" in its text,
    otherwise nothing is parsed. Cells are read positionally:
    0 position, 1 club name, 2 played, 3 wins, 4 draws, 5 losses,
    6 goals for, 7 goals against, 8 goal difference, 9 points.

    Returns a list of dicts with keys ``poz``, ``klub``, ``odigrano``,
    ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl``, ``bod``.
    Rows with fewer than five cells or a non-integer position are skipped;
    missing or non-numeric count cells default to 0. An empty list is
    returned when there are no rows or the header is not recognised.
    """
    tr_bodies = re.findall(r'<tr[^>]*>(.*?)</tr>', html_text, re.DOTALL)
    if not tr_bodies:
        return []

    def cell_texts(tr_body):
        # Inner HTML of each <th>/<td>, with tags removed, entities decoded
        # (e.g. &amp;, &nbsp;, &minus;) and surrounding whitespace trimmed.
        raw_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', tr_body, re.DOTALL)
        return [h.unescape(re.sub(r'<[^>]+>', '', cell)).strip() for cell in raw_cells]

    def count_at(cells, idx):
        # Non-negative integer at cells[idx]; 0 when absent or not all digits.
        if len(cells) > idx and cells[idx].isdigit():
            return int(cells[idx])
        return 0

    # The first row is the header; it must look like a standings table.
    headers = cell_texts(tr_bodies[0])
    if not any("oredak" in title or "Poz" in title for title in headers):
        return []

    out = []
    for tr_body in tr_bodies[1:]:
        cells = cell_texts(tr_body)
        if len(cells) < 5:
            continue
        try:
            poz = int(cells[0])
            naziv = cells[1]
            odigrano = count_at(cells, 2)
            pobjede = count_at(cells, 3)
            nerij = count_at(cells, 4)
            porazi = count_at(cells, 5)
            gz = count_at(cells, 6)
            gp = count_at(cells, 7)
            # Goal difference may carry an explicit "+" or a typographic
            # minus; normalise to something int() accepts.
            razl_str = cells[8].translate(_RAZL_TRANSLATION) if len(cells) > 8 else "0"
            try:
                razl = int(razl_str)
            except ValueError:
                razl = 0
            # Points may be negative, hence the lstrip before the digit check.
            bod = int(cells[9]) if len(cells) > 9 and cells[9].lstrip('-').isdigit() else 0
        except (ValueError, IndexError):
            # Not a data row (e.g. non-numeric position): skip it.
            continue
        out.append({
            "poz": poz, "klub": naziv, "odigrano": odigrano, "pobjede": pobjede,
            "nerij": nerij, "porazi": porazi, "gz": gz, "gp": gp, "razl": razl, "bod": bod,
        })
    return out
def find_klub_id(cur, naziv):
    """Look up a club id in pgz_sport.klubovi by progressively looser matching.

    Tried in order, returning the first hit:
      1. case-insensitive equality with *naziv*;
      2. substring match on "<prefix> <naziv>" for each known club prefix;
      3. substring match on *naziv* alone.

    Row id 4426 is excluded from every lookup (the reason is not visible in
    this file). *cur* is an open DB-API cursor. Returns the matching id, or
    None when nothing matches or *naziv* is empty/blank.
    """
    # A blank name would turn the substring patterns into "%BK %" / "%%",
    # which match an arbitrary club; treat it as "not found" instead.
    if not naziv or not naziv.strip():
        return None

    # Escape LIKE metacharacters so "%" / "_" in a scraped name are matched
    # literally (backslash is PostgreSQL's default LIKE escape character).
    like_naziv = naziv.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    # 1) Exact (case-insensitive) name.
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1", (naziv,))
    hit = cur.fetchone()
    if hit:
        return hit[0]

    # 2) Name preceded by a BK/MK/ŽBK-style prefix.
    for prefix in ["BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub"]:
        cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1",
                    (f"%{prefix} {like_naziv}%",))
        hit = cur.fetchone()
        if hit:
            return hit[0]

    # 3) Last resort: the name appears anywhere in the stored club name.
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{like_naziv}%",))
    hit = cur.fetchone()
    return hit[0] if hit else None
# === MAIN ===
# Scrape every league in LIGE, upsert it into pgz_sport.natjecanja and replace
# its standings in pgz_sport.natjecanja_tablice, then print summary counts.
conn = psycopg2.connect(**DB)
# NOTE(review): with autocommit on, each statement is committed on its own, so
# a failure between the DELETE and the last INSERT below leaves that league's
# standings partially written. Confirm whether that is acceptable.
conn.autocommit = True
try:
    cr = conn.cursor()

    # Find boćanje savez_id (stays None when no federation row matches).
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    r = cr.fetchone()
    savez_id = r[0] if r else None
    print(f"savez_id (HBS): {savez_id}")

    total_natj = 0
    total_redova = 0
    total_pgz_klub = 0  # rows for which find_klub_id() returned an id

    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body:
            print("  fetch failed")
            continue
        rows = parse_table(body)
        if not rows:
            print("  no rows parsed")
            continue

        # Check PGZ relevance: any club name containing any keyword.
        is_pgz = any(any(kw in row["klub"].lower() for kw in PGZ_KEYWORDS) for row in rows)

        # Insert natjecanje. The external id is the URL path after /lige/
        # plus the season, with "/" turned into "_".
        external_id = url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/","_") + "_" + sezona.replace("/","_")
        cr.execute("""
        INSERT INTO pgz_sport.natjecanja
        (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
        VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
        ON CONFLICT (source, external_id) DO UPDATE SET
        updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
        RETURNING id
        """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
        natj_id = cr.fetchone()[0]
        total_natj += 1
        print(f"  natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")

        # Clear old data for this natjecanje + insert new
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))
        for r in rows:
            kid = find_klub_id(cr, r["klub"])
            if kid:
                total_pgz_klub += 1
            cr.execute("""
            INSERT INTO pgz_sport.natjecanja_tablice
            (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
            gol_z, gol_p, gol_razlika, bodovi)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """, (natj_id, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                  r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
            total_redova += 1

        # Pause before the next league page is fetched.
        time.sleep(0.6)

    print(f"\n=== TOTAL ===")
    print(f"  natjecanja: {total_natj}")
    print(f"  tablice rows: {total_redova}")
    print(f"  matched klub_id: {total_pgz_klub}")

    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f"  total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f"  total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    # Release the connection even when a query or parse step raised.
    conn.close()