#!/usr/bin/env python3
"""B.3 — Full HBS league scraper into natjecanja + natjecanja_tablice.

For every league listed in ``LIGE`` the script downloads the standings page,
parses the standings table, upserts one row into ``pgz_sport.natjecanja`` and
replaces that competition's rows in ``pgz_sport.natjecanja_tablice``.
"""
import html as h
import os
import re
import time
from datetime import datetime

import psycopg2
import requests

# NOTE(review): the password is committed in source. It can be overridden with
# the RINET_DB_PASSWORD environment variable; the literal is kept only as a
# backward-compatible fallback and should be rotated and removed.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
UA = "RiNET-Civic/1.0 (https://rinet.one)"

sess = requests.Session()
sess.headers.update({"User-Agent": UA})

LIGE = [
    # (url, naziv, sezona, razina)
    ("https://hrvatski-bocarski-savez.hr/lige/i-hbl/?sezona=2025-2026", "I HBL", "2025/2026", "1.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-sjever/?sezona=2025-2026", "II HBL sjever", "2025/2026", "2.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-jug/?sezona=2025-2026", "II HBL jug", "2025/2026", "2.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-istra-primorje/?sezona=2025-2026", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-zagreb-slavonija/?sezona=2025-2026", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-srednja-dalmacija/?sezona=2025-2026", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-dubrovnik-neretva/?sezona=2025-2026", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/hrvatska-zenska-bocarska-liga/?sezona=2025-2026", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
    ("https://hrvatski-bocarski-savez.hr/lige/juniorska-liga/?sezona=2025-2026", "Juniorska liga", "2025/2026", "Juniorska"),
    ("https://hrvatski-bocarski-savez.hr/lige/kadetska-liga/?sezona=2025-2026", "Kadetska liga", "2025/2026", "Kadetska"),
    ("https://hrvatski-bocarski-savez.hr/lige/hrvatska-raffa-liga/?sezona=2026", "Hrvatska Raffa liga", "2026", "HRL"),
    ("https://hrvatski-bocarski-savez.hr/lige/hrvatska-petanque-liga/?sezona=2026", "Hrvatska Petanque liga", "2026", "HPL"),
]

# PGŽ club keywords - used to mark a competition as relevant. Matching is a
# plain lowercase substring test against each club name.
PGZ_KEYWORDS = [
    "pula", "istra", "poreč", "rijeka", "kastav", "krimeja", "podhum", "kukuljanovo", "zagon",
    "gornji kraj", "ladvić", "sveti rok", "klana", "sveti jakov", "jadranovo", "krk", "cres",
    "lošinj", "opatija", "lovran", "brod-moravice", "brod moravice", "skrad", "mošćenice",
    "mrkopalj", "fužine", "hreljin", "bakar", "kostrena", "cavle", "drenova", "srdoči",
    "vargon", "sušak", "novi vinodolski", "crikvenica", "selce", "grižane", "baška",
    "punat", "omišalj", "malinska", "vrbnik", "mali lošinj", "ščelo", "mrkopalj", "lokve",
    "delnice", "šparta", "mošćenička draga", "kraljevica", "trsat", "sušak", "plase", "matulji",
]

# Compiled once; \b keeps <tr>/<th>/<td> from also matching <track>/<thead>.
_ROW_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.DOTALL | re.IGNORECASE)
_CELL_RE = re.compile(r"<t[hd]\b[^>]*>(.*?)</t[hd]>", re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r"<[^>]+>")

# Name prefixes tried in front of the scraped club name during fuzzy matching.
_KLUB_PREFIXES = ["BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub"]


def fetch(url):
    """Download *url* with the shared session.

    Returns the response body as text on HTTP 200, otherwise ``None``.
    Network and protocol errors are reported on stdout and also yield
    ``None`` so one failing league does not abort the whole run.
    """
    try:
        r = sess.get(url, timeout=20)
    except requests.RequestException as exc:
        print(f"  fetch error: {exc}")
        return None
    return r.text if r.status_code == 200 else None


def _cell_texts(row_html):
    """Return the visible text of every <th>/<td> cell in one table row."""
    return [
        h.unescape(_TAG_RE.sub("", cell)).strip()
        for cell in _CELL_RE.findall(row_html)
    ]


def _cell_int(cells, idx, default=0):
    """Return ``cells[idx]`` as an int, or *default* if missing or not numeric.

    A leading '+', the Unicode minus sign and ',' separators are normalised
    first, so values such as '+12' and '−3' parse as 12 and -3.
    """
    if idx >= len(cells):
        return default
    text = cells[idx].replace("+", "").replace("−", "-").replace(",", "")
    try:
        return int(text)
    except ValueError:
        return default


def parse_table(html_text):
    """Parse an HBS league standings table.

    The first <tr> in *html_text* must be the header row and must contain a
    position column ("Poredak"/"Poz"); otherwise an empty list is returned.

    Returns a list of dicts with keys ``poz``, ``klub``, ``odigrano``,
    ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and ``bod``.
    Rows with fewer than five cells or without an integer position are
    skipped; missing or non-numeric statistic cells become 0.
    """
    rows = _ROW_RE.findall(html_text)
    if not rows:
        return []

    # First row is the header. Required column: Poredak / Poz.
    headers = _cell_texts(rows[0])
    if not any("oredak" in title or "Poz" in title for title in headers):
        return []

    out = []
    for row in rows[1:]:
        cells = _cell_texts(row)
        if len(cells) < 5:
            continue
        try:
            # Positions are sometimes rendered as "1." rather than "1".
            poz = int(cells[0].rstrip("."))
        except ValueError:
            continue
        out.append({
            "poz": poz,
            "klub": cells[1],
            "odigrano": _cell_int(cells, 2),
            "pobjede": _cell_int(cells, 3),
            "nerij": _cell_int(cells, 4),
            "porazi": _cell_int(cells, 5),
            "gz": _cell_int(cells, 6),
            "gp": _cell_int(cells, 7),
            "razl": _cell_int(cells, 8),
            "bod": _cell_int(cells, 9),
        })
    return out


def _like_escape(text):
    """Escape LIKE metacharacters so *text* is matched literally."""
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def find_klub_id(cur, naziv):
    """Find a klub_id by fuzzy match against pgz_sport.klubovi.

    Tries, in order: case-insensitive exact match, a "<prefix> <naziv>"
    substring match for each known club prefix, and finally a plain
    substring match. Returns the first matching id, or ``None``.

    An empty name returns ``None`` immediately; without that guard the
    substring pattern would be ``%%`` and match an arbitrary club.
    """
    naziv = (naziv or "").strip()
    if not naziv:
        return None

    # NOTE(review): id 4426 is excluded from every lookup; the reason is not
    # visible in this file — presumably a placeholder row, confirm.
    cur.execute(
        "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1",
        (naziv,),
    )
    hit = cur.fetchone()
    if hit:
        return hit[0]

    like_sql = (
        "SELECT id FROM pgz_sport.klubovi "
        "WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1"
    )
    escaped = _like_escape(naziv)
    # Prefixed patterns first (BK/MK/ŽBK ...), bare "contains" as last resort.
    patterns = [f"%{prefix} {escaped}%" for prefix in _KLUB_PREFIXES]
    patterns.append(f"%{escaped}%")
    for pattern in patterns:
        cur.execute(like_sql, (pattern,))
        hit = cur.fetchone()
        if hit:
            return hit[0]
    return None


# === MAIN ===
conn = psycopg2.connect(**DB)
try:
    cr = conn.cursor()

    # Find the boćanje (bocce) federation id.
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    r = cr.fetchone()
    savez_id = r[0] if r else None
    conn.commit()  # do not hold a transaction open across the HTTP fetches
    print(f"savez_id (HBS): {savez_id}")

    total_natj = 0
    total_redova = 0
    total_pgz_klub = 0

    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        # Throttle after every request, including failed ones.
        time.sleep(0.6)
        if not body:
            print("  fetch failed")
            continue
        rows = parse_table(body)
        if not rows:
            print("  no rows parsed")
            continue

        # Check PGZ relevance: any club name containing any keyword.
        is_pgz = any(
            any(kw in row["klub"].lower() for kw in PGZ_KEYWORDS)
            for row in rows
        )

        external_id = (
            url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/", "_")
            + "_"
            + sezona.replace("/", "_")
        )

        # One transaction per league: the upsert, the DELETE and all INSERTs
        # commit together, or roll back together if anything raises, so a
        # crash never leaves a league with an empty or partial table.
        with conn:
            cr.execute("""
                INSERT INTO pgz_sport.natjecanja
                  (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
                VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
                ON CONFLICT (source, external_id) DO UPDATE SET
                  updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
                RETURNING id
            """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
            natj_id = cr.fetchone()[0]
            total_natj += 1
            print(f"  natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")

            # Clear old data for this natjecanje + insert new.
            cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))
            for row in rows:
                kid = find_klub_id(cr, row["klub"])
                if kid:
                    total_pgz_klub += 1
                cr.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                      (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                      row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
                total_redova += 1

    print(f"\n=== TOTAL ===")
    print(f"  natjecanja: {total_natj}")
    print(f"  tablice rows: {total_redova}")
    print(f"  matched klub_id: {total_pgz_klub}")

    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f"  total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f"  total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    conn.close()