Files
pgz-sport/scrapers/hbs_lige_scraper.py

166 lines
8.3 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
# The docstring above must be the first statement in the file to become the
# module's __doc__; placed after executable code it is only a discarded string.

# auto-added by patch_scrapers_with_dotenv.sh
# Kept ahead of the remaining imports so the variables from the env file are in
# os.environ before anything below (e.g. the DB password lookup) reads them.
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')

import html as h
import os
import re
import time
from datetime import datetime

import psycopg2
import requests
# Keyword arguments for psycopg2.connect(). The password comes from the
# environment; a missing DB_PASSWORD raises KeyError as soon as this runs.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# One shared HTTP session; every request made through it carries this
# User-Agent string.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
sess = requests.Session()
sess.headers["User-Agent"] = UA
# Every league page shares one URL shape; only the slug and the season differ.
_LIGA_URL = "https://hrvatski-bocarski-savez.hr/lige/{slug}/?sezona={sezona}"

# (slug, naziv, sezona, razina)
_LIGA_ROWS = (
    ("i-hbl", "I HBL", "2025/2026", "1.HBL"),
    ("ii-hbl-sjever", "II HBL sjever", "2025/2026", "2.HBL"),
    ("ii-hbl-jug", "II HBL jug", "2025/2026", "2.HBL"),
    ("iii-hbl-istra-primorje", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
    ("iii-hbl-zagreb-slavonija", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
    ("iii-hbl-srednja-dalmacija", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
    ("iii-hbl-dubrovnik-neretva", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
    ("hrvatska-zenska-bocarska-liga", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
    ("juniorska-liga", "Juniorska liga", "2025/2026", "Juniorska"),
    ("kadetska-liga", "Kadetska liga", "2025/2026", "Kadetska"),
    ("hrvatska-raffa-liga", "Hrvatska Raffa liga", "2026", "HRL"),
    ("hrvatska-petanque-liga", "Hrvatska Petanque liga", "2026", "HPL"),
)

# (url, naziv, sezona, razina)
# The season is written with "-" instead of "/" inside the query string.
LIGE = [
    (_LIGA_URL.format(slug=slug, sezona=sezona.replace("/", "-")), naziv, sezona, razina)
    for slug, naziv, sezona, razina in _LIGA_ROWS
]
# PGŽ club keywords - used to mark a competition as relevant.
# Entries are lower-case and are tested as substrings of the lower-cased club
# name, so diacritics must match what str.lower() produces: "Čavle" lowers to
# "čavle", which the ASCII spelling "cavle" alone never matches. Both spellings
# are kept. Exact duplicates were removed; they had no effect on matching.
# NOTE(review): short entries such as "krk" or "skrad" also match longer names
# (e.g. "krka", "skradin") — confirm that this is acceptable.
PGZ_KEYWORDS = [
    "pula", "istra", "poreč", "rijeka", "kastav", "krimeja", "podhum", "kukuljanovo", "zagon",
    "gornji kraj", "ladvić", "sveti rok", "klana", "sveti jakov", "jadranovo", "krk", "cres",
    "lošinj", "opatija", "lovran", "brod-moravice", "brod moravice", "skrad", "mošćenice",
    "mrkopalj", "fužine", "hreljin", "bakar", "kostrena", "cavle", "čavle", "drenova", "srdoči",
    "vargon", "sušak", "novi vinodolski", "crikvenica", "selce", "grižane", "baška",
    "punat", "omišalj", "malinska", "vrbnik", "mali lošinj", "ščelo", "lokve",
    "delnice", "šparta", "mošćenička draga", "kraljevica", "trsat", "plase", "matulji",
]
def fetch(url):
    """Download *url* through the shared session and return the body text.

    Returns the response text when the status code is 200. Returns None for
    any other status code and for any requests-level failure (connection
    error, timeout, invalid URL, ...), so callers can treat the download as
    best effort.
    """
    try:
        resp = sess.get(url, timeout=20)
    except requests.RequestException:
        # Only transport-level failures are swallowed; KeyboardInterrupt and
        # programming errors propagate instead of being hidden.
        return None
    return resp.text if resp.status_code == 200 else None
# Goal-difference clean-up: drop "+" and thousands commas, and map the Unicode
# minus sign / en dash to ASCII "-" (int() rejects the Unicode characters).
# Escapes are used so the characters cannot be lost or confused in an editor.
_RAZL_FIXUP = str.maketrans({'+': None, ',': None, '\u2212': '-', '\u2013': '-'})


def _cell_text(fragment):
    """Return a cell's visible text: tags removed, entities decoded, trimmed."""
    # Tags are stripped before decoding so an escaped "&lt;b&gt;" stays text.
    return h.unescape(re.sub(r'<[^>]+>', '', fragment)).strip()


def _int_or_zero(cells, idx, signed=False):
    """Return cells[idx] as int, or 0 if the column is absent or not numeric.

    With signed=True leading "-" characters are tolerated by the digit check.
    Raises ValueError when the text passes the digit check but int() still
    rejects it (e.g. "²" or "--5").
    """
    if idx >= len(cells):
        return 0
    text = cells[idx]
    probe = text.lstrip('-') if signed else text
    return int(text) if probe.isdigit() else 0


def _goal_difference(text):
    """Parse a goal-difference cell such as "+40" or "\u221240"; 0 if unparsable."""
    try:
        return int(text.translate(_RAZL_FIXUP))
    except ValueError:
        return 0


def parse_table(html_text):
    """Parse HBS liga tablica. Returns list of dicts.

    The first <tr> in *html_text* is taken as the header row and must contain
    a cell whose text includes "oredak" or "Poz"; otherwise [] is returned.
    Each following row with at least five cells and an integer in the first
    cell yields a dict with the keys poz, klub, odigrano, pobjede, nerij,
    porazi, gz, gp, razl and bod. Missing or non-numeric count columns become
    0. Rows that cannot be parsed are skipped.
    """
    if not html_text:
        return []
    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', html_text, re.DOTALL)
    if not rows:
        return []

    cell_pattern = r'<t[hd][^>]*>(.*?)</t[hd]>'

    # First row is header. Required: Poredak (or Poz...).
    headers = [_cell_text(c) for c in re.findall(cell_pattern, rows[0], re.DOTALL)]
    if not any("oredak" in title or "Poz" in title for title in headers):
        return []

    out = []
    for row in rows[1:]:
        cells = [_cell_text(c) for c in re.findall(cell_pattern, row, re.DOTALL)]
        if len(cells) < 5:
            continue
        try:
            out.append({
                "poz": int(cells[0]),
                "klub": cells[1],
                "odigrano": _int_or_zero(cells, 2),
                "pobjede": _int_or_zero(cells, 3),
                "nerij": _int_or_zero(cells, 4),
                "porazi": _int_or_zero(cells, 5),
                "gz": _int_or_zero(cells, 6),
                "gp": _int_or_zero(cells, 7),
                "razl": _goal_difference(cells[8]) if len(cells) > 8 else 0,
                "bod": _int_or_zero(cells, 9, signed=True),
            })
        except ValueError:
            # Non-integer position (e.g. a repeated header row): skip the row.
            continue
    return out
def find_klub_id(cur, naziv):
    """Find klub_id by fuzzy match against pgz_sport.klubovi.

    Lookups are tried in order and the first hit wins:
      1. case-insensitive exact name,
      2. name preceded by a club prefix ("BK", "MK", "ŽBK", ...) anywhere in
         the stored name,
      3. name contained anywhere in the stored name.

    Args:
        cur: an open DB-API cursor (execute / fetchone).
        naziv: club name as scraped from the standings table.

    Returns:
        The matching id, or None when nothing matches or *naziv* is blank.
    """
    name = (naziv or "").strip()
    if not name:
        # A blank name would become the pattern "%%", which matches every
        # club and would return an arbitrary id.
        return None

    # Scraped text must not act as LIKE wildcards; backslash is PostgreSQL's
    # default LIKE escape character.
    like_safe = name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    # NOTE(review): id 4426 is excluded by every lookup; the reason is not
    # visible in this file — confirm it is still intended.
    # ORDER BY makes the LIMIT 1 choice deterministic when several rows match.
    exact_sql = ("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) "
                 "AND id != 4426 ORDER BY id LIMIT 1")
    like_sql = ("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) "
                "AND id != 4426 ORDER BY id LIMIT 1")

    attempts = [(exact_sql, name)]
    # Try with BK/MK/ŽBK prefix
    attempts += [
        (like_sql, f"%{prefix} {like_safe}%")
        for prefix in ("BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub")
    ]
    # Last try: contains
    attempts.append((like_sql, f"%{like_safe}%"))

    for sql, param in attempts:
        cur.execute(sql, (param,))
        hit = cur.fetchone()
        if hit:
            return hit[0]
    return None
# === MAIN ===
# The connection stays in psycopg2's default transactional mode (no
# autocommit) so each league's upsert + DELETE + re-INSERT is committed as one
# unit; a failure part-way through rolls back and leaves the previously stored
# standings untouched. The connection is closed even when an error escapes.
conn = psycopg2.connect(**DB)
try:
    cr = conn.cursor()

    # Find the bocce federation (savez) id; stays None when no row matches.
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    found = cr.fetchone()
    savez_id = found[0] if found else None
    conn.commit()  # end the read transaction before the slow HTTP phase
    print(f"savez_id (HBS): {savez_id}")

    total_natj = 0
    total_redova = 0
    total_pgz_klub = 0  # table rows whose club name resolved to a klub_id

    for idx, (url, naziv, sezona, razina) in enumerate(LIGE):
        if idx:
            # Pause between HTTP requests, including after a failed fetch or
            # an empty parse.
            time.sleep(0.6)

        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = parse_table(body)
        if not rows:
            # Skipping here also means the stored rows are not deleted.
            print(" no rows parsed")
            continue

        # Check PGZ relevance: any club name containing any keyword.
        is_pgz = any(kw in row["klub"].lower() for row in rows for kw in PGZ_KEYWORDS)

        # external_id = URL path after /lige/ (slashes -> "_") + "_" + season.
        external_id = (
            url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/", "_")
            + "_" + sezona.replace("/", "_")
        )

        try:
            # Insert natjecanje (or refresh it when it already exists).
            cr.execute("""
                INSERT INTO pgz_sport.natjecanja
                (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
                VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
                ON CONFLICT (source, external_id) DO UPDATE SET
                updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
                RETURNING id
                """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
            natj_id = cr.fetchone()[0]
            total_natj += 1
            print(f" natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")

            # Clear old data for this natjecanje + insert new
            cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))
            for row in rows:
                kid = find_klub_id(cr, row["klub"])
                if kid:
                    total_pgz_klub += 1
                cr.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                    (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
                    gol_z, gol_p, gol_razlika, bodovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                          row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
                total_redova += 1
            conn.commit()
        except Exception:
            # Undo the partial refresh, then let the error surface as before.
            conn.rollback()
            raise

    print(f"\n=== TOTAL ===")
    print(f" natjecanja: {total_natj}")
    print(f" tablice rows: {total_redova}")
    print(f" matched klub_id: {total_pgz_klub}")

    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f" total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f" total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    conn.close()