162 lines
8.2 KiB
Python
Executable File
162 lines
8.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
|
||
import html as h
import os
import re
import time
from datetime import datetime

import psycopg2
import requests
|
||
|
||
# Connection parameters passed verbatim to psycopg2.connect().
# SECURITY NOTE(review): the password was previously available only as a
# literal in this file. It can now be supplied through the RINET_DB_PASSWORD
# environment variable; the literal is kept purely as a backward-compatible
# fallback and should be rotated and removed once deployments set the variable.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
}

# User-Agent sent with every request made through the shared session.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# One shared HTTP session so all requests carry the same headers.
sess = requests.Session()
sess.headers.update({"User-Agent": UA})
|
||
|
||
# Leagues to scrape. Each LIGE entry is a tuple (url, naziv, sezona, razina).
# The page URL is derived from the league slug and the season label: the
# "sezona" query parameter is the season with "/" replaced by "-".
_LIGE_BASE_URL = "https://hrvatski-bocarski-savez.hr/lige"

# (slug, naziv, sezona, razina)
_LIGE_SPEC = (
    ("i-hbl", "I HBL", "2025/2026", "1.HBL"),
    ("ii-hbl-sjever", "II HBL sjever", "2025/2026", "2.HBL"),
    ("ii-hbl-jug", "II HBL jug", "2025/2026", "2.HBL"),
    ("iii-hbl-istra-primorje", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
    ("iii-hbl-zagreb-slavonija", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
    ("iii-hbl-srednja-dalmacija", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
    ("iii-hbl-dubrovnik-neretva", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
    ("hrvatska-zenska-bocarska-liga", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
    ("juniorska-liga", "Juniorska liga", "2025/2026", "Juniorska"),
    ("kadetska-liga", "Kadetska liga", "2025/2026", "Kadetska"),
    ("hrvatska-raffa-liga", "Hrvatska Raffa liga", "2026", "HRL"),
    ("hrvatska-petanque-liga", "Hrvatska Petanque liga", "2026", "HPL"),
)

LIGE = [
    (
        f"{_LIGE_BASE_URL}/{slug}/?sezona={sezona.replace('/', '-')}",
        naziv,
        sezona,
        razina,
    )
    for slug, naziv, sezona, razina in _LIGE_SPEC
]
|
||
|
||
# PGŽ club keywords: lower-case name fragments that are matched as substrings
# against lower-cased club names to mark a league as relevant.
# Entries are unique. Both "cavle" and "čavle" are listed because the
# diacritic-free spelling alone can never match a lower-cased "Čavle".
PGZ_KEYWORDS = [
    "pula", "istra", "poreč", "rijeka", "kastav", "krimeja", "podhum",
    "kukuljanovo", "zagon",
    "gornji kraj", "ladvić", "sveti rok", "klana", "sveti jakov", "jadranovo",
    "krk", "cres",
    "lošinj", "opatija", "lovran", "brod-moravice", "brod moravice", "skrad",
    "mošćenice",
    "mrkopalj", "fužine", "hreljin", "bakar", "kostrena", "cavle", "čavle",
    "drenova", "srdoči",
    "vargon", "sušak", "novi vinodolski", "crikvenica", "selce", "grižane",
    "baška",
    "punat", "omišalj", "malinska", "vrbnik", "mali lošinj", "ščelo", "lokve",
    "delnice", "šparta", "mošćenička draga", "kraljevica", "trsat", "plase",
    "matulji",
]
|
||
|
||
def fetch(url):
    """Download *url* through the shared session and return the body text.

    Returns the response text when the server answers with HTTP 200.
    Returns ``None`` for any other status code, or when the request itself
    fails (connection error, timeout, invalid URL, ...). Request failures
    are printed so the reason is not lost; interrupts such as Ctrl-C are no
    longer swallowed.
    """
    try:
        resp = sess.get(url, timeout=20)
    except requests.RequestException as exc:
        # Best-effort scraping: report the problem and let the caller skip.
        print(f" fetch error: {exc}")
        return None
    if resp.status_code != 200:
        return None
    return resp.text
|
||
|
||
# Patterns are compiled once; they are reused for every row and cell.
_ROW_RE = re.compile(r'<tr[^>]*>(.*?)</tr>', re.DOTALL)
_CELL_RE = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')


def _cell_text(fragment):
    """Return a cell's visible text: tags removed, entities decoded, trimmed."""
    # Tags are stripped before decoding so that an escaped "&lt;b&gt;" is kept
    # as literal text instead of being mistaken for markup.
    return h.unescape(_TAG_RE.sub('', fragment)).strip()


def _row_cells(row_html):
    """Return the cleaned text of every <td>/<th> cell in one table row."""
    return [_cell_text(c) for c in _CELL_RE.findall(row_html)]


def _unsigned_cell(cells, idx):
    """Return cells[idx] as an int, or 0 when the cell is missing or not all digits."""
    if idx < len(cells) and cells[idx].isdigit():
        return int(cells[idx])
    return 0


def _difference_cell(cells, idx):
    """Return cells[idx] as a signed int ("+5", "-3", U+2212 minus), or 0 if unusable."""
    if idx >= len(cells):
        return 0
    text = cells[idx].replace('+', '').replace('\u2212', '-').replace(',', '')
    try:
        return int(text)
    except ValueError:
        return 0


def parse_table(html_text):
    """Parse an HBS league standings table out of an HTML page.

    Every ``<tr>`` in *html_text* is considered. The first one is treated as
    the header and must contain a cell whose text includes "oredak" or "Poz";
    otherwise an empty list is returned. Each following row is read by column
    position: 0 position, 1 club name, 2 played, 3 wins, 4 draws, 5 losses,
    6 points for, 7 points against, 8 difference, 9 points.

    Cell text has markup removed and HTML entities decoded (so ``&amp;`` in a
    club name becomes ``&`` and ``&nbsp;`` padding no longer hides a number).
    Rows with fewer than five cells, or whose position is not an integer, are
    skipped. Missing or non-numeric numeric cells become 0.

    Returns a list of dicts with keys ``poz``, ``klub``, ``odigrano``,
    ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl``, ``bod``.
    """
    rows = _ROW_RE.findall(html_text)
    if not rows:
        return []

    # First row is the header; require a position column.
    headers = _row_cells(rows[0])
    if not any("oredak" in title or "Poz" in title for title in headers):
        return []

    out = []
    for row_html in rows[1:]:
        cells = _row_cells(row_html)
        if len(cells) < 5:
            continue
        try:
            # The points cell may carry a leading "-"; anything else
            # non-numeric counts as 0.
            bod_text = cells[9] if len(cells) > 9 else ""
            record = {
                "poz": int(cells[0]),
                "klub": cells[1],
                "odigrano": _unsigned_cell(cells, 2),
                "pobjede": _unsigned_cell(cells, 3),
                "nerij": _unsigned_cell(cells, 4),
                "porazi": _unsigned_cell(cells, 5),
                "gz": _unsigned_cell(cells, 6),
                "gp": _unsigned_cell(cells, 7),
                "razl": _difference_cell(cells, 8),
                "bod": int(bod_text) if bod_text.lstrip('-').isdigit() else 0,
            }
        except (ValueError, IndexError):
            # Not a standings row (e.g. non-numeric position): skip it.
            continue
        out.append(record)
    return out
|
||
|
||
def find_klub_id(cur, naziv):
    """Find a club id in pgz_sport.klubovi by progressively looser name matching.

    Lookups are tried in order and the first hit wins:
      1. case-insensitive exact match on the name;
      2. substring match on "<prefix> <name>" for a few common club prefixes;
      3. substring match on the bare name.
    Row id 4426 is excluded from every lookup (the reason is not visible in
    this file).

    Args:
        cur: an open DB-API cursor (``execute`` / ``fetchone``).
        naziv: club name as scraped.

    Returns:
        The matching id, or ``None`` when nothing matches or *naziv* is
        empty/blank. A blank name is rejected up front because the substring
        pattern would degenerate to "%%" and match an arbitrary club.
    """
    if not naziv or not naziv.strip():
        return None

    exact_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1"
    like_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1"

    # Escape LIKE metacharacters so "%" / "_" in a club name match literally
    # (backslash is PostgreSQL's default LIKE escape character).
    needle = naziv.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    attempts = [(exact_sql, naziv)]
    # Try with BK/MK/ŽBK-style prefixes.
    attempts += [
        (like_sql, f"%{prefix} {needle}%")
        for prefix in ("BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub")
    ]
    # Last try: name contained anywhere.
    attempts.append((like_sql, f"%{needle}%"))

    for sql, param in attempts:
        cur.execute(sql, (param,))
        hit = cur.fetchone()
        if hit:
            return hit[0]
    return None
|
||
|
||
# === MAIN ===
# Scrape every league in LIGE and refresh its standings in the database.
#
# Transactions are explicit (autocommit off): each league's upsert + DELETE +
# INSERTs run inside one `with conn:` block, which commits on success and
# rolls back if anything raises, so a failure can no longer leave a league's
# table emptied or half-filled. The connection is always closed on exit.
conn = psycopg2.connect(**DB)
conn.autocommit = False
cr = conn.cursor()

total_natj = 0
total_redova = 0
total_pgz_klub = 0

try:
    # Find the bowling federation's savez_id (NULL is stored if none matches).
    with conn:
        cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
        r = cr.fetchone()
    savez_id = r[0] if r else None
    print(f"savez_id (HBS): {savez_id}")

    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue

        rows = parse_table(body)
        if not rows:
            print(" no rows parsed")
            continue

        # A league is PGZ-relevant when any club name contains a keyword.
        is_pgz = any(
            kw in row["klub"].lower()
            for row in rows
            for kw in PGZ_KEYWORDS
        )

        # Stable key: the URL path after /lige/ plus the season label.
        liga_slug = url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/", "_")
        external_id = liga_slug + "_" + sezona.replace("/", "_")

        with conn:
            # Upsert the competition and get its id.
            cr.execute("""
                INSERT INTO pgz_sport.natjecanja
                (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
                VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
                ON CONFLICT (source, external_id) DO UPDATE SET
                updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
                RETURNING id
            """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
            natj_id = cr.fetchone()[0]
            total_natj += 1
            print(f" natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")

            # Replace this competition's standings: clear old rows, insert new.
            cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))

            for row in rows:
                kid = find_klub_id(cr, row["klub"])
                if kid:
                    total_pgz_klub += 1
                cr.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                    (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
                    gol_z, gol_p, gol_razlika, bodovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                      row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
                total_redova += 1

        # Pause between leagues to stay polite to the remote site.
        time.sleep(0.6)

    print(f"\n=== TOTAL ===")
    print(f" natjecanja: {total_natj}")
    print(f" tablice rows: {total_redova}")
    print(f" matched klub_id: {total_pgz_klub}")

    # Verify
    with conn:
        cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
        print(f" total HBS natjecanja in DB: {cr.fetchone()[0]}")
        cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
        print(f" total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    conn.close()
|