#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
import os
import requests, re, html as h_unesc, psycopg2
# Connection parameters passed to psycopg2.connect(). The password comes from
# the environment; a missing DB_PASSWORD raises KeyError right here.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# User-Agent header value sent with every request of the shared session.
UA = "RiNET-Civic/1.0"

# One HTTP session reused by fetch() for all downloads.
s = requests.Session()
s.headers["User-Agent"] = UA
def fetch(url):
    """Download *url* through the shared session and return the body text.

    Args:
        url: Address to request with HTTP GET (15 second timeout).

    Returns:
        The response text when the server answers with status 200; ``None``
        for any other status code or when the request itself fails.
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        # Best-effort fetch: a failed request is reported to the caller as
        # None. Only request errors are swallowed, so KeyboardInterrupt,
        # SystemExit and programming errors still propagate.
        return None
    return r.text if r.status_code == 200 else None
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose first row contains a marker.

    Args:
        html_text: HTML document to search; ``None`` or an empty string
            yields ``None``.
        header_marker: Substring that must occur in at least one cell of the
            table's first row, after inline tags are removed.

    Returns:
        A list holding the inner HTML of every ``<tr>`` of the first matching
        table (first row included), or ``None`` when no table matches.
    """
    # NOTE(review): the regex literals of this function were corrupted in the
    # reviewed copy (the tag parts of the patterns were missing). The patterns
    # below are a reconstruction -- confirm against the deployed version.
    flags = re.DOTALL | re.IGNORECASE
    # \b keeps <tr>/<th> from matching longer tag names such as <thead>.
    for table in re.findall(r'<table\b.*?</table>', html_text or "", flags):
        rows = re.findall(r'<tr\b[^>]*>(.*?)</tr>', table, flags)
        if not rows:
            continue
        header_cells = re.findall(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', rows[0], flags)
        header_texts = [re.sub(r'<[^>]+>', '', cell).strip() for cell in header_cells]
        if any(header_marker in text for text in header_texts):
            return rows
    return None
def parse_standings_rows(rows):
    """Convert the HTML rows of a standings table into dictionaries.

    Args:
        rows: Inner HTML of the table rows; the first entry is treated as the
            header and skipped.

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``odigrano``,
        ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and
        ``bod``, taken from the first ten cells of each row in that order.
        ``klub`` is a string, every other value an int. Rows with fewer than
        ten cells or with a non-numeric value in a numeric column are skipped.
    """
    # NOTE(review): the cell pattern was corrupted in the reviewed copy (its
    # tag parts were missing); this is a reconstruction -- confirm against the
    # deployed version.
    flags = re.DOTALL | re.IGNORECASE
    out = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', row, flags)
        # Strip markup first and decode entities afterwards, so that escaped
        # text such as "&lt;U19&gt;" is kept instead of being removed as a tag.
        cleaned = [h_unesc.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        # Ten columns are read below (indices 0-9).
        if len(cleaned) < 10:
            continue
        position, klub, *stats = cleaned[:10]
        try:
            poz = int(position.rstrip('.'))
            odigrano, pobjede, nerij, porazi, gz, gp = (int(v) for v in stats[:6])
            # Goal difference may carry an explicit '+' or a Unicode minus sign.
            razl = int(stats[6].replace('+', '').replace('−', '-'))
            bod = int(stats[7])
        except ValueError:
            # Not a data row (e.g. a separator or a repeated header).
            continue
        out.append({"poz": poz, "klub": klub, "odigrano": odigrano, "pobjede": pobjede,
                    "nerij": nerij, "porazi": porazi, "gz": gz, "gp": gp,
                    "razl": razl, "bod": bod})
    return out
# === Sources ===
# One tuple per competition, unpacked by the scrape loop as
# (url, naziv, razina, ext_id, sezona); ext_id is matched against
# pgz_sport.natjecanja.external_id.
LIGE = [
    (
        "https://hnl.hr/",
        "SuperSport HNL",
        "1.HNL",
        "supersport_hnl_2025_2026",
        "2025/2026",
    ),
]

# Module-level connection and cursor shared by find_klub() and the scrape
# loop. Autocommit is on, so every statement is committed as it executes.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cr = conn.cursor()
def find_klub(naziv):
    """Look up the id of the ``pgz_sport.klubovi`` row matching a club name.

    Patterns are tried in this order against rows whose ``sport`` contains
    "nogomet": ``HNK <naziv>``, ``NK <naziv>``, the bare name, and finally any
    name containing *naziv*. If none of them hits, one more substring search
    is made without the sport filter. Matching is case-insensitive.

    Args:
        naziv: Club name as scraped from the standings table.

    Returns:
        The id of the first matching row, or ``None`` when nothing matches or
        *naziv* is empty.
    """
    naziv = (naziv or "").strip()
    if not naziv:
        # An empty name would turn the substring pattern into '%%', which
        # matches every club and would link the row to an arbitrary one.
        return None
    # NOTE(review): id 4426 is excluded by both queries; the reason is not
    # visible in this file.
    # HNK / NK prefixes first, then the exact name, then a substring match.
    for pattern in (f"HNK {naziv}", f"NK {naziv}", naziv, f"%{naziv}%"):
        cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1", (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    # Last resort: substring match regardless of sport.
    cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{naziv}%",))
    hit = cr.fetchone()
    return hit[0] if hit else None
# Scrape every configured league and replace its stored standings, then print
# the rows that mention Rijeka as a quick sanity check.
# NOTE(review): the connection runs in autocommit mode, so the DELETE and the
# INSERTs below are not one atomic unit; wrap them in a transaction if the
# connection setup allows it.
_INSERT_TABLICA_SQL = (
    "INSERT INTO pgz_sport.natjecanja_tablice\n"
    "(natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,\n"
    "porazi, gol_z, gol_p, gol_razlika, bodovi)\n"
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)

try:
    for url, naziv, razina, ext_id, sezona in LIGE:
        print(f"=== {naziv} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = find_table_with_header(body, "Klub")
        if not rows:
            print(" no table found")
            continue
        parsed = parse_standings_rows(rows)
        print(f" {len(parsed)} rows parsed")
        if not parsed:
            # A layout change on the site must not wipe the stored standings.
            print(" nothing parsed, keeping existing rows")
            continue
        cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
        found = cr.fetchone()
        if found is None:
            print(f" natjecanje not found for external_id {ext_id}")
            continue
        nid = found[0]
        # Resolve all club ids before deleting, so a failing lookup cannot
        # leave the competition without any standings rows.
        resolved = [(find_klub(r["klub"]), r) for r in parsed]
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))
        cr.executemany(
            _INSERT_TABLICA_SQL,
            [(nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
              r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"])
             for kid, r in resolved],
        )
        matched = sum(1 for kid, _ in resolved if kid)
        print(f" matched klub_id: {matched}/{len(parsed)}")
    # Verify Rijeka
    cr.execute(
        "SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice\n"
        "WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')"
    )
    for r in cr.fetchall():
        print(f" {r}")
finally:
    # Release the connection even when a query or the scrape raises.
    conn.close()