#!/usr/bin/env python3
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
import html as h_unesc
import re

# NOTE(review): database credentials are hard-coded in source; consider
# loading them from the environment or a secret store instead.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet",
          password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0"

# Runtime handles. They are created lazily (``s`` by fetch(), ``conn``/``cr``
# by main()) so that importing this module performs no network or DB work.
s = None
conn = None
cr = None

# HTML patterns, compiled once. ``\b`` keeps ``<th`` from matching ``<thead>``.
_TABLE_RE = re.compile(r'<table\b[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)
_ROW_RE = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL | re.IGNORECASE)
_CELL_RE = re.compile(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r'<[^>]+>')


def fetch(url):
    """Return the body of *url* as text, or None on any HTTP/network failure.

    Only a 200 response counts as success. The shared session (with the
    project User-Agent) is created on first use.
    """
    global s
    # Third-party import kept local so the parsing helpers remain importable
    # on machines without ``requests`` installed.
    import requests

    if s is None:
        s = requests.Session()
        s.headers.update({"User-Agent": UA})
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        # Best-effort fetch: the caller reports the failure and moves on.
        return None
    return r.text if r.status_code == 200 else None


def find_table_with_header(html_text, header_marker="Klub"):
    """Find table that contains 'Klub' in header.

    Returns the list of raw ``<tr>`` inner-HTML strings (header row first) of
    the first table whose first row has a cell containing *header_marker*,
    or None when no such table exists.
    """
    for table in _TABLE_RE.findall(html_text or ""):
        rows = _ROW_RE.findall(table)
        if not rows:
            continue
        header_cells = [_TAG_RE.sub('', c).strip() for c in _CELL_RE.findall(rows[0])]
        if any(header_marker in cell for cell in header_cells):
            return rows
    return None


def parse_standings_rows(rows):
    """Parse rows; expect first is header.

    Each data row must provide at least ten cells, in this order: position,
    club, played, wins, draws, losses, goals for, goals against, goal
    difference, points. Rows that are too short or contain non-numeric
    values are skipped. Returns a list of dicts.
    """
    out = []
    if not rows:
        return out
    for row in rows[1:]:
        # Strip markup first, then decode entities, so an escaped "&lt;x&gt;"
        # inside a club name is not mistaken for a tag and removed.
        cleaned = [h_unesc.unescape(_TAG_RE.sub('', c)).strip()
                   for c in _CELL_RE.findall(row)]
        if len(cleaned) < 10:  # indices 0..9 are read below
            continue
        try:
            out.append({
                "poz": int(cleaned[0].rstrip('.')),
                "klub": cleaned[1],
                "odigrano": int(cleaned[2]),
                "pobjede": int(cleaned[3]),
                "nerij": int(cleaned[4]),
                "porazi": int(cleaned[5]),
                "gz": int(cleaned[6]),
                "gp": int(cleaned[7]),
                # Sites often render the sign as U+2212 (typographic minus).
                "razl": int(cleaned[8].replace('+', '').replace('−', '-')),
                "bod": int(cleaned[9]),
            })
        except ValueError:
            continue
    return out


# === Sources ===
# (url, display name, level, external_id in pgz_sport.natjecanja, season)
LIGE = [
    ("https://hnl.hr/", "SuperSport HNL", "1.HNL", "supersport_hnl_2025_2026", "2025/2026"),
]


def find_klub(naziv):
    """Return the id of the club in pgz_sport.klubovi matching *naziv*, or None.

    Tries, restricted to football clubs, the name with "HNK " / "NK "
    prefixes, the bare name, then a substring match; finally a substring
    match across all sports. Uses the module-level cursor ``cr``, which
    main() sets up.
    """
    if not naziv or not naziv.strip():
        # An empty name would turn the substring pattern into '%%' and match
        # an arbitrary club.
        return None
    # NOTE(review): id 4426 is excluded from every lookup; the reason is not
    # visible here — confirm it is still required.
    # HNK / NK prefixes
    for q in [f"HNK {naziv}", f"NK {naziv}", naziv, f"%{naziv}%"]:
        cr.execute(
            "SELECT id FROM pgz_sport.klubovi "
            "WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 "
            "AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1",
            (q,),
        )
        r = cr.fetchone()
        if r:
            return r[0]
    cr.execute(
        "SELECT id FROM pgz_sport.klubovi "
        "WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1",
        (f"%{naziv}%",),
    )
    r = cr.fetchone()
    return r[0] if r else None


def main():
    """Scrape every league in LIGE and replace its rows in natjecanja_tablice."""
    global conn, cr
    import psycopg2  # local import: see the note in fetch()

    conn = psycopg2.connect(**DB)
    try:
        cr = conn.cursor()
        for url, naziv, _razina, ext_id, _sezona in LIGE:
            print(f"=== {naziv} ===")
            body = fetch(url)
            if not body:
                print(" fetch failed")
                continue
            rows = find_table_with_header(body, "Klub")
            if not rows:
                print(" no table found")
                continue
            parsed = parse_standings_rows(rows)
            print(f" {len(parsed)} rows parsed")
            if not parsed:
                # Do not wipe the stored standings when the page yields nothing.
                print(" nothing parsed, keeping existing rows")
                continue

            # One transaction per league: the DELETE and the INSERTs are
            # committed together, or rolled back together on error.
            with conn:
                cr.execute(
                    "SELECT id FROM pgz_sport.natjecanja "
                    "WHERE source='hns_manual' AND external_id=%s",
                    (ext_id,),
                )
                found = cr.fetchone()
                if found is None:
                    print(f" natjecanje not found: {ext_id}")
                    continue
                nid = found[0]
                cr.execute(
                    "DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s",
                    (nid,),
                )
                matched = 0
                for r in parsed:
                    kid = find_klub(r["klub"])
                    if kid:
                        matched += 1
                    cr.execute(
                        """INSERT INTO pgz_sport.natjecanja_tablice
                           (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                            nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi)
                           VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                        (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                         r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]),
                    )
            print(f" matched klub_id: {matched}/{len(parsed)}")

        # Verify Rijeka
        cr.execute("""SELECT pozicija, klub_naziv, bodovi
                      FROM pgz_sport.natjecanja_tablice
                      WHERE klub_naziv ILIKE '%Rijeka%'
                        AND natjecanje_id IN
                            (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
        for r in cr.fetchall():
            print(f" {r}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()