#!/usr/bin/env python3
"""
HKS-CBF scraper - parses Genius Sports embed standings for HR košarka leagues.

Path:   /opt/pgz-sport/scrapers/hks_scraper.py
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
Output: pgz_sport.natjecanja_tablice + pgz_sport.klubovi (matching)
"""
import html as ihtml
import json
import os
import re

# NOTE: `requests` and `psycopg2` are imported inside main() so that
# parse_standings() can be imported (and unit-tested) on a machine that has
# neither the HTTP client nor the PostgreSQL driver installed.

# Connection settings; every value can be overridden through the environment.
# NOTE(review): the original literal listed `port` twice (6432 and 5432), which
# is a SyntaxError in a dict(...) call. 6432 (the first value, conventionally a
# PgBouncer port) is kept as the default - confirm which one is intended.
# SECURITY: the fallback password is the credential that was hard-coded in this
# file. It should be rotated and supplied only via PGZ_DB_PASSWORD.
DB = dict(
    host=os.environ.get("PGZ_DB_HOST", "10.10.0.2"),
    port=int(os.environ.get("PGZ_DB_PORT", "6432")),
    dbname=os.environ.get("PGZ_DB_NAME", "rinet_v3"),
    user=os.environ.get("PGZ_DB_USER", "rinet"),
    password=os.environ.get("PGZ_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA, "Accept": "application/json", "Referer": "https://www.hks-cbf.hr/"}

# Competition ID lookup - 2025/26 season
COMPS = [
    {"natj": "Supersport Premijer Liga (M) 2025/26", "razina": "Premijer M", "comp_id": 42186,
     "spol": "M", "url": "https://www.hks-cbf.hr/supersport-premijer/"},
    {"natj": "Supersport Premijer Liga (Ž) 2025/26", "razina": "Premijer Ž", "comp_id": 42187,
     "spol": "Ž", "url": "https://www.hks-cbf.hr/premijer-zenska-liga/"},
    {"natj": "1.Muška liga 2025/26", "razina": "1.M liga", "comp_id": 42259,
     "spol": "M", "url": "https://www.hks-cbf.hr/prva-muska-liga/"},
]

# Lower-case substrings of `klubovi.grad` that mark a club as belonging to PGŽ
# even when its `region` column is not set to 'PGŽ'.
PGZ_GRADOVI = (
    'rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab', 'lošinj',
    'losinj', 'vrbnik', 'novi vinodolski', 'čavle', 'cavle', 'kraljevica',
    'kostrena', 'kastav',
)

# HTML patterns, compiled once. The `\b` keeps `<tr`/`<td`/`<th` from matching
# longer tag names such as `<thead>` or `<track>`.
_TABLE_RE = re.compile(r'<table\b[^>]*>(.+?)</table>', re.DOTALL)
_ROW_RE = re.compile(r'<tr\b[^>]*>(.+?)</tr>', re.DOTALL)
_CELL_RE = re.compile(r'<t[dh]\b[^>]*>(.*?)</t[dh]>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
# Team cell is "name + abbreviation" glued together, e.g. "KK ZadarZAD":
# the trailing 2-4 upper-case letters are taken as the abbreviation.
_TEAM_RE = re.compile(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$')

# pos, logo, team, GP, points, W, L, scored, conceded, difference
_MIN_CELLS = 10


def _to_int(text):
    """Convert a standings cell such as '2,669', '+460' or '−12' to int.

    Raises:
        ValueError: if the cleaned text is not an integer.
    """
    cleaned = text.replace(',', '').replace('+', '').replace('\u2212', '-')
    return int(cleaned.strip())


def parse_standings(html: str) -> list:
    """Parse Genius Sports standings HTML table → list of rows.

    Only the first <table> in ``html`` is read and its first <tr> is treated
    as the header. Rows that have too few cells, a non-numeric position, an
    empty team name or a non-numeric statistic are skipped.

    Args:
        html: HTML fragment taken from the embed's JSON ``html`` field.

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``abbrev``, ``gp``,
        ``bod``, ``pob``, ``por``, ``for_pts``, ``ag_pts``, ``gd`` and ``ner``
        (always 0 - basketball has no draws). Empty when nothing is parsable.
    """
    if not html:
        return []
    table = _TABLE_RE.search(html)
    if table is None:
        return []
    rows_html = _ROW_RE.findall(table.group(1))
    if len(rows_html) < 2:
        return []

    standings = []
    for row in rows_html[1:]:  # skip header
        cells = _CELL_RE.findall(row)
        if len(cells) < _MIN_CELLS:
            continue
        # e.g. ['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']
        clean = [ihtml.unescape(_TAG_RE.sub('', c)).strip() for c in cells]

        team_raw = clean[2]
        m = _TEAM_RE.match(team_raw)
        if m:
            klub, abbrev = m.group(1).strip(), m.group(2)
        else:
            klub, abbrev = team_raw, None
        if not klub:
            # An empty name would turn the club lookup into LIKE '%%',
            # which matches every club in the database.
            continue

        try:
            standings.append({
                "poz": int(clean[0]),
                "klub": klub,
                "abbrev": abbrev,
                "gp": _to_int(clean[3]),
                "bod": _to_int(clean[4]),
                "pob": _to_int(clean[5]),
                "por": _to_int(clean[6]),
                "for_pts": _to_int(clean[7]),
                "ag_pts": _to_int(clean[8]),
                "gd": _to_int(clean[9]),
                "ner": 0,  # basketball has no draws
            })
        except ValueError:
            continue
    return standings


def _is_pgz_town(grad):
    """Return True when ``grad`` contains one of the PGZ_GRADOVI substrings."""
    if not grad:
        return False
    lowered = grad.lower()
    return any(g in lowered for g in PGZ_GRADOVI)


def _store_standings(cu, comp, rows):
    """Replace the stored table of one competition with ``rows``.

    Runs on the caller's cursor and does not commit; the caller owns the
    transaction so the DELETE and the INSERTs succeed or fail together.

    Args:
        cu: open psycopg2 cursor.
        comp: one entry of COMPS.
        rows: output of parse_standings() for that competition.

    Returns:
        ``(inserted, pgz_names)`` - number of rows written and the set of
        club names that were matched to a PGŽ club.
    """
    # Get/create natjecanje
    cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
    found = cu.fetchone()
    if found:
        natj_id = found[0]
    else:
        cu.execute(
            """INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
               VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
               RETURNING id""",
            (comp['natj'], comp['razina'], comp['url']))
        natj_id = cu.fetchone()[0]
        print(f" Created natjecanje id={natj_id}")

    # Clear old rows
    cu.execute(
        "DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'",
        (natj_id,))

    pgz_names = set()
    for row in rows:
        # Match klub; region/grad come back in the same query so that no
        # second lookup per club is needed.
        cu.execute(
            """SELECT id, region, grad FROM pgz_sport.klubovi
               WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
               ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
                        CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                        CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                        id ASC
               LIMIT 1""",
            (row['klub'], f"%{row['klub']}%"))
        hit = cu.fetchone()
        klub_id = None
        if hit:
            klub_id, region, grad = hit
            if region == 'PGŽ' or _is_pgz_town(grad):
                pgz_names.add(row['klub'])

        cu.execute(
            """INSERT INTO pgz_sport.natjecanja_tablice
               (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at,
                extra_data)
               VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s, 'hks_genius', %s, now(),
                       %s::jsonb)""",
            (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'], row['por'],
             row['for_pts'], row['ag_pts'], row['gd'], row['bod'], comp['url'],
             json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
    return len(rows), pgz_names


def _print_report(cu):
    """Print per-league row counts and the PGŽ clubs currently stored."""
    # Stats
    cu.execute(
        """SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
           FROM pgz_sport.natjecanja n
           LEFT JOIN pgz_sport.natjecanja_tablice t
                  ON n.id=t.natjecanje_id AND t.source='hks_genius'
           WHERE n.source='hks_genius'
           GROUP BY n.id, n.naziv
           ORDER BY n.id""")
    print("\n=== HKS lige stats ===")
    for rec in cu.fetchall():
        print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

    # PGŽ klubovi
    cu.execute(
        """SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika,
                  k.id, k.naziv AS db_naziv, k.aktivan
           FROM pgz_sport.natjecanja_tablice t
           JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
           LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
           WHERE t.source='hks_genius' AND k.region='PGŽ'
           ORDER BY n.naziv, t.pozicija""")
    print("\n=== PGŽ klubovi u HKS ===")
    for rec in cu.fetchall():
        print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b GR{rec[4]:+} "
              f"-> {rec[5]} '{rec[6]}'")


def main():
    """Fetch every competition in COMPS and refresh its table in the database.

    A competition whose fetch fails or yields no rows is skipped and its
    existing table is left untouched. Each competition is written in its own
    transaction. Database errors are not caught: the current transaction is
    rolled back, the connection is closed and the exception propagates.
    """
    import psycopg2
    import requests

    conn = psycopg2.connect(**DB)
    total_inserted = 0
    pgz_klubovi_seen = set()
    try:
        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            api_url = (
                "https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/"
                f"{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
            )
            try:
                resp = requests.get(api_url, headers=HDR, timeout=20)
                resp.raise_for_status()
                page = resp.json().get('html') or ''
            except (requests.RequestException, ValueError, AttributeError) as e:
                # ValueError: body is not JSON; AttributeError: JSON is not an object.
                print(f" ERR fetch: {e}")
                continue

            rows = parse_standings(page)
            print(f" Parsed {len(rows)} klubova")
            if not rows:
                # Do not wipe a good table because of an empty/changed page.
                print(" SKIP: nothing parsed, existing table kept")
                continue

            # `with conn` commits on success and rolls back on any exception,
            # so the DELETE + INSERTs for one competition are atomic.
            with conn, conn.cursor() as cu:
                inserted, pgz_names = _store_standings(cu, comp, rows)
            total_inserted += inserted
            pgz_klubovi_seen |= pgz_names

        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")

        with conn, conn.cursor() as cu:
            _print_report(cu)
            cu.execute(
                """INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
                   VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
                (json.dumps({"inserted": total_inserted, "comps": len(COMPS),
                             "pgz_seen": sorted(pgz_klubovi_seen)}),))
    finally:
        conn.close()


if __name__ == "__main__":
    main()