#!/usr/bin/env python3 from dotenv import load_dotenv load_dotenv('/opt/rinet-gpu/.env.master') # auto-added by patch_scrapers_with_dotenv.sh """ ═══════════════════════════════════════════════════════════════ hks_cbf_games_scraper.py — HKS-CBF games (schedule + scores) Version: 1.0.0 | 2026-05-11 (S1 task — games half) Author: Damir Radulić Target: /opt/pgz-sport/scrapers/hks_cbf_games_scraper.py Source: hosted.dcd.shared.geniussports.com/embednf/HKS/en /competition/{id}/schedule (JSON-wrapped HTML; parsed) Run modes: python3 hks_cbf_games_scraper.py # all 3 comps, --insert python3 hks_cbf_games_scraper.py --comp 42186 # one comp python3 hks_cbf_games_scraper.py --dry # no DB writes Schema: pgz_sport.hks_cbf_games (already deployed): hks_game_id TEXT, competition_id TEXT, datum DATE, home_team_id/name, away_team_id/name, home_score/away_score, status TEXT, source_url TEXT, raw_data JSONB ═══════════════════════════════════════════════════════════════ """ from __future__ import annotations import argparse import json import os import re import sys import urllib.request from datetime import datetime from typing import Iterator, Optional import psycopg2 import psycopg2.extras GS_BASE = "https://hosted.dcd.shared.geniussports.com/embednf/HKS/en" UA = "RiNET-Civic/1.0 (https://rinet.one) HKS-CBF games sync" COMPS = [ {"id": "42186", "label": "Supersport Premijer Liga (M) 2025/26", "spol": "M"}, {"id": "42187", "label": "Supersport Premijer Liga (Ž) 2025/26", "spol": "Ž"}, {"id": "42259", "label": "1. Muška liga 2025/26", "spol": "M"}, ] DB = dict( host=os.environ.get("PG_HOST", "10.10.0.2"), port=int(os.environ.get("PG_PORT", "6432")), dbname=os.environ.get("PG_DB", "rinet_v3"), user=os.environ.get("PG_USER", "rinet"), password=os.environ["DB_PASSWORD"], ) def http_get(url: str, timeout: int = 20) -> str: req = urllib.request.Request(url, headers={"User-Agent": UA, "Accept": "application/json"}) with urllib.request.urlopen(req, timeout=timeout) as r: return r.read().decode("utf-8") # Parsers — kept small + tested against schedule HTML structure observed today. RE_MATCH = re.compile( r'
(.*?)(?=
([^<]+)", re.S) RE_VENUE = re.compile(r'class="venuename"[^>]*>([^<]+)<', re.S) RE_TEAM_BLOCK = re.compile( r'
.*?team/(\d+)\?.*?([^<]+)<', re.S, ) RE_SCORE = re.compile( r'
\s*
(\d+)<', re.S, ) def parse_us_date(s: str) -> Optional[str]: """'Sep 26, 2025, 7:00 PM' → '2025-09-26'.""" s = (s or "").strip() for fmt in ("%b %d, %Y, %I:%M %p", "%b %d, %Y"): try: return datetime.strptime(s.split(", ", 2)[0] + ", " + s.split(", ", 2)[1], fmt[:11]).strftime("%Y-%m-%d") except Exception: continue # fallback regex m = re.search(r"(\w+)\s+(\d{1,2}),\s+(\d{4})", s) if not m: return None mon = {"Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5,"Jun":6,"Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12}.get(m.group(1)[:3]) if not mon: return None return f"{int(m.group(3)):04d}-{mon:02d}-{int(m.group(2)):02d}" def iter_comp_games(comp_id: str) -> Iterator[dict]: payload = json.loads(http_get(f"{GS_BASE}/competition/{comp_id}/schedule")) html = payload.get("html", "") for status, mid, body in RE_MATCH.findall(html): date_m = RE_DATE.search(body) venue_m = RE_VENUE.search(body) teams = RE_TEAM_BLOCK.findall(body) scores = RE_SCORE.findall(body) home_team = next((t for t in teams if t[0] == "home"), None) away_team = next((t for t in teams if t[0] == "away"), None) home_score = next((int(s) for side, s in scores if side == "home"), None) away_score = next((int(s) for side, s in scores if side == "away"), None) yield { "hks_game_id": mid, "competition_id": comp_id, "datum": parse_us_date(date_m.group(1) if date_m else ""), "home_team_id": home_team[1] if home_team else None, "home_team_name": home_team[2].strip() if home_team else None, "away_team_id": away_team[1] if away_team else None, "away_team_name": away_team[2].strip() if away_team else None, "home_score": home_score, "away_score": away_score, "status": status, "source_url": f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fmatch%2F{mid}", "venue": venue_m.group(1).strip() if venue_m else None, } def upsert(cur, row: dict) -> bool: cur.execute( """ INSERT INTO pgz_sport.hks_cbf_games (hks_game_id, competition_id, datum, home_team_id, home_team_name, away_team_id, away_team_name, home_score, away_score, status, source_url, raw_data) VALUES (%(hks_game_id)s, %(competition_id)s, %(datum)s, %(home_team_id)s, %(home_team_name)s, %(away_team_id)s, %(away_team_name)s, %(home_score)s, %(away_score)s, %(status)s, %(source_url)s, %(raw)s::jsonb) ON CONFLICT DO NOTHING """, {**row, "raw": json.dumps({"venue": row.get("venue")}, ensure_ascii=False)}, ) return cur.rowcount > 0 def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--comp", help="comp id (default: all 3 in COMPS)") ap.add_argument("--dry", action="store_true", help="no DB writes") args = ap.parse_args() comps = [c for c in COMPS if (args.comp is None or c["id"] == args.comp)] if not comps: print(f"unknown comp {args.comp}", file=sys.stderr); return 2 conn = None if args.dry else psycopg2.connect(**DB) cur = conn.cursor() if conn else None n_total = 0; n_new = 0 for c in comps: n_c = 0; n_new_c = 0 for game in iter_comp_games(c["id"]): n_c += 1 if cur: if upsert(cur, game): n_new_c += 1 print(f" comp {c['id']} ({c['label']}): {n_c} games" + (f", inserted {n_new_c} new" if cur else " (dry)")) n_total += n_c; n_new += n_new_c if conn: conn.commit(); cur.close(); conn.close() print(f"\ntotal: {n_total} games seen{' / ' + str(n_new) + ' new' if conn else ''}") return 0 if __name__ == "__main__": sys.exit(main())