Files
pgz-sport/scrapers/hks_cbf_games_scraper.py_prije_env_deepseek
T

171 lines
6.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""
═══════════════════════════════════════════════════════════════
hks_cbf_games_scraper.py — HKS-CBF games (schedule + scores)
Version: 1.0.0 | 2026-05-11 (S1 task — games half)
Author: Damir Radulić <dradulic@outlook.com>
Target: /opt/pgz-sport/scrapers/hks_cbf_games_scraper.py
Source: hosted.dcd.shared.geniussports.com/embednf/HKS/en
/competition/{id}/schedule (JSON-wrapped HTML; parsed)
Run modes:
python3 hks_cbf_games_scraper.py # all 3 comps, writes to DB (default)
python3 hks_cbf_games_scraper.py --comp 42186 # one comp
python3 hks_cbf_games_scraper.py --dry # no DB writes
Schema: pgz_sport.hks_cbf_games (already deployed):
hks_game_id TEXT, competition_id TEXT, datum DATE,
home_team_id/name, away_team_id/name, home_score/away_score,
status TEXT, source_url TEXT, raw_data JSONB
═══════════════════════════════════════════════════════════════
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
import urllib.request
from datetime import datetime
from typing import Iterator, Optional
import psycopg2
import psycopg2.extras
# Root of the hosted schedule endpoint; competition paths are appended to it.
GS_BASE = "https://hosted.dcd.shared.geniussports.com/embednf/HKS/en"

# User-Agent header value sent with every outgoing request.
UA = "RiNET-Civic/1.0 (https://rinet.one) HKS-CBF games sync"

# Competitions processed when no --comp filter is given.
# "spol" carries the same M / Ž marker that appears in the labels.
COMPS = [
    dict(id="42186", label="Supersport Premijer Liga (M) 2025/26", spol="M"),
    dict(id="42187", label="Supersport Premijer Liga (Ž) 2025/26", spol="Ž"),
    dict(id="42259", label="1. Muška liga 2025/26", spol="M"),
]
# Keyword arguments for psycopg2.connect(); every value can be overridden
# through the environment (PG_HOST, PG_PORT, PG_DB, PG_USER, DB_PASSWORD).
#
# DB_PASSWORD is looked up with .get() rather than indexing so that merely
# importing this module — and running with --dry, which never opens a
# connection — does not die with KeyError when the secret is not exported.
# NOTE(review): with the variable unset, a real run now fails at connect time
# (or falls back to whatever libpq finds, e.g. PGPASSWORD / .pgpass) instead of
# at import time — confirm that is acceptable for the deployment.
DB = dict(
    host=os.environ.get("PG_HOST", "10.10.0.2"),
    port=int(os.environ.get("PG_PORT", "6432")),
    dbname=os.environ.get("PG_DB", "rinet_v3"),
    user=os.environ.get("PG_USER", "rinet"),
    password=os.environ.get("DB_PASSWORD"),
)
def http_get(url: str, timeout: int = 20) -> str:
    """Fetch *url* with a GET request and return the body decoded as UTF-8.

    The request carries the module's User-Agent and asks for JSON via the
    Accept header. *timeout* is passed to ``urllib.request.urlopen``.
    Errors raised by urllib or by decoding propagate to the caller.
    """
    headers = {"User-Agent": UA, "Accept": "application/json"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return payload.decode("utf-8")
# Parsers — kept small + tested against schedule HTML structure observed today.

# One match card. Groups: (word following "match-wrap " in the class
# attribute, numeric part of the "extfix_<n>" id, everything up to the next
# card or the end of the document).
RE_MATCH = re.compile(
    r'<div class="match-wrap (\w+)" id ?= ?"extfix_(\d+)">(.*?)(?=<div class="match-wrap |$)',
    flags=re.DOTALL,
)

# Text of the first <span> that follows the "Date / Time:" label.
RE_DATE = re.compile(r"Date / Time:.*?<span>([^<]+)</span>", flags=re.DOTALL)

# Text content of the element carrying class="venuename".
RE_VENUE = re.compile(r'class="venuename"[^>]*>([^<]+)<', flags=re.DOTALL)

# Per-side team block. Groups: (home|away, numeric id taken from a
# "team/<id>?" link, full team name).
RE_TEAM_BLOCK = re.compile(
    r'<div class="(home|away)-team">.*?team/(\d+)\?.*?<span class="team-name-full">([^<]+)<',
    flags=re.DOTALL,
)

# Per-side score cell. Groups: (home|away, digits of the score).
RE_SCORE = re.compile(
    r'<div class="team-score (home|away)score">\s*<div class="fake-cell">(\d+)<',
    flags=re.DOTALL,
)
# English month abbreviations -> month number. A fixed table is used instead
# of strptime("%b") because %b follows the process locale.
_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}

# "<month word> <1-2 digit day>, <4 digit year>" anywhere in the text; any
# trailing time-of-day part is simply not matched.
_RE_US_DATE = re.compile(r"(\w+)\s+(\d{1,2}),\s+(\d{4})")


def parse_us_date(s: str) -> Optional[str]:
    """Convert a US-style date to ISO form: 'Sep 26, 2025, 7:00 PM' -> '2025-09-26'.

    Only the date part is used; a trailing time such as ', 7:00 PM' is ignored.
    The month is recognised by its first three letters, case-insensitively, so
    both 'Sep' and 'September' work.

    Args:
        s: Text containing the date. ``None`` and empty strings are accepted.

    Returns:
        The date as 'YYYY-MM-DD', or ``None`` when no date is found, the month
        is unknown, or the day does not exist in that month (e.g. 'Feb 31').
    """
    m = _RE_US_DATE.search((s or "").strip())
    if not m:
        return None
    month = _MONTH_NUMBERS.get(m.group(1)[:3].lower())
    if month is None:
        return None
    try:
        # Constructing a datetime validates the calendar date, so impossible
        # values never reach the DATE column.
        parsed = datetime(int(m.group(3)), month, int(m.group(2)))
    except ValueError:
        return None
    return parsed.date().isoformat()
def iter_comp_games(comp_id: str) -> Iterator[dict]:
    """Yield one dict per game found on the schedule of competition *comp_id*.

    The schedule endpoint returns JSON; the markup under its "html" key is
    scanned with the module-level RE_* patterns. Each yielded dict holds the
    game id, competition id, ISO date (or None), home/away team id and name,
    home/away score, status, a source URL and the venue. Anything that the
    patterns do not find in a game's markup is reported as None.
    """
    schedule_url = f"{GS_BASE}/competition/{comp_id}/schedule"
    document = json.loads(http_get(schedule_url))
    markup = document.get("html", "")

    for match in RE_MATCH.finditer(markup):
        status, game_id, fragment = match.groups()

        # Keep only the first block seen for each side.
        team_by_side = {}
        for side, team_id, team_name in RE_TEAM_BLOCK.findall(fragment):
            team_by_side.setdefault(side, (team_id, team_name.strip()))

        # Same first-hit-wins rule for the score cells.
        score_by_side = {}
        for side, digits in RE_SCORE.findall(fragment):
            score_by_side.setdefault(side, int(digits))

        home_id, home_name = team_by_side.get("home", (None, None))
        away_id, away_name = team_by_side.get("away", (None, None))

        date_hit = RE_DATE.search(fragment)
        raw_date = date_hit.group(1) if date_hit else ""

        venue_hit = RE_VENUE.search(fragment)
        venue = venue_hit.group(1).strip() if venue_hit else None

        yield {
            "hks_game_id": game_id,
            "competition_id": comp_id,
            "datum": parse_us_date(raw_date),
            "home_team_id": home_id,
            "home_team_name": home_name,
            "away_team_id": away_id,
            "away_team_name": away_name,
            "home_score": score_by_side.get("home"),
            "away_score": score_by_side.get("away"),
            "status": status,
            "source_url": f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fmatch%2F{game_id}",
            "venue": venue,
        }
# Insert statement for one game row; parameters are bound by name.
_INSERT_GAME_SQL = """
INSERT INTO pgz_sport.hks_cbf_games
(hks_game_id, competition_id, datum,
home_team_id, home_team_name, away_team_id, away_team_name,
home_score, away_score, status, source_url, raw_data)
VALUES (%(hks_game_id)s, %(competition_id)s, %(datum)s,
%(home_team_id)s, %(home_team_name)s, %(away_team_id)s, %(away_team_name)s,
%(home_score)s, %(away_score)s, %(status)s, %(source_url)s, %(raw)s::jsonb)
ON CONFLICT DO NOTHING
"""


def upsert(cur, row: dict) -> bool:
    """Insert *row* into pgz_sport.hks_cbf_games unless it conflicts.

    Args:
        cur: Open DB cursor; only ``execute()`` and ``rowcount`` are used.
        row: Game dict as produced by the scraper. Its "venue" value (if any)
            is stored as JSON in the raw_data column.

    Returns:
        True when the statement affected at least one row, False otherwise.
    """
    # NOTE(review): ON CONFLICT DO NOTHING leaves an existing row untouched,
    # so a game first stored without scores would presumably never receive
    # them on later runs — confirm whether DO UPDATE on the table's unique
    # key is wanted here.
    params = dict(row)
    params["raw"] = json.dumps({"venue": row.get("venue")}, ensure_ascii=False)
    cur.execute(_INSERT_GAME_SQL, params)
    return cur.rowcount > 0
def main() -> int:
    """Command-line entry point: scrape the selected competitions and store games.

    Flags:
        --comp ID   restrict the run to one competition id from COMPS
        --dry       parse and count only; no DB connection is opened

    All inserts share one transaction that is committed only after every
    selected competition was processed. If anything raises before that point
    the exception propagates, nothing is committed, and the cursor and
    connection are still closed.

    Returns:
        0 on success, 2 when --comp does not match any entry in COMPS.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--comp", help="comp id (default: all 3 in COMPS)")
    ap.add_argument("--dry", action="store_true", help="no DB writes")
    args = ap.parse_args()

    comps = [c for c in COMPS if args.comp is None or c["id"] == args.comp]
    if not comps:
        print(f"unknown comp {args.comp}", file=sys.stderr)
        return 2

    conn = None if args.dry else psycopg2.connect(**DB)
    cur = None
    n_total = 0
    n_new = 0
    try:
        if conn is not None:
            cur = conn.cursor()
        for c in comps:
            n_c = 0
            n_new_c = 0
            for game in iter_comp_games(c["id"]):
                n_c += 1
                if cur is not None and upsert(cur, game):
                    n_new_c += 1
            print(f" comp {c['id']} ({c['label']}): {n_c} games" +
                  (f", inserted {n_new_c} new" if cur else " (dry)"))
            n_total += n_c
            n_new += n_new_c
        if conn is not None:
            conn.commit()
    finally:
        # Release DB resources on every exit path; closing without a prior
        # commit discards the pending inserts.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

    print(f"\ntotal: {n_total} games seen{' / ' + str(n_new) + ' new' if conn else ''}")
    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())