171 lines
6.7 KiB
Python
Executable File
171 lines
6.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
═══════════════════════════════════════════════════════════════
|
|
hks_cbf_games_scraper.py — HKS-CBF games (schedule + scores)
|
|
Version: 1.0.0 | 2026-05-11 (S1 task — games half)
|
|
Author: Damir Radulić <dradulic@outlook.com>
|
|
Target: /opt/pgz-sport/scrapers/hks_cbf_games_scraper.py
|
|
Source: hosted.dcd.shared.geniussports.com/embednf/HKS/en
|
|
/competition/{id}/schedule (JSON-wrapped HTML; parsed)
|
|
|
|
Run modes:
|
|
python3 hks_cbf_games_scraper.py # all 3 comps, inserts into DB (default)
|
|
python3 hks_cbf_games_scraper.py --comp 42186 # one comp
|
|
python3 hks_cbf_games_scraper.py --dry # no DB writes
|
|
|
|
Schema: pgz_sport.hks_cbf_games (already deployed):
|
|
hks_game_id TEXT, competition_id TEXT, datum DATE,
|
|
home_team_id/name, away_team_id/name, home_score/away_score,
|
|
status TEXT, source_url TEXT, raw_data JSONB
|
|
═══════════════════════════════════════════════════════════════
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import urllib.request
|
|
from datetime import datetime
|
|
from typing import Iterator, Optional
|
|
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# Root of the schedule endpoints on the geniussports.com host; the
# per-competition path is appended to it when a schedule is fetched.
GS_BASE = "https://hosted.dcd.shared.geniussports.com/embednf/HKS/en"

# Value sent as the User-Agent header on every HTTP request.
UA = "RiNET-Civic/1.0 (https://rinet.one) HKS-CBF games sync"

# Competitions scraped when no --comp filter is given.
#   id    - competition id used in the schedule URL and stored with each game
#   label - human-readable name, only used in progress output
#   spol  - presumably the gender category ("M" / "Ž"); not read in this file
COMPS = [
    dict(id="42186", label="Supersport Premijer Liga (M) 2025/26", spol="M"),
    dict(id="42187", label="Supersport Premijer Liga (Ž) 2025/26", spol="Ž"),
    dict(id="42259", label="1. Muška liga 2025/26", spol="M"),
]
|
|
|
|
# Keyword arguments for psycopg2.connect(**DB). Each value is taken from the
# environment, falling back to the default shown when the variable is unset.
#
# DB_PASSWORD is read with .get() rather than indexed: indexing raised
# KeyError while the module was being imported, which made `--dry` and
# `--help` unusable without the secret even though neither opens a
# connection. When the variable is unset the value is None; psycopg2 drops
# None-valued connection keywords, so libpq's own password sources are tried
# and a missing password surfaces as a connection error at connect time.
DB = dict(
    host=os.environ.get("PG_HOST", "10.10.0.2"),
    port=int(os.environ.get("PG_PORT", "6432")),
    dbname=os.environ.get("PG_DB", "rinet_v3"),
    user=os.environ.get("PG_USER", "rinet"),
    password=os.environ.get("DB_PASSWORD"),
)
|
|
|
|
|
|
def http_get(url: str, timeout: int = 20) -> str:
    """Fetch *url* with a GET request and return the body as text.

    The request carries the module's User-Agent and an
    ``Accept: application/json`` header. ``timeout`` is handed to
    ``urllib.request.urlopen`` unchanged. The response bytes are always
    decoded as UTF-8; errors from urllib or from decoding propagate to the
    caller.
    """
    request_headers = {"User-Agent": UA, "Accept": "application/json"}
    request = urllib.request.Request(url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw_body = response.read()
    return raw_body.decode("utf-8")
|
|
|
|
|
|
# Parsers: deliberately small regexes written against the schedule HTML
# structure as it was observed when this scraper was written. All of them use
# DOTALL so that "." also spans the newlines inside the markup.

# One match block: captures the word following "match-wrap " in the class
# attribute, the numeric id after "extfix_", and everything up to the next
# match block (or the end of the document).
RE_MATCH = re.compile(
    r'<div class="match-wrap (\w+)" id ?= ?"extfix_(\d+)">(.*?)(?=<div class="match-wrap |$)',
    re.DOTALL,
)

# Text of the first <span> that follows the "Date / Time:" label.
RE_DATE = re.compile(r"Date / Time:.*?<span>([^<]+)</span>", re.DOTALL)

# Text content of the element carrying class="venuename".
RE_VENUE = re.compile(r'class="venuename"[^>]*>([^<]+)<', re.DOTALL)

# Per side ("home" / "away"): the numeric id taken from a "team/<id>?" link
# and the text of the team-name-full span.
RE_TEAM_BLOCK = re.compile(
    r'<div class="(home|away)-team">.*?team/(\d+)\?.*?<span class="team-name-full">([^<]+)<',
    re.DOTALL,
)

# Per side: the digits inside the score cell. A cell without digits does not
# match at all, so such a side simply yields no score.
RE_SCORE = re.compile(
    r'<div class="team-score (home|away)score">\s*<div class="fake-cell">(\d+)<',
    re.DOTALL,
)
|
|
|
|
|
|
# English month abbreviations in calendar order; a month's number is its
# position in this tuple plus one.
_MONTH_ABBREVIATIONS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)


def parse_us_date(s: str) -> Optional[str]:
    """Convert a US-style date such as 'Sep 26, 2025, 7:00 PM' to '2025-09-26'.

    The first ``<month> <day>, <year>`` group found anywhere in *s* is used, so
    a trailing time of day or a leading weekday is ignored. The month is
    recognised by the first three letters of its English name, in any letter
    case ('Sep', 'sep' and 'September' are all accepted).

    Months are matched against a fixed English table instead of
    ``strptime('%b')`` so that the result does not depend on the locale the
    process runs under.

    Args:
        s: Raw date text; ``None`` and empty strings are accepted.

    Returns:
        The date as 'YYYY-MM-DD', or ``None`` when no date is found, the month
        is not recognised, or the day/month/year do not form a real calendar
        date (for example 'Feb 31, 2025').
    """
    found = re.search(r"(\w+)\s+(\d{1,2}),\s+(\d{4})", (s or "").strip())
    if found is None:
        return None

    month_key = found.group(1)[:3].capitalize()
    if month_key not in _MONTH_ABBREVIATIONS:
        return None
    month = _MONTH_ABBREVIATIONS.index(month_key) + 1

    try:
        # Constructing a datetime validates the calendar date; impossible
        # combinations raise ValueError instead of producing a bogus string.
        parsed = datetime(int(found.group(3)), month, int(found.group(2)))
    except ValueError:
        return None
    return parsed.date().isoformat()
|
|
|
|
|
|
def iter_comp_games(comp_id: str) -> Iterator[dict]:
    """Yield one dict per match found in a competition's schedule.

    Downloads ``{GS_BASE}/competition/{comp_id}/schedule``, parses the response
    as JSON and scans the HTML stored under its ``"html"`` key with the
    module's regexes. Because this is a generator, the download happens when
    the first item is requested.

    Every yielded dict has the keys ``hks_game_id``, ``competition_id``,
    ``datum``, ``home_team_id``, ``home_team_name``, ``away_team_id``,
    ``away_team_name``, ``home_score``, ``away_score``, ``status``,
    ``source_url`` and ``venue``. Any piece that cannot be found in the markup
    is ``None``; when a side appears more than once in a match block, the
    first occurrence wins.
    """
    response = json.loads(http_get(f"{GS_BASE}/competition/{comp_id}/schedule"))
    schedule_html = response.get("html", "")

    for status, match_id, block in RE_MATCH.findall(schedule_html):
        # Keep only the first team / score seen for each side.
        team_by_side = {}
        for side, team_id, team_name in RE_TEAM_BLOCK.findall(block):
            team_by_side.setdefault(side, (team_id, team_name.strip()))

        score_by_side = {}
        for side, points in RE_SCORE.findall(block):
            score_by_side.setdefault(side, int(points))

        home_id, home_name = team_by_side.get("home", (None, None))
        away_id, away_name = team_by_side.get("away", (None, None))

        date_hit = RE_DATE.search(block)
        venue_hit = RE_VENUE.search(block)
        date_text = date_hit.group(1) if date_hit else ""
        venue = venue_hit.group(1).strip() if venue_hit else None

        yield {
            "hks_game_id": match_id,
            "competition_id": comp_id,
            "datum": parse_us_date(date_text),
            "home_team_id": home_id,
            "home_team_name": home_name,
            "away_team_id": away_id,
            "away_team_name": away_name,
            "home_score": score_by_side.get("home"),
            "away_score": score_by_side.get("away"),
            "status": status,
            "source_url": f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fmatch%2F{match_id}",
            "venue": venue,
        }
|
|
|
|
|
|
def upsert(cur, row: dict) -> bool:
    """Insert one game into ``pgz_sport.hks_cbf_games``, skipping conflicts.

    Args:
        cur: An open database cursor whose ``execute`` accepts named
            ``%(key)s`` parameters.
        row: A game dict as produced by ``iter_comp_games``. Its ``venue``
            entry is not a column of its own; it is stored inside the
            ``raw_data`` JSON document instead.

    Returns:
        ``True`` when the statement affected at least one row, ``False`` when
        the insert was skipped because of a conflict.

    NOTE(review): conflicting rows are skipped, not updated, so a game that was
    first stored without scores presumably keeps them empty on later runs.
    Confirm against the table's unique constraint whether ``DO UPDATE`` is
    wanted.
    """
    params = dict(row)
    # Non-ASCII venue names are kept readable in the stored JSON.
    params["raw"] = json.dumps({"venue": row.get("venue")}, ensure_ascii=False)

    statement = """
        INSERT INTO pgz_sport.hks_cbf_games
            (hks_game_id, competition_id, datum,
             home_team_id, home_team_name, away_team_id, away_team_name,
             home_score, away_score, status, source_url, raw_data)
        VALUES (%(hks_game_id)s, %(competition_id)s, %(datum)s,
                %(home_team_id)s, %(home_team_name)s, %(away_team_id)s, %(away_team_name)s,
                %(home_score)s, %(away_score)s, %(status)s, %(source_url)s, %(raw)s::jsonb)
        ON CONFLICT DO NOTHING
        """
    cur.execute(statement, params)
    return cur.rowcount > 0
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: scrape the selected competitions.

    Options:
        --comp ID  restrict the run to one competition id from ``COMPS``
        --dry      scrape and count only; no database connection is opened

    Without ``--dry`` every scraped game is passed to ``upsert`` and all
    inserts are committed together after the last competition. If scraping or
    inserting raises, nothing is committed and the cursor and connection are
    still closed before the exception propagates.

    Returns:
        0 on success, 2 when ``--comp`` names an id that is not in ``COMPS``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--comp", help="comp id (default: all 3 in COMPS)")
    ap.add_argument("--dry", action="store_true", help="no DB writes")
    args = ap.parse_args()

    comps = [c for c in COMPS if (args.comp is None or c["id"] == args.comp)]
    if not comps:
        print(f"unknown comp {args.comp}", file=sys.stderr)
        return 2

    conn = None if args.dry else psycopg2.connect(**DB)
    cur = None
    n_total = 0
    n_new = 0

    try:
        if conn is not None:
            cur = conn.cursor()

        for c in comps:
            n_c = 0
            n_new_c = 0
            for game in iter_comp_games(c["id"]):
                n_c += 1
                if cur is not None and upsert(cur, game):
                    n_new_c += 1
            print(f" comp {c['id']} ({c['label']}): {n_c} games" +
                  (f", inserted {n_new_c} new" if cur is not None else " (dry)"))
            n_total += n_c
            n_new += n_new_c

        if conn is not None:
            conn.commit()
    finally:
        # Always release the cursor and connection. Closing a connection
        # without a prior commit discards the pending inserts, so a failed
        # run leaves the table untouched.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

    new_suffix = "" if conn is None else " / " + str(n_new) + " new"
    print(f"\ntotal: {n_total} games seen{new_suffix}")
    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|