feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
+170
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
═══════════════════════════════════════════════════════════════
|
||||
hks_cbf_games_scraper.py — HKS-CBF games (schedule + scores)
|
||||
Version: 1.0.0 | 2026-05-11 (S1 task — games half)
|
||||
Author: Damir Radulić <dradulic@outlook.com>
|
||||
Target: /opt/pgz-sport/scrapers/hks_cbf_games_scraper.py
|
||||
Source: hosted.dcd.shared.geniussports.com/embednf/HKS/en
|
||||
/competition/{id}/schedule (JSON-wrapped HTML; parsed)
|
||||
|
||||
Run modes:
|
||||
python3 hks_cbf_games_scraper.py # all 3 comps, inserts into DB (default)
|
||||
python3 hks_cbf_games_scraper.py --comp 42186 # one comp
|
||||
python3 hks_cbf_games_scraper.py --dry # no DB writes
|
||||
|
||||
Schema: pgz_sport.hks_cbf_games (already deployed):
|
||||
hks_game_id TEXT, competition_id TEXT, datum DATE,
|
||||
home_team_id/name, away_team_id/name, home_score/away_score,
|
||||
status TEXT, source_url TEXT, raw_data JSONB
|
||||
═══════════════════════════════════════════════════════════════
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Optional
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
# Base URL of the Genius Sports embed feed that all schedule requests hit.
GS_BASE = "https://hosted.dcd.shared.geniussports.com/embednf/HKS/en"

# User-Agent header sent with every HTTP request made by this script.
UA = "RiNET-Civic/1.0 (https://rinet.one) HKS-CBF games sync"

# Competitions scraped by default; `--comp` selects one entry by its "id".
COMPS = [
    {"id": "42186", "label": "Supersport Premijer Liga (M) 2025/26", "spol": "M"},
    {"id": "42187", "label": "Supersport Premijer Liga (Ž) 2025/26", "spol": "Ž"},
    {"id": "42259", "label": "1. Muška liga 2025/26", "spol": "M"},
]

# Keyword arguments for psycopg2.connect(), each overridable via environment.
# DB_PASSWORD is read with .get() so that importing the module (and `--dry`
# runs, which never open a connection) does not fail when it is unset;
# psycopg2 ignores keyword arguments whose value is None.
DB = dict(
    host=os.environ.get("PG_HOST", "10.10.0.2"),
    port=int(os.environ.get("PG_PORT", "6432")),
    dbname=os.environ.get("PG_DB", "rinet_v3"),
    user=os.environ.get("PG_USER", "rinet"),
    password=os.environ.get("DB_PASSWORD"),
)
|
||||
|
||||
|
||||
def http_get(url: str, timeout: int = 20) -> str:
    """Fetch *url* with a GET request and return the body decoded as UTF-8.

    The request carries the module's ``UA`` string as User-Agent and asks
    for JSON through the ``Accept`` header. *timeout* (seconds) is handed
    to ``urlopen``. Errors raised by ``urllib`` or by the UTF-8 decode
    propagate to the caller unchanged.
    """
    headers = {"User-Agent": UA, "Accept": "application/json"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw_body = response.read()
    return raw_body.decode("utf-8")
|
||||
|
||||
|
||||
# Schedule-HTML patterns. Deliberately small; written against the markup the
# schedule endpoint returned when this scraper was created. All of them use
# DOTALL so "." also matches newlines inside a fragment.

# One hit per game: (status class, fixture id, inner HTML up to the next
# "match-wrap" div or the end of the document).
RE_MATCH = re.compile(
    r'<div class="match-wrap (\w+)" id ?= ?"extfix_(\d+)">(.*?)(?=<div class="match-wrap |$)',
    flags=re.DOTALL,
)

# Text of the first <span> following the "Date / Time:" label.
RE_DATE = re.compile(r"Date / Time:.*?<span>([^<]+)</span>", flags=re.DOTALL)

# Text content of the element carrying class="venuename".
RE_VENUE = re.compile(r'class="venuename"[^>]*>([^<]+)<', flags=re.DOTALL)

# (side, team id from the "team/<id>?" link, full team name) per team block.
RE_TEAM_BLOCK = re.compile(
    r'<div class="(home|away)-team">.*?team/(\d+)\?.*?<span class="team-name-full">([^<]+)<',
    flags=re.DOTALL,
)

# (side, score digits) per score cell.
RE_SCORE = re.compile(
    r'<div class="team-score (home|away)score">\s*<div class="fake-cell">(\d+)<',
    flags=re.DOTALL,
)
|
||||
|
||||
|
||||
# English month abbreviations -> month number. Keys are lower case so the
# lookup is case-insensitive and does not depend on the process locale.
_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}

# "<month> <day>, <year>" anywhere in the text; a trailing time is ignored.
_RE_US_DATE = re.compile(r"(\w+)\s+(\d{1,2}),\s+(\d{4})")


def parse_us_date(s: str) -> Optional[str]:
    """'Sep 26, 2025, 7:00 PM' → '2025-09-26'.

    Accepts a US-style date with an abbreviated or full English month name
    ("Sep 26, 2025", "September 26, 2025, 7:00 PM"); only the first three
    letters of the month are used. The time part, if any, is discarded.

    Returns the date as an ISO ``YYYY-MM-DD`` string, or ``None`` when *s*
    is empty/None, does not contain a recognisable date, or names a day
    that does not exist in the calendar (e.g. "Feb 31, 2025").
    """
    found = _RE_US_DATE.search((s or "").strip())
    if found is None:
        return None

    month = _MONTH_NUMBERS.get(found.group(1)[:3].lower())
    if month is None:
        return None

    year = int(found.group(3))
    day = int(found.group(2))
    try:
        # Constructing a datetime rejects impossible dates (day 0, Feb 31,
        # year 0) that would otherwise be emitted as a malformed ISO string.
        datetime(year, month, day)
    except ValueError:
        return None
    return f"{year:04d}-{month:02d}-{day:02d}"
|
||||
|
||||
|
||||
def _parse_match(status: str, mid: str, body: str, comp_id: str) -> dict:
    """Build one game row from a single "match-wrap" fragment of schedule HTML."""
    date_m = RE_DATE.search(body)
    venue_m = RE_VENUE.search(body)

    # Keep only the FIRST team block / score cell found for each side.
    teams = {}
    for side, team_id, team_name in RE_TEAM_BLOCK.findall(body):
        teams.setdefault(side, (team_id, team_name.strip()))
    scores = {}
    for side, points in RE_SCORE.findall(body):
        scores.setdefault(side, int(points))

    home_id, home_name = teams.get("home", (None, None))
    away_id, away_name = teams.get("away", (None, None))

    return {
        "hks_game_id": mid,
        "competition_id": comp_id,
        "datum": parse_us_date(date_m.group(1) if date_m else ""),
        "home_team_id": home_id,
        "home_team_name": home_name,
        "away_team_id": away_id,
        "away_team_name": away_name,
        # Scores stay None when the fragment has no score cell for that side.
        "home_score": scores.get("home"),
        "away_score": scores.get("away"),
        "status": status,
        "source_url": f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fmatch%2F{mid}",
        # Not a table column in the INSERT; upsert() stores it inside raw_data.
        "venue": venue_m.group(1).strip() if venue_m else None,
    }


def iter_comp_games(comp_id: str) -> Iterator[dict]:
    """Yield one row dict per game found in a competition's schedule.

    Downloads ``{GS_BASE}/competition/{comp_id}/schedule``, decodes the JSON
    wrapper and scans its ``"html"`` member with the ``RE_*`` patterns.

    Each yielded dict has the keys ``hks_game_id``, ``competition_id``,
    ``datum``, ``home_team_id``, ``home_team_name``, ``away_team_id``,
    ``away_team_name``, ``home_score``, ``away_score``, ``status``,
    ``source_url`` and ``venue``. Fields that cannot be found in the markup
    are ``None``.

    Network and JSON-decoding errors propagate to the caller. Nothing is
    yielded when the payload carries no HTML.
    """
    payload = json.loads(http_get(f"{GS_BASE}/competition/{comp_id}/schedule"))
    # `.get("html", "")` only covers a missing key; a present-but-null value
    # would reach the regex as None and raise TypeError, hence `or ""`.
    html = payload.get("html") or ""
    for status, mid, body in RE_MATCH.findall(html):
        yield _parse_match(status, mid, body, comp_id)
|
||||
|
||||
|
||||
def upsert(cur, row: dict) -> bool:
    """Insert one game row and report whether a new row was written.

    *cur* is an open DB cursor; *row* is a dict shaped like the ones
    produced by ``iter_comp_games``. The row's ``venue`` value is packed
    into the ``raw_data`` JSONB column as ``{"venue": ...}``. *row* itself
    is not modified.

    Returns ``True`` when ``cur.rowcount`` is positive after the INSERT,
    ``False`` when the statement hit a conflict and wrote nothing.

    NOTE(review): because the statement ends in ``ON CONFLICT DO NOTHING``,
    a game that already exists is left untouched, so later score/status
    changes are not written by this function. Which constraint triggers
    the conflict is defined by the table, not visible here - confirm.
    """
    params = dict(row)
    params["raw"] = json.dumps({"venue": row.get("venue")}, ensure_ascii=False)

    statement = """
        INSERT INTO pgz_sport.hks_cbf_games
        (hks_game_id, competition_id, datum,
        home_team_id, home_team_name, away_team_id, away_team_name,
        home_score, away_score, status, source_url, raw_data)
        VALUES (%(hks_game_id)s, %(competition_id)s, %(datum)s,
        %(home_team_id)s, %(home_team_name)s, %(away_team_id)s, %(away_team_name)s,
        %(home_score)s, %(away_score)s, %(status)s, %(source_url)s, %(raw)s::jsonb)
        ON CONFLICT DO NOTHING
        """
    cur.execute(statement, params)

    inserted = cur.rowcount > 0
    return inserted
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: scrape the selected competitions.

    Options:
        --comp ID   scrape only the competition with this id (must be one
                    of ``COMPS``); default is every entry in ``COMPS``.
        --dry       parse and count games but never open a DB connection.

    Prints one summary line per competition plus a grand total.

    Returns the process exit status: ``0`` on success, ``2`` when ``--comp``
    names an unknown competition. Network, parsing and database errors
    propagate; in that case nothing is committed and the cursor and
    connection are still closed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--comp", help="comp id (default: all 3 in COMPS)")
    ap.add_argument("--dry", action="store_true", help="no DB writes")
    args = ap.parse_args()

    comps = [c for c in COMPS if args.comp is None or c["id"] == args.comp]
    if not comps:
        print(f"unknown comp {args.comp}", file=sys.stderr)
        return 2

    conn = None if args.dry else psycopg2.connect(**DB)
    cur = conn.cursor() if conn is not None else None
    n_total = 0
    n_new = 0

    try:
        for c in comps:
            n_c = 0
            n_new_c = 0
            for game in iter_comp_games(c["id"]):
                n_c += 1
                if cur is not None and upsert(cur, game):
                    n_new_c += 1
            detail = f", inserted {n_new_c} new" if cur is not None else " (dry)"
            print(f" comp {c['id']} ({c['label']}): {n_c} games" + detail)
            n_total += n_c
            n_new += n_new_c

        # Single commit after all competitions, as before: a failure anywhere
        # above leaves the database unchanged.
        if conn is not None:
            conn.commit()
    finally:
        # Release DB resources even when scraping or inserting raised;
        # closing without commit discards the pending transaction.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

    new_part = f" / {n_new} new" if conn is not None else ""
    print(f"\ntotal: {n_total} games seen{new_part}")
    return 0
|
||||
|
||||
|
||||
# Run the scraper only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user