# pgz-sport/scripts/hns_api_client.py

#!/usr/bin/env python3
# hns_api_client.py — HNS Semafor structured client (v1.0)
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date: 2026-05-05
# Description:
#   Reverse-engineered client for https://semafor.hns.family.
#   The site is server-rendered ASP.NET (NOT Next.js — no __NEXT_DATA__,
#   no hydration JSON). The only XHR endpoints exposed are filter helpers
#   /handlers/getOrganizations/, getCompetitions/, getAgeCategories/,
#   getCalendarEvents/. All player & club data is rendered into stable
#   semantic HTML (classes: playerHeader, playerData, playerCompetitionStatsTable,
#   matchlist, clubHeader, basic_info, playerslist…).
#
#   This module therefore implements a fast HTML→JSON parser using requests
#   + BeautifulSoup, with a connection-pooled session, polite UA, and an
#   hourly cache for the build token only (fetched pages are not cached).
#   It exposes the SDK surface SUB4 was asked for:
#     fetch_player(hns_id, slug) -> dict
#     fetch_klub(hns_id, slug)   -> dict
#     fetch_klub_roster(hns_id, slug) -> list
#     get_buildid() -> str   (no buildId on this stack — returns site CSS hash)
#
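# Fallback chain inside _get_html():
#   1. requests via the shared session + polite UA (primary)
#   2. the same request retried with a growing back-off (on 403/503 or any
#      raised error); no extra Referer/Cookie headers are added on retry
#   3. Playwright (lazy import) for JS-only edge cases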
#
import json
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from functools import lru_cache
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Root URL of the HNS Semafor site; every page URL in this module is built from it.
BASE = "https://semafor.hns.family"
# Desktop Chrome-on-Linux User-Agent string. Sent with every session request
# and also handed to the Playwright fallback context in _get_html().
UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
# Default headers installed on the shared session below.
# NOTE(review): "br" is advertised in Accept-Encoding unconditionally; presumably
# this relies on a Brotli decoder being installed alongside requests — confirm.
HEADERS = {
    "User-Agent": UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "hr,en-US;q=0.7,en;q=0.3",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "no-cache",
}

# Module-wide session used by _get_html() for all plain-HTTP requests.
_session = requests.Session()
_session.headers.update(HEADERS)


# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def _txt(node, default: str = "") -> str:
    if node is None:
        return default
    return re.sub(r"\s+", " ", node.get_text(" ", strip=True)).strip()


def _int(s: str | None, default: int | None = None) -> int | None:
    if s is None:
        return default
    m = re.search(r"-?\d+", s.replace("\xa0", " "))
    return int(m.group()) if m else default


def _href_id(a, pattern: str) -> int | None:
    if a is None or not a.get("href"):
        return None
    m = re.search(pattern, a["href"])
    return int(m.group(1)) if m else None


def _get_html(url: str, *, timeout: int = 20, retries: int = 2) -> str:
    """Fetch *url* and return its HTML, trying plain HTTP first and Playwright last.

    The shared session is tried ``retries + 1`` times. A 200 response with a
    non-empty body is returned immediately; 403/503 responses and raised
    errors are retried after a growing back-off. If every HTTP attempt fails,
    a headless Chromium page is loaded through Playwright (imported lazily so
    the dependency stays optional).

    Args:
        url: Absolute URL to download.
        timeout: Per-request timeout in seconds for the HTTP attempts.
        retries: Number of additional HTTP attempts after the first one.

    Returns:
        The page HTML as text.

    Raises:
        RuntimeError: when the HTTP attempts and the Playwright fallback all
            fail; the message carries the last HTTP error if there was one.
    """
    last_exc: Exception | None = None
    for attempt in range(retries + 1):
        try:
            r = _session.get(url, timeout=timeout)
            if r.status_code == 200 and r.text:
                return r.text
            if r.status_code in (403, 503) and attempt < retries:
                time.sleep(1.0 + attempt)
                continue
            r.raise_for_status()
        except Exception as e:  # pragma: no cover — network paths
            last_exc = e
            # Back off only when another HTTP attempt follows; sleeping after
            # the final attempt would just delay the Playwright fallback.
            if attempt < retries:
                time.sleep(1.0 + attempt)
    # Playwright fallback (last resort)
    try:
        from playwright.sync_api import sync_playwright  # type: ignore

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                return page.content()
            finally:
                # Close the browser even when navigation fails.
                browser.close()
    except Exception as e:
        raise RuntimeError(f"_get_html failed for {url}: {last_exc or e}") from e


# ---------------------------------------------------------------------------
# buildId-equivalent — NOT a Next.js app. We surface a deterministic version
# token taken from the cached CSS asset hash; refreshes hourly.
# ---------------------------------------------------------------------------
@lru_cache(maxsize=1)
def _build_token_cached(hour_bucket: int) -> str:
    try:
        html = _get_html(BASE + "/", timeout=15)
        m = re.search(r"common\.min\.css\?v=([A-Za-z0-9_\-]+)", html)
        return m.group(1) if m else f"unknown-{hour_bucket}"
    except Exception:
        return f"unknown-{hour_bucket}"


def get_buildid() -> str:
    """Return the site's current asset version token (cached per clock hour).

    HNS Semafor is server-rendered ASP.NET and has no Next.js buildId, so the
    CSS asset hash is exposed as the equivalent cache-busting token.
    """
    now_s = int(time.time())
    # Integer hours since the epoch: the cache key changes once per hour.
    return _build_token_cached(now_s // 3600)


# ---------------------------------------------------------------------------
# Player parser
# ---------------------------------------------------------------------------
def _parse_player_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Extract profile fields from the ``div.block.playerHeader`` element.

    Returns an empty dict when the header block is absent. Missing text
    fields are reported as ``None``.
    """
    header = soup.select_one("div.block.playerHeader")
    if header is None:
        return {}

    def text_of(selector: str) -> str:
        # Normalised text of the first match inside the header ("" if none).
        return _txt(header.select_one(selector))

    photo = header.select_one(".photo img")
    club_link = header.select_one(".playerData li.club a")

    # The birth line starts with a dd.mm.yyyy date and may carry "(NN godina".
    birth_line = text_of(".playerData li.dob h4")
    date_hit = re.match(r"(\d{2}\.\d{2}\.\d{4})", birth_line)
    age_hit = re.search(r"\((\d+)\s+godina", birth_line)

    return {
        "name": text_of(".playerName .name") or None,
        "surname": text_of(".playerName .surname") or None,
        "shirt_number": _int(text_of(".playerName h3")),
        "photo_url": photo["src"] if photo and photo.get("src") else None,
        "current_club": {
            "id": _href_id(club_link, r"/klubovi/(\d+)/"),
            "name": text_of(".playerData li.club h4") or None,
        },
        "dob": date_hit.group(1) if date_hit else None,
        "age": int(age_hit.group(1)) if age_hit else None,
        "place_of_birth": text_of(".playerData li.pob h4") or None,
    }


def _parse_season_recap(soup: BeautifulSoup) -> dict[str, Any]:
    """Parse the ``div.player_season_stats_recap`` summary block.

    Returns ``{}`` when the block is absent. Otherwise the dict holds
    ``"season"`` (the ``h2`` text or ``None``) plus one integer entry per
    ``ul > li`` item, keyed by the item's first CSS class and read from its
    ``h4``.
    """
    block = soup.select_one("div.player_season_stats_recap")
    if block is None:
        return {}
    out: dict[str, Any] = {"season": _txt(block.select_one("h2")) or None}
    for li in block.select("ul > li"):
        classes = li.get("class") or []
        if not classes:
            # Without a class there is no key to store the value under; a
            # None key would collide across items and serialise as "null".
            continue
        out[classes[0]] = _int(_txt(li.select_one("h4")))
    return out


def _parse_player_seasons(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Parse per-season statistics from ``div.player_profile_matches``.

    Each ``h2.seasonTitle`` is paired by position with a ``div.tabbedContent``
    that holds competition stats tables and match lists. A title without a
    matching tab yields a season with empty ``competitions`` and ``matches``.
    """
    container = soup.select_one("div.player_profile_matches")
    if container is None:
        return []

    def is_header(row) -> bool:
        return "header" in (row.get("class") or [])

    def cell(row, selector: str) -> str:
        return _txt(row.select_one(selector))

    def competition_entry(row) -> dict[str, Any]:
        link = row.select_one(".title a")
        entry: dict[str, Any] = {
            "competition": cell(row, ".title") or None,
            "competition_url": link["href"] if link and link.get("href") else None,
            "competition_id": _href_id(link, r"/natjecanja/(\d+)/"),
        }
        # Numeric columns share the CSS class name with the output key.
        for stat in ("apps", "starter", "sub", "minutes", "goals", "yellows", "reds"):
            entry[stat] = _int(cell(row, "." + stat))
        return entry

    def match_entry(row) -> dict[str, Any] | None:
        date = cell(row, ".date")
        home = cell(row, ".club1")
        away = cell(row, ".club2")
        result = cell(row, ".result")
        round_label = cell(row, ".competitionround")
        if not (date or home or away or result):
            # Row carries none of the identifying match fields: skip it.
            return None
        return {
            "date": date or None,
            "home": home or None,
            "away": away or None,
            "result": result or None,
            "competition_round": round_label or None,
        }

    titles = container.select("h2.seasonTitle")
    tabs = list(container.select("div.tabbedContent"))
    # Pad with None so every title has a partner even if tabs run short.
    tabs += [None] * (len(titles) - len(tabs))

    seasons: list[dict[str, Any]] = []
    for heading, tab in zip(titles, tabs):
        competitions: list[dict[str, Any]] = []
        matches: list[dict[str, Any]] = []
        if tab is not None:
            # competitions stats tables
            for table in tab.select("div.stats_table"):
                rows = [
                    competition_entry(row)
                    for row in table.select("ul > li.row")
                    if not is_header(row)
                ]
                if rows:
                    competitions.append({"rows": rows})
            # matches lists
            for match_list in tab.select("div.matchlist"):
                for row in match_list.select("li.row, div.row"):
                    if is_header(row):
                        continue
                    entry = match_entry(row)
                    if entry is not None:
                        matches.append(entry)
        seasons.append(
            {"season": _txt(heading), "competitions": competitions, "matches": matches}
        )
    return seasons


def fetch_player(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a player profile from semafor.hns.family and return structured JSON.

    Args:
        hns_id: Numeric player id used in the ``/igraci/<id>/<slug>/`` URL.
        slug: URL slug of the player; surrounding slashes are stripped.

    Returns:
        A dict with request metadata (``source_url``, ``build_token``,
        ``fetched_at`` as a Unix timestamp, ``fetch_ms``) and the parsed
        ``profile``, ``season_recap`` and ``seasons`` sections.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            downloaded.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/igraci/{hns_id}/{clean_slug}/"
    # Time only the page download, using a monotonic clock. Measuring inside
    # the dict literal would also include the build-token lookup, which costs
    # an extra homepage request whenever its hourly cache is cold.
    t0 = time.perf_counter()
    html = _get_html(url)
    fetch_ms = int((time.perf_counter() - t0) * 1000)
    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_igrac_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": int(time.time()),
        "fetch_ms": fetch_ms,
        "profile": _parse_player_header(soup),
        "season_recap": _parse_season_recap(soup),
        "seasons": _parse_player_seasons(soup),
    }


# ---------------------------------------------------------------------------
# Klub parser
# ---------------------------------------------------------------------------
def _parse_klub_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Parse the ``div.block.clubHeader`` element of a club page.

    Returns ``{}`` when the header is absent. Otherwise the dict holds
    ``name``, ``full_name`` and ``logo_url`` plus one entry per
    ``.info ul > li`` item, keyed by the item's first CSS class and read from
    its ``h3`` (``None`` when empty).
    """
    hdr = soup.select_one("div.block.clubHeader")
    if hdr is None:
        return {}
    img = hdr.select_one(".logo img")
    info: dict[str, Any] = {
        "name": _txt(hdr.select_one(".title h1")) or None,
        "full_name": _txt(hdr.select_one(".title h2")) or None,
        "logo_url": img["src"] if img and img.get("src") else None,
    }
    for li in hdr.select(".info ul > li"):
        classes = li.get("class") or []
        if not classes:
            # Without a class there is no key to store the value under; a
            # None key would collide across items and serialise as "null".
            continue
        info[classes[0]] = _txt(li.select_one("h3")) or None
    return info


def fetch_klub(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a club profile (header + meta) from semafor.hns.family.

    Args:
        hns_id: Numeric club id used in the ``/klubovi/<id>/<slug>/`` URL.
        slug: URL slug of the club; surrounding slashes are stripped.

    Returns:
        A dict with request metadata (``source_url``, ``build_token``,
        ``fetched_at`` as a Unix timestamp, ``fetch_ms``) and the parsed
        ``info``, ``competitions`` and ``next_match`` sections.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            downloaded.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/klubovi/{hns_id}/{clean_slug}/"
    # Time only the page download, using a monotonic clock. Measuring inside
    # the dict literal would also include the build-token lookup, which costs
    # an extra homepage request whenever its hourly cache is cold.
    t0 = time.perf_counter()
    html = _get_html(url)
    fetch_ms = int((time.perf_counter() - t0) * 1000)
    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_klub_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": int(time.time()),
        "fetch_ms": fetch_ms,
        "info": _parse_klub_header(soup),
        "competitions": _parse_klub_competitions(soup),
        "next_match": _parse_next_match(soup),
    }


def _parse_klub_competitions(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Collect competition rows from every ``div.competition_table``.

    Header rows are skipped. Each entry holds the competition label (the
    ``.title`` text, else the first link's text, else ``None``) and the id
    parsed from the first link's ``/natjecanja/<id>/`` href.
    """
    found: list[dict[str, Any]] = []
    for table in soup.select("div.competition_table"):
        for row in table.select("li.row, div.row"):
            if "header" in (row.get("class") or []):
                continue
            link = row.select_one("a")
            label = _txt(row.select_one(".title")) or _txt(link) or None
            found.append(
                {
                    "competition": label,
                    "competition_id": _href_id(link, r"/natjecanja/(\d+)/"),
                }
            )
    return found


def _parse_next_match(soup: BeautifulSoup) -> dict[str, Any] | None:
    """Return the scoreboard block's text (first 400 chars) as ``raw_text``.

    Looks for ``div.scoreboard_next_previous`` or ``div.current_results``;
    returns ``None`` when neither is present.
    """
    node = soup.select_one("div.scoreboard_next_previous, div.current_results")
    if node is not None:
        snippet = _txt(node)[:400]
        return {"raw_text": snippet if snippet else None}
    return None


# ---------------------------------------------------------------------------
# Roster parser — the “Igrači” tab (server-rendered with the page)
# ---------------------------------------------------------------------------
def fetch_klub_roster(hns_id: int, slug: str) -> list[dict[str, Any]]:
    """Return the players listed in the club page's ``div.playerslist`` blocks.

    Each player appears once (first occurrence wins), identified by the id in
    the row's ``/igraci/<id>/`` link. Rows without such a link and header
    rows are ignored.
    """
    url = f"{BASE}/klubovi/{hns_id}/{slug.strip('/')}/"
    soup = BeautifulSoup(_get_html(url), "html.parser")
    player_link = re.compile(r"/igraci/\d+/")

    def stat(row, css_class: str) -> int | None:
        return _int(_txt(row.select_one("." + css_class)))

    players: list[dict[str, Any]] = []
    known_ids: set[int] = set()
    candidate_rows = (
        row
        for block in soup.select("div.playerslist")
        for row in block.select("li.row, div.row, tr")
    )
    for row in candidate_rows:
        if "header" in (row.get("class") or []):
            continue
        link = row.find("a", href=player_link)
        if link is None:
            continue
        player_id = _href_id(link, r"/igraci/(\d+)/")
        if player_id is None or player_id in known_ids:
            continue
        known_ids.add(player_id)

        href = link["href"]
        slug_hit = re.search(r"/igraci/\d+/([^/]+)/", href)
        entry: dict[str, Any] = {
            "hns_igrac_id": player_id,
            "slug": slug_hit.group(1) if slug_hit else None,
            "name": _txt(link) or None,
            # Site-relative links are made absolute; others pass through.
            "url": BASE + href if href.startswith("/") else href,
        }
        for key in ("apps", "minutes", "goals", "yellows", "reds"):
            entry[key] = stat(row, key)
        players.append(entry)
    return players


# ---------------------------------------------------------------------------
# CLI / smoke test
# ---------------------------------------------------------------------------
def _bench(label: str, fn, *args):
    t0 = time.time()
    out = fn(*args)
    dt = (time.time() - t0) * 1000
    print(f"[{label}] {dt:6.1f} ms  args={args}", file=sys.stderr)
    return out, dt


def _slugify(name: str) -> str:
    name = name.lower()
    repl = {"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"}
    for k, v in repl.items():
        name = name.replace(k, v)
    name = re.sub(r"[^a-z0-9]+", "-", name).strip("-")
    return name


if __name__ == "__main__":
    # Smoke test / benchmark: hits the live site for a few known players and
    # clubs, prints a one-line summary per sample, then dumps the first sample
    # of each kind to disk for inspection.
    samples_players = [
        (88284, "zoran-kurjaga"),
        (134238, "roko-antesic"),
        (1223263, "anel-husic"),
    ]
    samples_klubovi = [
        (1589, "nk-zamet"),
        (107150, "nk-pomorac"),
        (1574, "hnk-lovran"),
    ]

    print(f"# build_token = {get_buildid()}")
    print()

    print("## fetch_player benchmark")
    p_times: list[float] = []
    for pid, slug in samples_players:
        data, dt = _bench("player", fetch_player, pid, slug)
        p_times.append(dt)
        prof = data.get("profile", {})
        recap = data.get("season_recap", {})
        seasons = data.get("seasons", [])
        print(
            f"  - id={pid} slug={slug} -> {prof.get('name')} {prof.get('surname')}, "
            f"club={prof.get('current_club', {}).get('name')}, dob={prof.get('dob')}, "
            f"recap_season={recap.get('season')} apps={recap.get('apps')}, "
            f"seasons_count={len(seasons)}"
        )
    print(f"  avg={sum(p_times)/len(p_times):.1f} ms")
    print()

    print("## fetch_klub benchmark")
    k_times: list[float] = []
    for kid, slug in samples_klubovi:
        data, dt = _bench("klub", fetch_klub, kid, slug)
        k_times.append(dt)
        info = data.get("info", {})
        print(
            f"  - id={kid} slug={slug} -> {info.get('name')} | "
            f"founded={info.get('foundation_date')} | "
            f"address={info.get('address')} | "
            f"comps={len(data.get('competitions', []))}"
        )
    print(f"  avg={sum(k_times)/len(k_times):.1f} ms")
    print()

    print("## fetch_klub_roster benchmark")
    r_times: list[float] = []
    for kid, slug in samples_klubovi:
        roster, dt = _bench("roster", fetch_klub_roster, kid, slug)
        r_times.append(dt)
        print(f"  - id={kid} slug={slug} -> {len(roster)} players")
        for r in roster[:3]:
            print(f"      • {r['name']} (id={r['hns_igrac_id']}) apps={r.get('apps')} goals={r.get('goals')}")
    print(f"  avg={sum(r_times)/len(r_times):.1f} ms")

    # Dump first sample to disk for inspection.
    # The JSON keeps non-ASCII characters (ensure_ascii=False), so the files
    # are written as UTF-8 explicitly instead of relying on the locale's
    # default encoding.
    out_dir = Path("/opt/pgz-sport/cc_tasks/sub4")
    out_dir.mkdir(parents=True, exist_ok=True)
    pid0, slug0 = samples_players[0]
    sample = fetch_player(pid0, slug0)
    (out_dir / "sample_player.json").write_text(
        json.dumps(sample, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    kid0, kslug0 = samples_klubovi[0]
    ksample = fetch_klub(kid0, kslug0)
    (out_dir / "sample_klub.json").write_text(
        json.dumps(ksample, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    rsample = fetch_klub_roster(kid0, kslug0)
    (out_dir / "sample_roster.json").write_text(
        json.dumps(rsample, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print()
    print(f"# sample JSONs written to {out_dir}/")