Files
pgz-sport/scripts/hns_api_client.py
T
damir 360b8008ba Crisis V6: panel expand + klub matching + ne-klub filter + samo_klubovi default
DB:
- pgz_sport.potpore_nositelji.je_klub flag (false za RSS programs/savezi)
- Re-match klub_id case-insensitive trim normalize

Endpoint:
- /api/dashboard/top-primatelji: samo_klubovi=True default

Frontend:
- sport2.html #panel/#dpanel: 70vw / 1100px max-width za HNS karijera
- mobile responsive za panel
2026-05-05 14:09:47 +02:00

463 lines
18 KiB
Python

#!/usr/bin/env python3
# hns_api_client.py — HNS Semafor structured client (v1.0)
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date: 2026-05-05
# Description:
# Reverse-engineered client for https://semafor.hns.family.
# The site is server-rendered ASP.NET (NOT Next.js — no __NEXT_DATA__,
# no hydration JSON). The only XHR endpoints exposed are filter helpers
# /handlers/getOrganizations/, getCompetitions/, getAgeCategories/,
# getCalendarEvents/. All player & club data is rendered into stable
# semantic HTML (classes: playerHeader, playerData, playerCompetitionStatsTable,
# matchlist, clubHeader, basic_info, playerslist…).
#
# This module therefore implements a fast HTML→JSON parser using requests
# + BeautifulSoup, with a connection-pooled session, a browser-like UA, and
# an hourly-cached build token (page responses themselves are not cached).
# It exposes the SDK surface SUB4 was asked for:
# fetch_player(hns_id, slug) -> dict
# fetch_klub(hns_id, slug) -> dict
# fetch_klub_roster(hns_id, slug) -> list
# get_buildid() -> str (no buildId on this stack — returns site CSS hash)
#
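# Fallback chain inside _get_html():
# 1. requests via the shared session + browser-like UA (primary)
# 2. the same session retried with a linear back-off (403/503 and any other
#    failure); no Referer header is added — only cookies the session has
#    already collected are sent again
# 3. Playwright (lazy import) as the last resort once every requests attempt
#    has failed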
#
import json
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from functools import lru_cache
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
# Root URL of the HNS Semafor site; request paths are appended to it.
BASE = "https://semafor.hns.family"

# Browser-like User-Agent sent with every request (also handed to the
# Playwright fallback in _get_html).
UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Default headers merged into the shared session.
# NOTE: "Accept-Encoding" is intentionally not overridden here. Advertising
# "br" by hand is only safe when a Brotli decoder is installed; without one
# the body would come back undecoded. Leaving the header alone lets requests
# advertise exactly the encodings it is able to decode.
HEADERS = {
    "User-Agent": UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "hr,en-US;q=0.7,en;q=0.3",
    "Cache-Control": "no-cache",
}

# One module-wide session so connections and cookies are reused across calls.
_session = requests.Session()
_session.headers.update(HEADERS)
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def _txt(node, default: str = "") -> str:
    """Return the text of ``node`` with all whitespace runs collapsed.

    Args:
        node: Object exposing ``get_text(separator, strip=...)`` (a parsed
            HTML element in this module), or ``None``.
        default: Value handed back unchanged when ``node`` is ``None``.

    Returns:
        The node text with every whitespace run replaced by a single space
        and no leading/trailing whitespace, or ``default``.
    """
    if node is not None:
        raw = node.get_text(" ", strip=True)
        return re.sub(r"\s+", " ", raw).strip()
    return default
def _int(s: str | None, default: int | None = None) -> int | None:
if s is None:
return default
m = re.search(r"-?\d+", s.replace("\xa0", " "))
return int(m.group()) if m else default
def _href_id(a, pattern: str) -> int | None:
if a is None or not a.get("href"):
return None
m = re.search(pattern, a["href"])
return int(m.group(1)) if m else None
def _get_html(url: str, *, timeout: int = 20, retries: int = 2) -> str:
    """Download ``url`` and return the response body as text.

    The shared module session is tried up to ``retries + 1`` times. Between
    two attempts the function pauses ``1.0 + attempt`` seconds; there is no
    pause after the last attempt. When no attempt yields HTTP 200 with a
    non-empty body, the page is loaded in headless Chromium through
    Playwright, which is imported lazily so it stays an optional dependency.

    Args:
        url: Absolute URL to fetch.
        timeout: Timeout in seconds passed to ``session.get``.
        retries: Number of additional attempts after the first one.

    Returns:
        The page HTML.

    Raises:
        RuntimeError: If the Playwright fallback fails as well (including
            Playwright not being installed). The message carries the last
            error seen on the requests path when there was one.
    """
    last_exc: Exception | None = None
    for attempt in range(retries + 1):
        try:
            r = _session.get(url, timeout=timeout)
            if r.status_code == 200 and r.text:
                return r.text
            # Error statuses (403/503 included) raise here so the failure is
            # remembered for the final message. A 200 with an empty body does
            # not raise and simply falls through to the next attempt.
            r.raise_for_status()
        except Exception as e:  # pragma: no cover — network paths
            # Deliberately broad: any failure on this path must lead to a
            # retry and, eventually, to the browser fallback.
            last_exc = e
        if attempt < retries:
            # Linear back-off, but only when another attempt follows —
            # sleeping before the fallback would just add latency.
            time.sleep(1.0 + attempt)
    # Playwright fallback (last resort)
    try:
        from playwright.sync_api import sync_playwright  # type: ignore

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                return page.content()
            finally:
                # Close the browser even when navigation or content() fails.
                browser.close()
    except Exception as e:
        raise RuntimeError(f"_get_html failed for {url}: {last_exc or e}") from e
# ---------------------------------------------------------------------------
# buildId-equivalent — NOT a Next.js app. We surface a deterministic version
# token taken from the cached CSS asset hash; refreshes hourly.
# ---------------------------------------------------------------------------
@lru_cache(maxsize=1)
def _build_token_cached(hour_bucket: int) -> str:
    """Return the site's CSS version token, memoised per ``hour_bucket``.

    The home page is fetched and the ``v=`` query value of
    ``common.min.css`` is extracted. Because the cache holds a single entry
    keyed by ``hour_bucket``, a new bucket value triggers a fresh fetch.

    Args:
        hour_bucket: Cache key; also embedded in the fallback token.

    Returns:
        The version token, or ``"unknown-<hour_bucket>"`` when the fetch
        fails or the stylesheet reference is not found. The fallback is
        cached like any other result.
    """
    fallback = f"unknown-{hour_bucket}"
    try:
        home = _get_html(BASE + "/", timeout=15)
        found = re.search(r"common\.min\.css\?v=([A-Za-z0-9_\-]+)", home)
    except Exception:
        # Best effort: the token is metadata and must never break a fetch.
        return fallback
    return found.group(1) if found else fallback
def get_buildid() -> str:
    """Return the current site asset version hash (cached per clock hour).

    HNS Semafor is server-rendered ASP.NET, so it has no Next.js buildId;
    the CSS asset hash is exposed instead as an equivalent cache-busting
    token.
    """
    # Whole hours since the epoch: the value changes once per hour, which is
    # what invalidates the single-entry cache behind _build_token_cached.
    current_hour = int(time.time()) // 3600
    return _build_token_cached(current_hour)
# ---------------------------------------------------------------------------
# Player parser
# ---------------------------------------------------------------------------
def _parse_player_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Extract the profile header of a player page.

    Args:
        soup: Parsed player page.

    Returns:
        ``{}`` when the ``div.block.playerHeader`` element is missing,
        otherwise a dict with the keys ``name``, ``surname``,
        ``shirt_number``, ``photo_url``, ``current_club`` (``id``/``name``),
        ``dob``, ``age`` and ``place_of_birth``. Values that cannot be read
        are ``None``.
    """
    header = soup.select_one("div.block.playerHeader")
    if header is None:
        return {}

    def text_of(selector: str) -> str:
        # Normalised text of the first match inside the header ("" if none).
        return _txt(header.select_one(selector))

    photo = header.select_one(".photo img")
    photo_url = None
    if photo is not None and photo.get("src"):
        photo_url = photo["src"]

    club_link = header.select_one(".playerData li.club a")

    # The date-of-birth cell is matched against "dd.mm.yyyy" at its start
    # and, independently, against a parenthesised "(<n> godina" age.
    dob_text = text_of(".playerData li.dob h4")
    dob_match = re.match(r"(\d{2}\.\d{2}\.\d{4})", dob_text)
    age_match = re.search(r"\((\d+)\s+godina", dob_text)

    return {
        "name": text_of(".playerName .name") or None,
        "surname": text_of(".playerName .surname") or None,
        "shirt_number": _int(text_of(".playerName h3")),
        "photo_url": photo_url,
        "current_club": {
            "id": _href_id(club_link, r"/klubovi/(\d+)/"),
            "name": text_of(".playerData li.club h4") or None,
        },
        "dob": dob_match.group(1) if dob_match else None,
        "age": int(age_match.group(1)) if age_match else None,
        "place_of_birth": text_of(".playerData li.pob h4") or None,
    }
def _parse_season_recap(soup: BeautifulSoup) -> dict[str, Any]:
    """Extract the current-season summary block of a player page.

    Each ``<li>`` of the recap list contributes one entry: its first CSS
    class is the key and the integer found in its ``<h4>`` is the value.

    Args:
        soup: Parsed player page.

    Returns:
        ``{}`` when ``div.player_season_stats_recap`` is missing, otherwise a
        dict holding ``"season"`` (the block's ``<h2>`` text or ``None``)
        plus one integer-or-``None`` entry per classed list item.
    """
    block = soup.select_one("div.player_season_stats_recap")
    if block is None:
        return {}
    out: dict[str, Any] = {"season": _txt(block.select_one("h2")) or None}
    for li in block.select("ul > li"):
        classes = li.get("class") or []
        if not classes:
            # Without a class there is no stat name. Storing the value under
            # a None key would serialise as "null" and break sorted dumps.
            continue
        out[classes[0]] = _int(_txt(li.select_one("h4")))
    return out
def _parse_player_seasons(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Parse the per-season statistics of a player page.

    Season headings (``h2.seasonTitle``) are paired by position with the
    ``div.tabbedContent`` blocks inside ``div.player_profile_matches``. A
    heading without a matching block still yields an entry, with empty
    lists.

    Args:
        soup: Parsed player page.

    Returns:
        One dict per season heading with the keys ``season``,
        ``competitions`` (a list of ``{"rows": [...]}``, one per non-empty
        stats table) and ``matches``. Empty list when the container is
        missing.
    """
    container = soup.select_one("div.player_profile_matches")
    if container is None:
        return []

    def is_header(row) -> bool:
        # Header rows carry the "header" class and hold column captions.
        return "header" in (row.get("class") or [])

    def stat(row, selector: str) -> int | None:
        return _int(_txt(row.select_one(selector)))

    def competition_row(row) -> dict[str, Any]:
        link = row.select_one(".title a")
        return {
            "competition": _txt(row.select_one(".title")) or None,
            "competition_url": link["href"] if link and link.get("href") else None,
            "competition_id": _href_id(link, r"/natjecanja/(\d+)/"),
            "apps": stat(row, ".apps"),
            "starter": stat(row, ".starter"),
            "sub": stat(row, ".sub"),
            "minutes": stat(row, ".minutes"),
            "goals": stat(row, ".goals"),
            "yellows": stat(row, ".yellows"),
            "reds": stat(row, ".reds"),
        }

    tabs = container.select("div.tabbedContent")
    seasons: list[dict[str, Any]] = []
    for idx, heading in enumerate(container.select("h2.seasonTitle")):
        competitions: list[dict[str, Any]] = []
        matches: list[dict[str, Any]] = []
        if idx < len(tabs):
            tab = tabs[idx]
            # competitions stats tables
            for table in tab.select("div.stats_table"):
                rows = [
                    competition_row(row)
                    for row in table.select("ul > li.row")
                    if not is_header(row)
                ]
                if rows:
                    competitions.append({"rows": rows})
            # match lists
            for match_list in tab.select("div.matchlist"):
                for row in match_list.select("li.row, div.row"):
                    if is_header(row):
                        continue
                    played_on = _txt(row.select_one(".date"))
                    home = _txt(row.select_one(".club1"))
                    away = _txt(row.select_one(".club2"))
                    score = _txt(row.select_one(".result"))
                    round_label = _txt(row.select_one(".competitionround"))
                    # Rows without date, teams and result carry no match.
                    if not any((played_on, home, away, score)):
                        continue
                    matches.append(
                        {
                            "date": played_on or None,
                            "home": home or None,
                            "away": away or None,
                            "result": score or None,
                            "competition_round": round_label or None,
                        }
                    )
        seasons.append(
            {
                "season": _txt(heading),
                "competitions": competitions,
                "matches": matches,
            }
        )
    return seasons
def fetch_player(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a player profile from semafor.hns.family as structured data.

    Args:
        hns_id: Numeric player id used in the ``/igraci/<id>/<slug>/`` URL.
        slug: URL slug of the player; surrounding slashes are ignored.

    Returns:
        A dict with request metadata (``source_url``, ``hns_igrac_id``,
        ``slug``, ``build_token``, ``fetched_at``, ``fetch_ms``) and the
        parsed sections ``profile``, ``season_recap`` and ``seasons``.

    Raises:
        RuntimeError: Propagated from ``_get_html`` when the page cannot be
            downloaded.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/igraci/{hns_id}/{clean_slug}/"
    t0 = time.perf_counter()
    html = _get_html(url)
    # Take both readings right after the download so that fetch_ms covers
    # only this page, not the build-token lookup below (which may itself
    # issue an HTTP request when its hourly cache is cold).
    fetch_ms = int((time.perf_counter() - t0) * 1000)
    fetched_at = int(time.time())
    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_igrac_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": fetched_at,
        "fetch_ms": fetch_ms,
        "profile": _parse_player_header(soup),
        "season_recap": _parse_season_recap(soup),
        "seasons": _parse_player_seasons(soup),
    }
# ---------------------------------------------------------------------------
# Klub parser
# ---------------------------------------------------------------------------
def _parse_klub_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Extract the header block of a club page.

    Besides the fixed fields, every ``<li>`` of the header's info list adds
    one entry: its first CSS class is the key and the text of its ``<h3>``
    (or ``None`` when empty) is the value.

    Args:
        soup: Parsed club page.

    Returns:
        ``{}`` when ``div.block.clubHeader`` is missing, otherwise a dict
        with ``name``, ``full_name``, ``logo_url`` and the info-list entries.
    """
    hdr = soup.select_one("div.block.clubHeader")
    if hdr is None:
        return {}
    img = hdr.select_one(".logo img")
    info: dict[str, Any] = {
        "name": _txt(hdr.select_one(".title h1")) or None,
        "full_name": _txt(hdr.select_one(".title h2")) or None,
        "logo_url": img["src"] if img and img.get("src") else None,
    }
    for li in hdr.select(".info ul > li"):
        classes = li.get("class") or []
        if not classes:
            # Without a class there is no field name. Storing the value
            # under a None key would serialise as "null" and break sorted
            # dumps.
            continue
        info[classes[0]] = _txt(li.select_one("h3")) or None
    return info
def fetch_klub(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a club profile (header + meta) from semafor.hns.family.

    Args:
        hns_id: Numeric club id used in the ``/klubovi/<id>/<slug>/`` URL.
        slug: URL slug of the club; surrounding slashes are ignored.

    Returns:
        A dict with request metadata (``source_url``, ``hns_klub_id``,
        ``slug``, ``build_token``, ``fetched_at``, ``fetch_ms``) and the
        parsed sections ``info``, ``competitions`` and ``next_match``.

    Raises:
        RuntimeError: Propagated from ``_get_html`` when the page cannot be
            downloaded.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/klubovi/{hns_id}/{clean_slug}/"
    t0 = time.perf_counter()
    html = _get_html(url)
    # Take both readings right after the download so that fetch_ms covers
    # only this page, not the build-token lookup below (which may itself
    # issue an HTTP request when its hourly cache is cold).
    fetch_ms = int((time.perf_counter() - t0) * 1000)
    fetched_at = int(time.time())
    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_klub_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": fetched_at,
        "fetch_ms": fetch_ms,
        "info": _parse_klub_header(soup),
        "competitions": _parse_klub_competitions(soup),
        "next_match": _parse_next_match(soup),
    }
def _parse_klub_competitions(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """List the competitions shown in a club page's competition tables.

    Args:
        soup: Parsed club page.

    Returns:
        One ``{"competition", "competition_id"}`` dict per non-header row of
        every ``div.competition_table``, in document order. The label is the
        row's ``.title`` text, falling back to the text of its first link.
    """
    competitions: list[dict[str, Any]] = []
    for table in soup.select("div.competition_table"):
        for entry in table.select("li.row, div.row"):
            if "header" in (entry.get("class") or []):
                continue
            link = entry.select_one("a")
            label = _txt(entry.select_one(".title")) or _txt(link) or None
            competitions.append(
                {
                    "competition": label,
                    "competition_id": _href_id(link, r"/natjecanja/(\d+)/"),
                }
            )
    return competitions
def _parse_next_match(soup: BeautifulSoup) -> dict[str, Any] | None:
    """Return the raw text of the club page's scoreboard block.

    Args:
        soup: Parsed club page.

    Returns:
        ``None`` when neither ``div.scoreboard_next_previous`` nor
        ``div.current_results`` is present; otherwise ``{"raw_text": ...}``
        holding at most the first 400 characters of the block's text
        (``None`` when the block has no text).
    """
    box = soup.select_one("div.scoreboard_next_previous, div.current_results")
    if box is None:
        return None
    snippet = _txt(box)[:400]
    return {"raw_text": snippet if snippet else None}
# ---------------------------------------------------------------------------
# Roster parser — the “Igrači” tab (server-rendered with the page)
# ---------------------------------------------------------------------------
def fetch_klub_roster(hns_id: int, slug: str) -> list[dict[str, Any]]:
    """Return the players listed in the ``div.playerslist`` blocks of a club page.

    Args:
        hns_id: Numeric club id used in the ``/klubovi/<id>/<slug>/`` URL.
        slug: URL slug of the club; surrounding slashes are ignored.

    Returns:
        One dict per distinct player id, in document order, with the keys
        ``hns_igrac_id``, ``slug``, ``name``, ``url``, ``apps``, ``minutes``,
        ``goals``, ``yellows`` and ``reds``.

    Raises:
        RuntimeError: Propagated from ``_get_html`` when the page cannot be
            downloaded.
    """
    url = f"{BASE}/klubovi/{hns_id}/{slug.strip('/')}/"
    soup = BeautifulSoup(_get_html(url), "html.parser")
    player_link = re.compile(r"/igraci/\d+/")
    players: list[dict[str, Any]] = []
    seen_ids: set[int] = set()
    for block in soup.select("div.playerslist"):
        for row in block.select("li.row, div.row, tr"):
            if "header" in (row.get("class") or []):
                continue
            link = row.find("a", href=player_link)
            if link is None:
                continue
            player_id = _href_id(link, r"/igraci/(\d+)/")
            # The same player can be reached through several matching rows;
            # only the first occurrence is kept.
            if player_id is None or player_id in seen_ids:
                continue
            seen_ids.add(player_id)
            href = link["href"]
            slug_match = re.search(r"/igraci/\d+/([^/]+)/", href)
            entry: dict[str, Any] = {
                "hns_igrac_id": player_id,
                "slug": slug_match.group(1) if slug_match else None,
                "name": _txt(link) or None,
                # Site-relative links are made absolute; others pass through.
                "url": BASE + href if href.startswith("/") else href,
            }
            for stat in ("apps", "minutes", "goals", "yellows", "reds"):
                entry[stat] = _int(_txt(row.select_one(f".{stat}")))
            players.append(entry)
    return players
# ---------------------------------------------------------------------------
# CLI / smoke test
# ---------------------------------------------------------------------------
def _bench(label: str, fn, *args):
    """Call ``fn(*args)``, report the elapsed time on stderr and return both.

    Args:
        label: Tag printed in front of the timing line.
        fn: Callable to measure.
        *args: Positional arguments forwarded to ``fn``.

    Returns:
        A ``(result, elapsed_ms)`` tuple, where ``elapsed_ms`` is a float.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or turn negative) by wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    out = fn(*args)
    dt = (time.perf_counter() - t0) * 1000
    print(f"[{label}] {dt:6.1f} ms args={args}", file=sys.stderr)
    return out, dt
def _slugify(name: str) -> str:
    """Turn a display name into a lowercase, dash-separated ASCII slug.

    The Croatian letters č, ć, ž, š and đ are mapped to c, c, z, s and d;
    every remaining run of characters outside ``a-z0-9`` becomes a single
    dash, and dashes at either end are removed.
    """
    ascii_for = {"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"}
    folded = "".join(ascii_for.get(ch, ch) for ch in name.lower())
    return re.sub(r"[^a-z0-9]+", "-", folded).strip("-")
if __name__ == "__main__":
    # Smoke test / benchmark: fetch a few known players and clubs over the
    # network, print a one-line summary and timing for each, then dump one
    # sample of every payload type to disk for inspection.
    samples_players = [
        (88284, "zoran-kurjaga"),
        (134238, "roko-antesic"),
        (1223263, "anel-husic"),
    ]
    samples_klubovi = [
        (1589, "nk-zamet"),
        (107150, "nk-pomorac"),
        (1574, "hnk-lovran"),
    ]
    print(f"# build_token = {get_buildid()}")
    print()
    print("## fetch_player benchmark")
    p_times: list[float] = []
    for pid, slug in samples_players:
        data, dt = _bench("player", fetch_player, pid, slug)
        p_times.append(dt)
        prof = data.get("profile", {})
        recap = data.get("season_recap", {})
        seasons = data.get("seasons", [])
        print(
            f" - id={pid} slug={slug} -> {prof.get('name')} {prof.get('surname')}, "
            f"club={prof.get('current_club', {}).get('name')}, dob={prof.get('dob')}, "
            f"recap_season={recap.get('season')} apps={recap.get('apps')}, "
            f"seasons_count={len(seasons)}"
        )
    print(f" avg={sum(p_times)/len(p_times):.1f} ms")
    print()
    print("## fetch_klub benchmark")
    k_times: list[float] = []
    for kid, slug in samples_klubovi:
        data, dt = _bench("klub", fetch_klub, kid, slug)
        k_times.append(dt)
        info = data.get("info", {})
        print(
            f" - id={kid} slug={slug} -> {info.get('name')} | "
            f"founded={info.get('foundation_date')} | "
            f"address={info.get('address')} | "
            f"comps={len(data.get('competitions', []))}"
        )
    print(f" avg={sum(k_times)/len(k_times):.1f} ms")
    print()
    print("## fetch_klub_roster benchmark")
    r_times: list[float] = []
    for kid, slug in samples_klubovi:
        roster, dt = _bench("roster", fetch_klub_roster, kid, slug)
        r_times.append(dt)
        print(f" - id={kid} slug={slug} -> {len(roster)} players")
        for r in roster[:3]:
            print(f"{r['name']} (id={r['hns_igrac_id']}) apps={r.get('apps')} goals={r.get('goals')}")
    print(f" avg={sum(r_times)/len(r_times):.1f} ms")
    # Dump first sample to disk for inspection
    out_dir = Path("/opt/pgz-sport/cc_tasks/sub4")
    out_dir.mkdir(parents=True, exist_ok=True)
    # The JSON is written with ensure_ascii=False, so it contains raw
    # Croatian diacritics. Pin the file encoding to UTF-8 instead of relying
    # on the locale default, which may be unable to encode them.
    pid0, slug0 = samples_players[0]
    sample = fetch_player(pid0, slug0)
    (out_dir / "sample_player.json").write_text(
        json.dumps(sample, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    kid0, kslug0 = samples_klubovi[0]
    ksample = fetch_klub(kid0, kslug0)
    (out_dir / "sample_klub.json").write_text(
        json.dumps(ksample, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    rsample = fetch_klub_roster(kid0, kslug0)
    (out_dir / "sample_roster.json").write_text(
        json.dumps(rsample, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print()
    print(f"# sample JSONs written to {out_dir}/")