#!/usr/bin/env python3
# hns_api_client.py — HNS Semafor structured client (v1.0)
# Author: Damir Radulić /
# Date: 2026-05-05
# Description:
#   Reverse-engineered client for https://semafor.hns.family.
#   The site is server-rendered ASP.NET (NOT Next.js — no __NEXT_DATA__,
#   no hydration JSON). The only XHR endpoints exposed are filter helpers
#   /handlers/getOrganizations/, getCompetitions/, getAgeCategories/,
#   getCalendarEvents/. All player & club data is rendered into stable
#   semantic HTML (classes: playerHeader, playerData, playerCompetitionStatsTable,
#   matchlist, clubHeader, basic_info, playerslist…).
#
#   This module therefore implements a fast HTML→JSON parser using requests
#   + BeautifulSoup, with a connection-pooled session, a browser-like UA, and
#   an hourly cache for the build token (pages themselves are not cached).
#   It exposes the SDK surface SUB4 was asked for:
#       fetch_player(hns_id, slug)       -> dict
#       fetch_klub(hns_id, slug)         -> dict
#       fetch_klub_roster(hns_id, slug)  -> list
#       get_buildid()                    -> str  (no buildId on this stack —
#                                                 returns site CSS hash)
#
#   Fallback chain inside _get_html():
#     1. requests + browser-like UA (primary)
#     2. the same request retried with a short back-off (on 403/503 or on a
#        raised error); the shared session is reused between attempts
#     3. Playwright (lazy import) for JS-only edge cases
#
import json
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from functools import lru_cache
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

BASE = "https://semafor.hns.family"

UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# NOTE: "Accept-Encoding" is deliberately NOT overridden here. Advertising
# "br" without a Brotli decoder installed can produce an undecodable body;
# leaving the key out keeps the session's own default Accept-Encoding.
HEADERS = {
    "User-Agent": UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "hr,en-US;q=0.7,en;q=0.3",
    "Cache-Control": "no-cache",
}

# One shared session for every request made by this module.
_session = requests.Session()
_session.headers.update(HEADERS)


# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def _txt(node, default: str = "") -> str:
    """Return the whitespace-normalised text of *node*, or *default* if it is None."""
    if node is None:
        return default
    return re.sub(r"\s+", " ", node.get_text(" ", strip=True)).strip()


def _int(s: str | None, default: int | None = None) -> int | None:
    """Return the first (optionally negative) integer found in *s*, else *default*."""
    if s is None:
        return default
    m = re.search(r"-?\d+", s.replace("\xa0", " "))
    return int(m.group()) if m else default


def _href_id(a, pattern: str) -> int | None:
    """Extract a numeric id from the ``href`` of anchor *a*.

    *pattern* is a regex whose first capture group holds the digits.
    Returns None when *a* is None, has no href, or the pattern does not match.
    """
    if a is None or not a.get("href"):
        return None
    m = re.search(pattern, a["href"])
    return int(m.group(1)) if m else None


def _first_class(node) -> str | None:
    """Return the first CSS class of *node*, or None when it has no class."""
    classes = node.get("class") or []
    return classes[0] if classes else None


def _is_header_row(row) -> bool:
    """Return True when *row* carries the ``header`` CSS class."""
    return "header" in (row.get("class") or [])


def _get_html(url: str, *, timeout: int = 20, retries: int = 2) -> str:
    """Fetch *url* and return its HTML.

    Tries the shared requests session up to ``retries + 1`` times, then falls
    back to a headless Chromium via Playwright (imported lazily).

    Raises:
        RuntimeError: when the Playwright fallback also fails (including when
            Playwright is not installed). The message carries the last error
            seen from the requests attempts, if there was one.
    """
    last_exc: Exception | None = None
    for attempt in range(retries + 1):
        try:
            r = _session.get(url, timeout=timeout)
            if r.status_code == 200 and r.text:
                return r.text
            if r.status_code in (403, 503) and attempt < retries:
                time.sleep(1.0 + attempt)
                continue
            r.raise_for_status()
        except Exception as e:  # pragma: no cover — network paths
            last_exc = e
            # Back off only if another attempt follows; sleeping before the
            # Playwright fallback would just add latency.
            if attempt < retries:
                time.sleep(1.0 + attempt)

    # Playwright fallback (last resort)
    try:
        from playwright.sync_api import sync_playwright  # type: ignore

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                return page.content()
            finally:
                # Close the browser even when navigation fails.
                browser.close()
    except Exception as e:
        raise RuntimeError(f"_get_html failed for {url}: {last_exc or e}") from e


# ---------------------------------------------------------------------------
# buildId-equivalent — NOT a Next.js app. We surface a deterministic version
# token taken from the cached CSS asset hash; refreshes hourly.
# ---------------------------------------------------------------------------
@lru_cache(maxsize=1)
def _build_token_cached(hour_bucket: int) -> str:
    """Return the ``common.min.css`` version token from the home page.

    *hour_bucket* is only a cache key: a new bucket value evicts the single
    cached entry and triggers a fresh fetch. Falls back to
    ``unknown-<hour_bucket>`` when the page cannot be fetched or the token is
    not present (best effort — this never raises).
    """
    try:
        html = _get_html(BASE + "/", timeout=15)
        m = re.search(r"common\.min\.css\?v=([A-Za-z0-9_\-]+)", html)
        return m.group(1) if m else f"unknown-{hour_bucket}"
    except Exception:
        return f"unknown-{hour_bucket}"


def get_buildid() -> str:
    """Returns the current site asset version hash (hourly-cached).

    HNS Semafor is server-rendered ASP.NET, so there is no Next.js buildId.
    We surface the CSS asset hash as an equivalent cache-busting token."""
    return _build_token_cached(int(time.time()) // 3600)


# ---------------------------------------------------------------------------
# Player parser
# ---------------------------------------------------------------------------
def _parse_player_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Parse the ``div.block.playerHeader`` block into a profile dict.

    Returns an empty dict when the block is absent. Missing fields are None.
    """
    hdr = soup.select_one("div.block.playerHeader")
    if hdr is None:
        return {}

    name = _txt(hdr.select_one(".playerName .name"))
    surname = _txt(hdr.select_one(".playerName .surname"))
    shirt = _txt(hdr.select_one(".playerName h3"))

    img = hdr.select_one(".photo img")
    photo_url = img["src"] if img is not None and img.get("src") else None

    club_a = hdr.select_one(".playerData li.club a")
    club_name = _txt(hdr.select_one(".playerData li.club h4"))
    club_id = _href_id(club_a, r"/klubovi/(\d+)/")

    # The dob cell is matched as "dd.mm.yyyy" optionally followed by
    # "(<n> godina ...)"; both parts are extracted independently.
    dob_text = _txt(hdr.select_one(".playerData li.dob h4"))
    dob = None
    age = None
    m = re.match(r"(\d{2}\.\d{2}\.\d{4})", dob_text)
    if m:
        dob = m.group(1)
    m_age = re.search(r"\((\d+)\s+godina", dob_text)
    if m_age:
        age = int(m_age.group(1))

    pob = _txt(hdr.select_one(".playerData li.pob h4")) or None

    return {
        "name": name or None,
        "surname": surname or None,
        "shirt_number": _int(shirt),
        "photo_url": photo_url,
        "current_club": {"id": club_id, "name": club_name or None},
        "dob": dob,
        "age": age,
        "place_of_birth": pob,
    }


def _parse_season_recap(soup: BeautifulSoup) -> dict[str, Any]:
    """Parse ``div.player_season_stats_recap`` into ``{"season": ..., <class>: int}``.

    Each ``ul > li`` contributes one entry keyed by its first CSS class, with
    the integer found in its ``h4``. Items without a class are skipped so the
    result never contains a ``None`` key. Returns {} when the block is absent.
    """
    block = soup.select_one("div.player_season_stats_recap")
    if block is None:
        return {}

    out: dict[str, Any] = {"season": _txt(block.select_one("h2")) or None}
    for li in block.select("ul > li"):
        key = _first_class(li)
        if key is None:
            continue
        out[key] = _int(_txt(li.select_one("h4")))
    return out


def _parse_competition_stats_row(row) -> dict[str, Any]:
    """Turn one non-header row of a ``div.stats_table`` into a dict."""
    title_a = row.select_one(".title a")
    has_href = title_a is not None and title_a.get("href")
    return {
        "competition": _txt(row.select_one(".title")) or None,
        "competition_url": title_a["href"] if has_href else None,
        "competition_id": _href_id(title_a, r"/natjecanja/(\d+)/"),
        "apps": _int(_txt(row.select_one(".apps"))),
        "starter": _int(_txt(row.select_one(".starter"))),
        "sub": _int(_txt(row.select_one(".sub"))),
        "minutes": _int(_txt(row.select_one(".minutes"))),
        "goals": _int(_txt(row.select_one(".goals"))),
        "yellows": _int(_txt(row.select_one(".yellows"))),
        "reds": _int(_txt(row.select_one(".reds"))),
    }


def _parse_match_row(row) -> dict[str, Any] | None:
    """Turn one non-header ``div.matchlist`` row into a dict.

    Returns None for rows with no date, clubs or result (treated as filler).
    """
    date = _txt(row.select_one(".date"))
    c1 = _txt(row.select_one(".club1"))
    c2 = _txt(row.select_one(".club2"))
    res = _txt(row.select_one(".result"))
    comp = _txt(row.select_one(".competitionround"))
    if not (date or c1 or c2 or res):
        return None
    return {
        "date": date or None,
        "home": c1 or None,
        "away": c2 or None,
        "result": res or None,
        "competition_round": comp or None,
    }


def _parse_player_seasons(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Per-season statistics: each season has a stats table + matches table.

    Season titles (``h2.seasonTitle``) and content blocks
    (``div.tabbedContent``) inside ``div.player_profile_matches`` are paired
    by position; a title without a matching block yields empty lists.
    """
    seasons: list[dict[str, Any]] = []
    container = soup.select_one("div.player_profile_matches")
    if container is None:
        return seasons

    titles = container.select("h2.seasonTitle")
    tabbed = container.select("div.tabbedContent")

    for i, h2 in enumerate(titles):
        tab = tabbed[i] if i < len(tabbed) else None
        comps: list[dict[str, Any]] = []
        matches: list[dict[str, Any]] = []

        if tab is not None:
            # competitions stats table
            for st in tab.select("div.stats_table"):
                rows = [
                    _parse_competition_stats_row(row)
                    for row in st.select("ul > li.row")
                    if not _is_header_row(row)
                ]
                if rows:
                    comps.append({"rows": rows})

            # matches list
            for ml in tab.select("div.matchlist"):
                for row in ml.select("li.row, div.row"):
                    if _is_header_row(row):
                        continue
                    match = _parse_match_row(row)
                    if match is not None:
                        matches.append(match)

        seasons.append(
            {"season": _txt(h2), "competitions": comps, "matches": matches}
        )
    return seasons


def fetch_player(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a player profile from semafor.hns.family and return structured JSON.

    ``fetch_ms`` covers only the download of the player page; it excludes
    parsing and the build-token lookup.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            fetched by any strategy.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/igraci/{hns_id}/{clean_slug}/"

    t0 = time.time()
    html = _get_html(url)
    fetch_ms = int((time.time() - t0) * 1000)

    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_igrac_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": int(time.time()),
        "fetch_ms": fetch_ms,
        "profile": _parse_player_header(soup),
        "season_recap": _parse_season_recap(soup),
        "seasons": _parse_player_seasons(soup),
    }


# ---------------------------------------------------------------------------
# Klub parser
# ---------------------------------------------------------------------------
def _parse_klub_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Parse ``div.block.clubHeader`` into a club info dict.

    Besides ``name`` / ``full_name`` / ``logo_url``, every ``.info ul > li``
    adds an entry keyed by its first CSS class with the text of its ``h3``.
    Items without a class are skipped so the result never contains a ``None``
    key. Returns {} when the block is absent.
    """
    hdr = soup.select_one("div.block.clubHeader")
    if hdr is None:
        return {}

    name = _txt(hdr.select_one(".title h1"))
    long_name = _txt(hdr.select_one(".title h2"))
    img = hdr.select_one(".logo img")

    info: dict[str, Any] = {
        "name": name or None,
        "full_name": long_name or None,
        "logo_url": img["src"] if img is not None and img.get("src") else None,
    }
    for li in hdr.select(".info ul > li"):
        key = _first_class(li)
        if key is None:
            continue
        info[key] = _txt(li.select_one("h3")) or None
    return info


def fetch_klub(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a club profile (header + meta) from semafor.hns.family.

    ``fetch_ms`` covers only the download of the club page; it excludes
    parsing and the build-token lookup.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            fetched by any strategy.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/klubovi/{hns_id}/{clean_slug}/"

    t0 = time.time()
    html = _get_html(url)
    fetch_ms = int((time.time() - t0) * 1000)

    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_klub_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": int(time.time()),
        "fetch_ms": fetch_ms,
        "info": _parse_klub_header(soup),
        "competitions": _parse_klub_competitions(soup),
        "next_match": _parse_next_match(soup),
    }


def _parse_klub_competitions(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Collect competition name/id pairs from every ``div.competition_table``.

    The name is the row's ``.title`` text, falling back to the text of the
    first anchor in the row. Header rows are skipped.
    """
    out: list[dict[str, Any]] = []
    for tbl in soup.select("div.competition_table"):
        for li in tbl.select("li.row, div.row"):
            if _is_header_row(li):
                continue
            comp_a = li.select_one("a")
            out.append(
                {
                    "competition": _txt(li.select_one(".title"))
                    or _txt(comp_a)
                    or None,
                    "competition_id": _href_id(comp_a, r"/natjecanja/(\d+)/"),
                }
            )
    return out


def _parse_next_match(soup: BeautifulSoup) -> dict[str, Any] | None:
    """Return the raw text (first 400 chars) of the scoreboard block, if any.

    Looks for ``div.scoreboard_next_previous`` or ``div.current_results``;
    returns None when neither is present.
    """
    nm = soup.select_one("div.scoreboard_next_previous, div.current_results")
    if nm is None:
        return None
    return {"raw_text": _txt(nm)[:400] or None}


# ---------------------------------------------------------------------------
# Roster parser — the “Igrači” tab (server-rendered with the page)
# ---------------------------------------------------------------------------
def fetch_klub_roster(hns_id: int, slug: str) -> list[dict[str, Any]]:
    """Return the players listed in the club page's ``div.playerslist`` blocks.

    Each player appears once (de-duplicated by player id, first occurrence
    wins). Rows without a ``/igraci/<id>/`` link and header rows are skipped.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            fetched by any strategy.
    """
    url = f"{BASE}/klubovi/{hns_id}/{slug.strip('/')}/"
    html = _get_html(url)
    soup = BeautifulSoup(html, "html.parser")

    player_link = re.compile(r"/igraci/\d+/")
    roster: list[dict[str, Any]] = []
    seen: set[int] = set()

    for block in soup.select("div.playerslist"):
        for row in block.select("li.row, div.row, tr"):
            if _is_header_row(row):
                continue
            a = row.find("a", href=player_link)
            if a is None:
                continue
            pid = _href_id(a, r"/igraci/(\d+)/")
            if pid is None or pid in seen:
                continue
            seen.add(pid)

            href = a["href"]
            slug_m = re.search(r"/igraci/\d+/([^/]+)/", href)
            roster.append(
                {
                    "hns_igrac_id": pid,
                    "slug": slug_m.group(1) if slug_m else None,
                    "name": _txt(a) or None,
                    # Site-relative links are made absolute; others kept as-is.
                    "url": BASE + href if href.startswith("/") else href,
                    "apps": _int(_txt(row.select_one(".apps"))),
                    "minutes": _int(_txt(row.select_one(".minutes"))),
                    "goals": _int(_txt(row.select_one(".goals"))),
                    "yellows": _int(_txt(row.select_one(".yellows"))),
                    "reds": _int(_txt(row.select_one(".reds"))),
                }
            )
    return roster


# ---------------------------------------------------------------------------
# CLI / smoke test
# ---------------------------------------------------------------------------
def _bench(label: str, fn, *args):
    """Call ``fn(*args)``, log the elapsed time to stderr, return ``(result, ms)``."""
    t0 = time.time()
    out = fn(*args)
    dt = (time.time() - t0) * 1000
    print(f"[{label}] {dt:6.1f} ms args={args}", file=sys.stderr)
    return out, dt


# Croatian diacritics → ASCII, applied after lower-casing.
_SLUG_TRANSLATION = str.maketrans({"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"})


def _slugify(name: str) -> str:
    """Lower-case *name*, fold Croatian diacritics and hyphenate the rest.

    Every run of characters outside ``a-z0-9`` becomes a single ``-``;
    leading and trailing hyphens are stripped.
    """
    name = name.lower().translate(_SLUG_TRANSLATION)
    return re.sub(r"[^a-z0-9]+", "-", name).strip("-")


def main() -> None:
    """Smoke test: benchmark the three fetchers and dump one sample of each.

    Hits the live site, prints a summary to stdout (timings to stderr) and
    writes ``sample_player.json``, ``sample_klub.json`` and
    ``sample_roster.json`` into the hard-coded output directory.
    """
    samples_players = [
        (88284, "zoran-kurjaga"),
        (134238, "roko-antesic"),
        (1223263, "anel-husic"),
    ]
    samples_klubovi = [
        (1589, "nk-zamet"),
        (107150, "nk-pomorac"),
        (1574, "hnk-lovran"),
    ]

    print(f"# build_token = {get_buildid()}")
    print()

    print("## fetch_player benchmark")
    p_times: list[float] = []
    p_results: list[dict[str, Any]] = []
    for pid, slug in samples_players:
        data, dt = _bench("player", fetch_player, pid, slug)
        p_times.append(dt)
        p_results.append(data)
        prof = data.get("profile", {})
        recap = data.get("season_recap", {})
        seasons = data.get("seasons", [])
        print(
            f" - id={pid} slug={slug} -> {prof.get('name')} {prof.get('surname')}, "
            f"club={prof.get('current_club', {}).get('name')}, dob={prof.get('dob')}, "
            f"recap_season={recap.get('season')} apps={recap.get('apps')}, "
            f"seasons_count={len(seasons)}"
        )
    print(f" avg={sum(p_times)/len(p_times):.1f} ms")
    print()

    print("## fetch_klub benchmark")
    k_times: list[float] = []
    k_results: list[dict[str, Any]] = []
    for kid, slug in samples_klubovi:
        data, dt = _bench("klub", fetch_klub, kid, slug)
        k_times.append(dt)
        k_results.append(data)
        info = data.get("info", {})
        print(
            f" - id={kid} slug={slug} -> {info.get('name')} | "
            f"founded={info.get('foundation_date')} | "
            f"address={info.get('address')} | "
            f"comps={len(data.get('competitions', []))}"
        )
    print(f" avg={sum(k_times)/len(k_times):.1f} ms")
    print()

    print("## fetch_klub_roster benchmark")
    r_times: list[float] = []
    r_results: list[list[dict[str, Any]]] = []
    for kid, slug in samples_klubovi:
        roster, dt = _bench("roster", fetch_klub_roster, kid, slug)
        r_times.append(dt)
        r_results.append(roster)
        print(f" - id={kid} slug={slug} -> {len(roster)} players")
        for r in roster[:3]:
            print(
                f" • {r['name']} (id={r['hns_igrac_id']}) "
                f"apps={r.get('apps')} goals={r.get('goals')}"
            )
    print(f" avg={sum(r_times)/len(r_times):.1f} ms")

    # Dump first sample to disk for inspection. The benchmark results are
    # reused so the site is not queried a second time for the same pages.
    out_dir = Path("/opt/pgz-sport/cc_tasks/sub4")
    out_dir.mkdir(parents=True, exist_ok=True)
    dumps = {
        "sample_player.json": p_results[0],
        "sample_klub.json": k_results[0],
        "sample_roster.json": r_results[0],
    }
    for filename, payload in dumps.items():
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be pinned rather than left to the locale default.
        (out_dir / filename).write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    print()
    print(f"# sample JSONs written to {out_dir}/")


if __name__ == "__main__":
    main()