#!/usr/bin/env python3
# hks_basketball.py — HKS-CBF + FIBA LiveStats košarka harvester
# v2.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Harvests rosters + per-match player stats for PGŽ priority basketball clubs.
# Path: HKS search (?s=naziv) → match recap articles → FIBA LiveStats matchid →
#   https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json
#   (public JSON boxscore) → upsert clanovi + clan_kategorije + player_stats.
import sys
import re
import time
import json
import urllib.parse

import requests

sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester

UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"

# HKS URL section slug → category label stored with players and stats.
LIGA_SECTIONS = {
    "supersport-premijer": "Seniori (Premijer)",
    "prva-muska-liga": "Seniori (1.HML)",
    "druga-muska-liga": "Seniori (2.HML)",
    "premijer-zenska-liga": "Seniorke (Premijer)",
    "prva-zenska-liga": "Seniorke (1.ŽKL)",
    "druga-zenska-liga": "Seniorke (2.ŽKL)",
    "jedinstvena-kadetska-liga": "Kadeti",
    "kadetska-liga": "Kadeti",
    "juniorska-liga": "Juniori",
    "mlade-kategorije": "Mladi",
    "mini-kosarka": "Mini",
}

# Generic words (ascii-folded, lower-case) that do not identify a club.
NOISE_TOKENS = {
    "kk", "zkk", "kosarkaski", "klub", "udruga", "savez", "skola",
    "primorsko-goranske", "primorsko", "goranske", "zupanije",
    "rijeka", "rijeke",  # too generic on its own; only used if it's the longest token
}

# Article links on an HKS search page: group 1 = URL, 2 = section, 3 = year.
ARTICLE_RE = re.compile(
    r'href="(https://www\.hks-cbf\.hr/('
    + '|'.join(re.escape(k) for k in LIGA_SECTIONS)
    + r')/(\d{4})/[^"]+/)"'
)
# FIBA LiveStats links embedded in an article: group 1 = matchid.
FIBA_MATCHID_RE = re.compile(
    r'fibalivestats\.dcd\.shared\.geniussports\.com/u/HKS/(\d+)/'
)

MAX_ARTICLES_PER_KLUB = 8
MAX_MATCHES_PER_KLUB = 30
HTTP_TIMEOUT = 15
HTTP_PAUSE_S = 0.4

# Croatian diacritics → ASCII; applied after lower-casing.
_DIACRITIC_MAP = str.maketrans('ščćžđ', 'scczd')


def parse_mm_ss(s):
    """Return the whole-minute part of an 'MM:SS' string, or None.

    Seconds are validated by the pattern but discarded. Non-string, empty
    or malformed input yields None.
    """
    if not s or not isinstance(s, str):
        return None
    m = re.match(r'^(\d{1,3}):(\d{2})$', s.strip())
    if not m:
        return None
    return int(m.group(1))


def _ascii_lower(s):
    """Lower-case *s* (None → '') and fold š/č/ć/ž/đ to ASCII."""
    return (s or '').lower().translate(_DIACRITIC_MAP)


def name_tokens(naziv):
    """Distinctive tokens from a club name, stripped of generic noise.

    Punctuation other than '-' becomes whitespace, the text is ascii-folded
    and lower-cased, and tokens that are in NOISE_TOKENS or shorter than
    three characters are dropped. Order is preserved.
    """
    t = re.sub(r'[^\wšđč枊ĐČĆŽ\s-]', ' ', naziv or '')
    t = _ascii_lower(t)
    return [
        p for p in re.split(r'\s+', t)
        if p and p not in NOISE_TOKENS and len(p) > 2
    ]


def name_abbrev(naziv):
    """Acronym from significant tokens, e.g. 'Flumen Sancti Viti' → 'fsv'.

    Returns None when fewer than two significant tokens remain.
    """
    toks = name_tokens(naziv)
    if len(toks) < 2:
        return None
    return ''.join(t[0] for t in toks if t)


def fuzzy_klub_match(klub_naziv, side_name):
    """True iff klub_naziv likely refers to the same club as side_name.

    Strategies, tried in order:
      1. token overlap (3+ char tokens, post noise filter).
      2. abbreviation match (e.g. 'FSV' = 'Flumen Sancti Viti').
      3. substring match on ascii-folded slugs (≥4 char overlap).
    """
    a = set(name_tokens(klub_naziv))
    b = set(name_tokens(side_name))
    if a & b:
        return True

    abb_a = name_abbrev(klub_naziv) or ''
    abb_b = name_abbrev(side_name) or ''
    # Letters-only, ascii-folded forms: lets a side name such as 'FSV'
    # be compared directly against the other name's acronym.
    side_clean = _ascii_lower(re.sub(r'[^A-Za-zšđč枊ĐČĆŽ]', '', side_name or ''))
    klub_clean = _ascii_lower(re.sub(r'[^A-Za-zšđč枊ĐČĆŽ]', '', klub_naziv or ''))
    if len(abb_a) >= 3 and abb_a in side_clean:
        return True
    if len(abb_b) >= 3 and abb_b in klub_clean:
        return True

    # Treat any 4+ char token contained in the other name as a match
    # (e.g. 'kvarner' in both).
    if any(len(tok) >= 4 and tok in side_clean for tok in a):
        return True
    if any(len(tok) >= 4 and tok in klub_clean for tok in b):
        return True
    return False


def best_search_token(naziv):
    """Pick the most distinctive single token for HKS search (e.g. 'Škrljevo').

    Falls back to the longest word ≥3 chars when the noise-filtered list is
    empty (e.g. 'KK Rijeka - Rijeka' → 'Rijeka'). The original spelling
    (case and diacritics) is returned when a word of *naziv* folds to the
    chosen token; otherwise the folded token itself is returned. If no word
    of three or more characters exists, *naziv* is returned unchanged.
    """
    toks = name_tokens(naziv)
    if not toks:
        # noise-only club name — pick the longest word regardless of noise
        all_words = [w for w in re.findall(r'\w+', naziv or '') if len(w) >= 3]
        if not all_words:
            return naziv
        chosen = _ascii_lower(max(all_words, key=len))
    else:
        chosen = max(toks, key=len)
    for w in re.findall(r'\w+', naziv or ''):
        if _ascii_lower(w) == chosen:
            return w
    return chosen


def _player_stats(p):
    """Map one FIBA boxscore player dict onto the stats-column dict."""
    return {
        'nastupi': 1,
        'golovi': None,
        'asistencije': p.get('sAssists'),
        'bodovi': p.get('sPoints'),
        'trice': p.get('sThreePointersMade'),
        'skokovi': p.get('sReboundsTotal'),
        'blokade': p.get('sBlocks'),
        'servis_asovi': None,
        'zuti': None,
        # 5+ personal fouls is recorded in the 'crveni' column
        # (presumably "fouled out" — confirm against the schema).
        'crveni': 1 if (p.get('sFoulsPersonal') or 0) >= 5 else 0,
        'minute': parse_mm_ss(p.get('sMinutes')),
    }


class HKSHarvester(SportHarvester):
    """Harvest basketball rosters and per-match stats via HKS + FIBA LiveStats.

    Relies on ``self.conn``, ``self.log``, ``self.stats``, ``self.slugify``,
    ``self.upsert_clan``, ``self.upsert_stats`` and ``run`` — none of them
    are defined in this file, so presumably they come from SportHarvester.
    """

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def get_target_klubovi(self, limit=999):
        """Override base — task requires ALL 99 PGŽ priority basketball clubs,
        not just financiran/u_godisnjaku ones.

        Returns up to *limit* rows of ``pgz_sport.v_pgz_priority_klubovi``
        for this sport as dict-like rows, funded/yearbook clubs first.
        """
        from psycopg2.extras import RealDictCursor
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC NULLS LAST,
                         u_godisnjaku DESC NULLS LAST,
                         id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def __init__(self):
        """Set up the HTTP session and per-run de-dup/cache state."""
        super().__init__()
        self.http = requests.Session()
        self.http.headers.update({
            "User-Agent": UA,
            "Accept-Language": "hr,en;q=0.8",
            # avoid brotli — requests' decoder is flaky on chunked br
            "Accept-Encoding": "gzip, deflate",
        })
        # (klub id, matchid) pairs already attempted. Keyed per club so that
        # a recap article listing many matches, or a game between two
        # tracked clubs, does not hide the match from the other club.
        self._seen_matches = set()
        # matchid → team mapping ('tm') of the boxscore, or None when the
        # fetch failed; keeps each data.json to at most one download per run.
        self._tm_cache = {}
        self._klub_match_count = 0  # reset per-klub

    def _get(self, url, retries=1):
        """GET *url*, retrying up to *retries* extra times.

        Returns the response on HTTP 200, otherwise logs the last error and
        returns None. The 0.6 s pause follows every failed attempt, including
        the last, so a failing host is not hit again immediately by the
        caller's next request.
        """
        last_err = None
        for _attempt in range(retries + 1):
            try:
                r = self.http.get(url, timeout=HTTP_TIMEOUT)
                if r.status_code == 200:
                    return r
                last_err = f"HTTP {r.status_code}"
            except Exception as e:  # best-effort scraper: any failure → retry
                last_err = str(e)
            time.sleep(0.6)
        self.log(f" GET fail {url}: {last_err}")
        return None

    def scrape_klub(self, page, klub):
        """Harvest one club: HKS search → recap articles → match boxscores.

        *page* is accepted for interface compatibility and is not used here.
        Stops after MAX_ARTICLES_PER_KLUB articles or once
        MAX_MATCHES_PER_KLUB matches were harvested for this club.
        """
        self._klub_match_count = 0
        token = best_search_token(klub['naziv'])
        if not token or len(token) < 3:
            self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → no usable token, skip")
            return
        search_url = f"https://www.hks-cbf.hr/?s={urllib.parse.quote_plus(token)}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → token='{token}'")
        r = self._get(search_url)
        if not r:
            return
        html = r.text
        time.sleep(HTTP_PAUSE_S)

        # Unique article URLs in page order, capped.
        seen = set()
        articles = []
        for m in ARTICLE_RE.finditer(html):
            url = m.group(1)
            if url in seen:
                continue
            seen.add(url)
            articles.append(url)
            if len(articles) >= MAX_ARTICLES_PER_KLUB:
                break
        self.log(f" {len(articles)} article(s)")
        if not articles:
            return

        # Record the first article as the club's source URL, but only when
        # nothing more specific than the yearbook placeholder is stored.
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s
                WHERE id = %s
                  AND (source_url IS NULL OR source_url = ''
                       OR source_url = 'godisnjak_zspgz_2025')
            """, (articles[0], klub['id']))

        for art_url in articles:
            try:
                self._scrape_article(klub, art_url)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ article {art_url}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                self.log(f" cap reached ({MAX_MATCHES_PER_KLUB} matches)")
                break

    def scrape_player(self, page, person_id):
        """Helper: scrape an individual player career page from HKS statistika.

        Genius Sports widget is JS-rendered, so we need Playwright here.
        Returns a dict with the text of the first three tables, or None on
        any error.
        """
        url = f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fperson%2F{person_id}"
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            page.wait_for_timeout(4000)
            tables = [t.inner_text() for t in page.locator('table').all()[:3]]
            return {"person_id": person_id, "url": url, "tables": tables}
        except Exception as e:
            self.log(f" ❌ scrape_player({person_id}): {e}")
            return None

    def _scrape_article(self, klub, art_url):
        """Harvest every FIBA LiveStats match linked from one recap article.

        Category and season are derived from the article URL
        (``/<section>/<year>/``); the season is written as ``year-1/year``.
        """
        r = self._get(art_url)
        if not r:
            return
        time.sleep(HTTP_PAUSE_S)

        section_match = re.search(r'https://www\.hks-cbf\.hr/([^/]+)/(\d{4})/', art_url)
        section = section_match.group(1) if section_match else ""
        year = int(section_match.group(2)) if section_match else None
        kategorija = LIGA_SECTIONS.get(section)
        sezona = f"{year-1}/{year}" if year else None

        # Unique matchids in document order.
        matchids = list(dict.fromkeys(
            m.group(1) for m in FIBA_MATCHID_RE.finditer(r.text)
        ))
        for mid in matchids:
            # De-dup per club, not globally: the same match may need to be
            # examined once for each tracked club whose search surfaced it.
            key = (klub['id'], mid)
            if key in self._seen_matches:
                continue
            self._seen_matches.add(key)
            try:
                self._harvest_match(klub, mid, art_url, kategorija, sezona, section)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ match {mid}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                return

    def _fetch_match_teams(self, matchid):
        """Return the boxscore's team mapping for *matchid*, or None.

        The outcome (including failure) is cached so each data.json is
        requested at most once per run.
        """
        if matchid in self._tm_cache:
            return self._tm_cache[matchid]
        tm = None
        url = f"https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json"
        r = self._get(url, retries=2)
        if r:
            try:
                data = r.json()
            except Exception as e:
                self.log(f" ⚠️ {matchid} JSON parse: {e}")
            else:
                time.sleep(HTTP_PAUSE_S)
                found = data.get('tm') if isinstance(data, dict) else None
                tm = found if isinstance(found, dict) and found else None
        self._tm_cache[matchid] = tm
        return tm

    def _harvest_match(self, klub, matchid, art_url, kategorija, sezona, section):
        """Upsert the club's players and their stats from one boxscore.

        Picks the side ('1' or '2') whose name fuzzy-matches the club, then
        upserts every named player on that side plus one stats row per
        player. Increments ``_klub_match_count`` only when a side matched.
        """
        tm = self._fetch_match_teams(matchid)
        if not tm:
            return

        side_key = None
        for side in ('1', '2'):
            t = tm.get(side) or {}  # a side may be missing or null
            if not isinstance(t, dict):
                continue
            tname = t.get('name') or t.get('nameInternational') or ''
            if fuzzy_klub_match(klub['naziv'], tname):
                side_key = side
                break
        if not side_key:
            n1 = (tm.get('1') or {}).get('name')
            n2 = (tm.get('2') or {}).get('name')
            self.log(f" ⚠️ {matchid} no side match for '{klub['naziv']}' (sides: {n1!r}, {n2!r})")
            return

        team = tm[side_key]
        klub_naziv = team.get('name') or klub['naziv']
        natjecanje = kategorija or section or "košarka"
        # One stats row per match: the matchid is part of the competition key.
        natjecanje_match = f"{natjecanje} match {matchid}"

        # 'pl' is handled as either a dict keyed by player slot or a list.
        players = team.get('pl') or {}
        pairs = players.items() if isinstance(players, dict) else enumerate(players)

        added = 0
        for pkey, p in pairs:
            if not isinstance(p, dict):
                continue
            ime = (p.get('firstName') or p.get('internationalFirstName') or '').strip()
            prezime = (p.get('familyName') or p.get('internationalFamilyName') or '').strip()
            if not (ime or prezime):
                continue

            full_slug = self.slugify(f"{ime} {prezime}")
            source_id = full_slug or f"m{matchid}p{pkey}"
            photo = p.get('photoT')
            if isinstance(photo, dict):
                photo = photo.get('url')
            extra = {
                "shirtNumber": p.get('shirtNumber'),
                "playingPosition": p.get('playingPosition'),
                "scoreboardName": p.get('scoreboardName'),
                "photoT": photo,
                "matchids_seen": [matchid],
            }
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub['id'],
                    source_id=source_id,
                    ime=ime,
                    prezime=prezime,
                    source_url=art_url,
                    kategorija=kategorija,
                    sezona=sezona,
                    extra=extra,
                )
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_clan {ime} {prezime}: {e}")
                continue

            try:
                self.upsert_stats(
                    clan_id=clan_id,
                    sezona=sezona,
                    klub_id=klub['id'],
                    klub_naziv=klub_naziv,
                    natjecanje=natjecanje_match,
                    kategorija=kategorija,
                    stats_dict=_player_stats(p),
                    raw={'matchid': matchid, 'art_url': art_url, 'player': p},
                )
                self.stats['stats'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_stats {ime} {prezime}: {e}")
                continue
            added += 1

        self.stats['players'] += added
        self._klub_match_count += 1
        self.log(f" ✅ {matchid} side={side_key} '{klub_naziv}' → {added} players")


if __name__ == '__main__':
    HKSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)