Files
damir 1d02c0897d Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
2026-05-05 13:08:11 +02:00

374 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
# hks_basketball.py — HKS-CBF + FIBA LiveStats košarka harvester
# v2.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Harvests rosters + per-match player stats for PGŽ priority basketball clubs.
# Pipeline: HKS site search (?s=<club token>) → match recap articles →
# FIBA LiveStats match ids linked from each article →
# https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json
# (public JSON boxscore) → upsert clanovi + clan_kategorije + player_stats.
import sys, re, time, json, urllib.parse
import requests
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
# Browser-like User-Agent sent with every HTTP request made by the harvester.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"

# HKS-CBF URL section slug -> category label attached to harvested players.
# Insertion order is significant: it is the order of the alternation in ARTICLE_RE.
LIGA_SECTIONS = {
    # senior men
    "supersport-premijer": "Seniori (Premijer)",
    "prva-muska-liga": "Seniori (1.HML)",
    "druga-muska-liga": "Seniori (2.HML)",
    # senior women
    "premijer-zenska-liga": "Seniorke (Premijer)",
    "prva-zenska-liga": "Seniorke (1.ŽKL)",
    "druga-zenska-liga": "Seniorke (2.ŽKL)",
    # youth
    "jedinstvena-kadetska-liga": "Kadeti",
    "kadetska-liga": "Kadeti",
    "juniorska-liga": "Juniori",
    "mlade-kategorije": "Mladi",
    "mini-kosarka": "Mini",
}

# Ascii-folded, lower-case words that carry no identifying information in a
# club name and are therefore dropped by name_tokens().
NOISE_TOKENS = {
    # organisational words
    "kk", "zkk", "kosarkaski", "klub", "udruga", "savez", "skola",
    # county name fragments
    "primorsko-goranske", "primorsko", "goranske", "zupanije",
    # too generic on its own; only used if it's the longest token
    "rijeka", "rijeke",
}

# Alternation of all known section slugs, regex-escaped.
_SECTION_ALTERNATION = '|'.join(re.escape(slug) for slug in LIGA_SECTIONS)

# Matches an article link in HKS HTML.
# group(1) = full article URL, group(2) = section slug, group(3) = 4-digit year.
ARTICLE_RE = re.compile(
    r'href="(https://www\.hks-cbf\.hr/(' + _SECTION_ALTERNATION + r')/(\d{4})/[^"]+/)"'
)

# Matches a FIBA LiveStats link for the HKS federation; group(1) = numeric match id.
FIBA_MATCHID_RE = re.compile(
    r'fibalivestats\.dcd\.shared\.geniussports\.com/u/HKS/(\d+)/'
)

# Per-club caps and HTTP pacing.
MAX_ARTICLES_PER_KLUB = 8    # article links taken from one search result page
MAX_MATCHES_PER_KLUB = 30    # harvested matches per club before stopping
HTTP_TIMEOUT = 15            # seconds, passed to requests as the timeout
HTTP_PAUSE_S = 0.4           # seconds slept after each successful fetch
def parse_mm_ss(s):
    """Return the whole-minute part of an ``MM:SS`` clock string.

    Surrounding whitespace is ignored. The minutes field may have one to
    three digits; the seconds field must have exactly two digits and is
    validated but discarded (no rounding).

    Returns ``None`` when ``s`` is empty, is not a ``str``, or does not have
    the expected shape.
    """
    if not isinstance(s, str) or not s:
        return None
    clock = re.fullmatch(r'(\d{1,3}):(\d{2})', s.strip())
    return int(clock.group(1)) if clock else None
# Croatian lower-case diacritics and their plain-ASCII replacements.
_DIACRITIC_FOLD = str.maketrans({'š': 's', 'č': 'c', 'ć': 'c', 'ž': 'z', 'đ': 'd'})


def _ascii_lower(s):
    """Lower-case ``s`` and fold Croatian diacritics (š č ć ž đ) to ASCII.

    A falsy ``s`` (``None`` or ``''``) yields ``''``. Characters outside the
    five handled diacritics are left as they are after lower-casing.
    """
    return (s or '').lower().translate(_DIACRITIC_FOLD)
def name_tokens(naziv):
    """Return the distinctive tokens of a club name, in order of appearance.

    Everything except word characters, whitespace and hyphens is replaced by
    a space, the result is lower-cased and ascii-folded, and it is then split
    on whitespace. Tokens of one or two characters and tokens listed in
    ``NOISE_TOKENS`` are dropped. A falsy ``naziv`` yields ``[]``.
    """
    cleaned = re.sub(r'[^\wšđč枊ĐČĆŽ\s-]', ' ', naziv or '')
    folded = _ascii_lower(cleaned)
    kept = []
    for word in re.split(r'\s+', folded):
        # Short fragments (including the empty strings re.split can emit at
        # the edges) and generic words are not distinctive.
        if len(word) <= 2 or word in NOISE_TOKENS:
            continue
        kept.append(word)
    return kept
def name_abbrev(naziv):
    """Build an acronym from the significant tokens of a club name.

    Example: 'Flumen Sancti Viti' -> 'fsv'.

    Returns ``None`` when the name has fewer than two significant tokens,
    because a one-letter acronym is not useful for matching.
    """
    significant = name_tokens(naziv)
    if len(significant) >= 2:
        return ''.join(tok[:1] for tok in significant)
    return None
def fuzzy_klub_match(klub_naziv, side_name):
    """Decide whether two names likely denote the same club.

    ``klub_naziv`` is the club name we hold; ``side_name`` is the team name
    found in a boxscore. The checks run in this order and the first hit wins:

      1. The two names share at least one significant token
         (see ``name_tokens``).
      2. The acronym of one name (3+ letters, e.g. 'fsv' for
         'Flumen Sancti Viti') occurs inside the other name once that name
         is reduced to ascii-folded letters only.
      3. A significant token of 4+ characters from one name occurs inside
         the letters-only form of the other (e.g. 'kvarner' in both).

    Returns ``True`` on any hit, ``False`` otherwise.
    """
    klub_tokens = set(name_tokens(klub_naziv))
    side_tokens = set(name_tokens(side_name))

    # 1. direct token overlap
    if not klub_tokens.isdisjoint(side_tokens):
        return True

    klub_abbrev = name_abbrev(klub_naziv) or ''
    side_abbrev = name_abbrev(side_name) or ''

    # Letters-only, ascii-folded forms; a bare side name such as 'FSV'
    # collapses to 'fsv' and can therefore be hit by an acronym.
    side_flat = _ascii_lower(re.sub(r'[^A-Za-zšđč枊ĐČĆŽ]', '', side_name or ''))
    klub_flat = _ascii_lower(re.sub(r'[^A-Za-zšđč枊ĐČĆŽ]', '', klub_naziv or ''))

    # 2. acronym of one name embedded in the other
    if len(klub_abbrev) >= 3 and klub_abbrev in side_flat:
        return True
    if len(side_abbrev) >= 3 and side_abbrev in klub_flat:
        return True

    # 3. long token of one name embedded in the other
    if any(len(tok) >= 4 and tok in side_flat for tok in klub_tokens):
        return True
    return any(len(tok) >= 4 and tok in klub_flat for tok in side_tokens)
def best_search_token(naziv):
    """Pick the single most distinctive word of a club name for the HKS search.

    The longest significant token wins (e.g. 'Škrljevo'). When the name
    consists of noise words only, the longest word of 3+ characters is used
    instead (e.g. 'KK Rijeka - Rijeka' -> 'Rijeka').

    The result is returned in its original spelling (case and diacritics)
    when a word of ``naziv`` folds to the chosen token; otherwise the folded
    token itself is returned. If the name has no word of 3+ characters,
    ``naziv`` is returned unchanged.
    """
    tokens = name_tokens(naziv)
    words = re.findall(r'\w+', naziv or '')

    if tokens:
        chosen = max(tokens, key=len)
    else:
        # Noise-only club name: fall back to the longest permitted word.
        candidates = [w for w in words if len(w) >= 3]
        if not candidates:
            return naziv
        chosen = _ascii_lower(max(candidates, key=len))

    # Prefer the original spelling of the chosen token.
    return next((w for w in words if _ascii_lower(w) == chosen), chosen)
class HKSHarvester(SportHarvester):
    """Harvest basketball rosters and per-match player stats for target clubs.

    For each club: search hks-cbf.hr for a distinctive token of the club
    name, collect match recap article links from the result page, extract
    FIBA LiveStats match ids from each article, download the public
    ``data.json`` boxscore of every match and upsert the players of the
    club's side together with their stat line.

    Persistence, logging and counters (``self.conn``, ``self.log``,
    ``self.stats``, ``self.slugify``, ``self.upsert_clan``,
    ``self.upsert_stats``, ``run``) come from ``SportHarvester``, which is
    not visible here; their exact contracts should be confirmed there.
    """

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs for this sport as dict rows.

        Overrides the base implementation: the task requires ALL 99 PGŽ
        priority basketball clubs, not just the financiran / u_godisnjaku
        ones. Those two flags are only used for ordering, so flagged clubs
        are processed first.
        """
        from psycopg2.extras import RealDictCursor
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC NULLS LAST,
                u_godisnjaku DESC NULLS LAST,
                id
                LIMIT %s
                """, (self.SPORT, limit))
            return cur.fetchall()

    def __init__(self):
        """Set up the shared HTTP session and the per-run bookkeeping."""
        super().__init__()
        self.http = requests.Session()
        self.http.headers.update({
            "User-Agent": UA,
            "Accept-Language": "hr,en;q=0.8",
            "Accept-Encoding": "gzip, deflate",  # avoid brotli — requests' decoder is flaky on chunked br
        })
        # (klub id, matchid) pairs already attempted. The key includes the
        # club because a boxscore holds two sides: de-duplicating on the
        # match id alone would skip the match for the second target club,
        # or for the real participant when an unrelated club's search hit
        # the article first.
        self._seen_matches = set()
        self._klub_match_count = 0  # reset per-klub

    def _get(self, url, retries=1):
        """GET ``url`` and return the response, or ``None`` on failure.

        Only HTTP 200 counts as success. Any other status or any exception
        is retried up to ``retries`` more times with a 0.6 s pause after
        each failed attempt. The last error is logged before giving up.
        """
        last_err = None
        for _ in range(retries + 1):
            try:
                r = self.http.get(url, timeout=HTTP_TIMEOUT)
                if r.status_code == 200:
                    return r
                last_err = f"HTTP {r.status_code}"
            except Exception as e:
                last_err = str(e)
            time.sleep(0.6)
        self.log(f" GET fail {url}: {last_err}")
        return None

    def scrape_klub(self, page, klub):
        """Harvest one club: search HKS, then process the articles found.

        ``klub`` is a row mapping with at least ``id`` and ``naziv``.
        ``page`` is accepted for interface compatibility and is not used
        here (plain HTTP is sufficient for search and articles).

        At most ``MAX_ARTICLES_PER_KLUB`` distinct article URLs are taken
        from the search page, and processing stops once
        ``MAX_MATCHES_PER_KLUB`` matches have been harvested for the club.
        The first article URL is stored as the club's ``source_url`` when
        that column is empty or still holds the yearbook placeholder.
        """
        self._klub_match_count = 0
        token = best_search_token(klub['naziv'])
        if not token or len(token) < 3:
            self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → no usable token, skip")
            return
        search_url = f"https://www.hks-cbf.hr/?s={urllib.parse.quote_plus(token)}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → token='{token}'")
        r = self._get(search_url)
        if not r:
            return
        html = r.text
        time.sleep(HTTP_PAUSE_S)

        # Distinct article URLs in page order, capped.
        seen = set()
        articles = []
        for m in ARTICLE_RE.finditer(html):
            url = m.group(1)
            if url in seen:
                continue
            seen.add(url)
            articles.append(url)
            if len(articles) >= MAX_ARTICLES_PER_KLUB:
                break
        self.log(f" {len(articles)} article(s)")
        if not articles:
            return

        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s
                WHERE id = %s AND (source_url IS NULL OR source_url = '' OR source_url = 'godisnjak_zspgz_2025')
                """, (articles[0], klub['id']))

        for art_url in articles:
            # One bad article must not abort the rest of the club.
            try:
                self._scrape_article(klub, art_url)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ article {art_url}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                self.log(f" cap reached ({MAX_MATCHES_PER_KLUB} matches)")
                break

    def scrape_player(self, page, person_id):
        """Helper: scrape an individual player career page from HKS statistika.

        The Genius Sports widget is JS-rendered, so this goes through the
        browser ``page`` (Playwright) instead of plain HTTP. Returns a dict
        with ``person_id``, ``url`` and the inner text of up to the first
        three tables, or ``None`` if anything fails (the error is logged).
        """
        url = f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fperson%2F{person_id}"
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Fixed wait to give the widget time to render its tables.
            page.wait_for_timeout(4000)
            tables = [t.inner_text() for t in page.locator('table').all()[:3]]
            return {"person_id": person_id, "url": url, "tables": tables}
        except Exception as e:
            self.log(f" ❌ scrape_player({person_id}): {e}")
            return None

    def _scrape_article(self, klub, art_url):
        """Fetch one recap article and harvest every match it links to.

        The league section and year are read from the article URL; the
        section maps to a category via ``LIGA_SECTIONS`` and the year yields
        the season string ``"<year-1>/<year>"``.
        NOTE(review): this assumes the URL year is the season's end year;
        articles published in the autumn half of a season may be labelled
        one season early — confirm against real URLs.

        Each match is attempted at most once per club. Returns early once
        the club's match cap is reached.
        """
        r = self._get(art_url)
        if not r:
            return
        time.sleep(HTTP_PAUSE_S)

        section_match = re.search(r'https://www\.hks-cbf\.hr/([^/]+)/(\d{4})/', art_url)
        section = section_match.group(1) if section_match else ""
        year = int(section_match.group(2)) if section_match else None
        kategorija = LIGA_SECTIONS.get(section)
        sezona = f"{year-1}/{year}" if year else None

        # Distinct match ids in order of first appearance.
        matchids = list(dict.fromkeys(FIBA_MATCHID_RE.findall(r.text)))
        if not matchids:
            return

        for mid in matchids:
            # Keyed per club: the other side of the same match may belong to
            # another target club and must still be harvested for it.
            seen_key = (klub['id'], mid)
            if seen_key in self._seen_matches:
                continue
            self._seen_matches.add(seen_key)
            try:
                self._harvest_match(klub, mid, art_url, kategorija, sezona, section)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ match {mid}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                return

    def _harvest_match(self, klub, matchid, art_url, kategorija, sezona, section):
        """Download one boxscore and upsert the players of ``klub``'s side.

        The boxscore's ``tm`` object is expected to hold sides ``'1'`` and
        ``'2'``; the first side whose name fuzzy-matches the club is used.
        If neither matches, the match is logged and skipped. For every
        player with a name, the player is upserted and a one-appearance stat
        line is stored under a per-match competition label. Failures on a
        single player are counted and logged without aborting the match.
        """
        url = f"https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json"
        r = self._get(url, retries=2)
        if not r:
            return
        try:
            data = r.json()
        except Exception as e:
            self.log(f" ⚠️ {matchid} JSON parse: {e}")
            return
        time.sleep(HTTP_PAUSE_S)

        tm = data.get('tm') or {}
        if not tm:
            return

        side_key = None
        for side in ('1', '2'):
            # `or {}` also covers a side that is present but null.
            t = tm.get(side) or {}
            tname = t.get('name') or t.get('nameInternational') or ''
            if fuzzy_klub_match(klub['naziv'], tname):
                side_key = side
                break
        if not side_key:
            n1 = (tm.get('1') or {}).get('name')
            n2 = (tm.get('2') or {}).get('name')
            self.log(f" ⚠️ {matchid} no side match for '{klub['naziv']}' (sides: {n1!r}, {n2!r})")
            return

        team = tm[side_key]
        klub_naziv = team.get('name') or klub['naziv']
        natjecanje = kategorija or section or "košarka"
        # The match id is part of the label so each match gets its own stat row.
        natjecanje_match = f"{natjecanje} match {matchid}"

        # 'pl' is handled both as a mapping and as a list of players.
        players = team.get('pl') or {}
        iter_pairs = list(players.items()) if isinstance(players, dict) else list(enumerate(players))

        added = 0
        for pkey, p in iter_pairs:
            if not isinstance(p, dict):
                continue
            ime = (p.get('firstName') or p.get('internationalFirstName') or '').strip()
            prezime = (p.get('familyName') or p.get('internationalFamilyName') or '').strip()
            if not (ime or prezime):
                continue

            # The name slug is the player's source id (stable across
            # matches); the match/player key is used only if the slug is empty.
            full_slug = self.slugify(f"{ime} {prezime}")
            source_id = full_slug or f"m{matchid}p{pkey}"

            photo = p.get('photoT')
            if isinstance(photo, dict):
                photo = photo.get('url')
            extra = {
                "shirtNumber": p.get('shirtNumber'),
                "playingPosition": p.get('playingPosition'),
                "scoreboardName": p.get('scoreboardName'),
                "photoT": photo,
                "matchids_seen": [matchid],
            }
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub['id'],
                    source_id=source_id,
                    ime=ime,
                    prezime=prezime,
                    source_url=art_url,
                    kategorija=kategorija,
                    sezona=sezona,
                    extra=extra,
                )
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_clan {ime} {prezime}: {e}")
                continue

            stats = {
                'nastupi': 1,
                'golovi': None,
                'asistencije': p.get('sAssists'),
                'bodovi': p.get('sPoints'),
                'trice': p.get('sThreePointersMade'),
                'skokovi': p.get('sReboundsTotal'),
                'blokade': p.get('sBlocks'),
                'servis_asovi': None,
                'zuti': None,
                # Five or more personal fouls is recorded in the 'crveni' column.
                'crveni': 1 if (p.get('sFoulsPersonal') or 0) >= 5 else 0,
                'minute': parse_mm_ss(p.get('sMinutes')),
            }
            try:
                self.upsert_stats(
                    clan_id=clan_id,
                    sezona=sezona,
                    klub_id=klub['id'],
                    klub_naziv=klub_naziv,
                    natjecanje=natjecanje_match,
                    kategorija=kategorija,
                    stats_dict=stats,
                    raw={'matchid': matchid, 'art_url': art_url, 'player': p},
                )
                self.stats['stats'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_stats {ime} {prezime}: {e}")
                continue
            # Counted only when both the player and the stat line were stored.
            added += 1

        self.stats['players'] += added
        self._klub_match_count += 1
        self.log(f"{matchid} side={side_key} '{klub_naziv}'{added} players")
if __name__ == '__main__':
    # Usage: hks_basketball.py [LIMIT] — LIMIT caps the number of clubs
    # processed and defaults to 50 when omitted.
    harvester = HKSHarvester()
    club_limit = int(sys.argv[1]) if len(sys.argv) > 1 else 50
    harvester.run(limit=club_limit)