1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
374 lines
14 KiB
Python
Executable File
374 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# hks_basketball.py — HKS-CBF + FIBA LiveStats košarka harvester
|
|
# v2.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
|
# Harvests rosters + per-match player stats for PGŽ priority basketball clubs.
|
|
# Path: HKS search (?s=naziv) → match recap articles → FIBA LiveStats matchid →
|
|
# https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json
|
|
# (public JSON boxscore) → upsert clanovi + clan_kategorije + player_stats.
|
|
|
|
import sys, re, time, json, urllib.parse
|
|
import requests
|
|
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
|
from __base import SportHarvester
|
|
|
|
# Browser-like User-Agent string sent with the harvester's HTTP requests.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"

# HKS-CBF URL section slug -> category label attached to harvested rows.
# Insertion order is kept because ARTICLE_RE is built from the keys.
LIGA_SECTIONS = {
    # senior men
    "supersport-premijer": "Seniori (Premijer)",
    "prva-muska-liga": "Seniori (1.HML)",
    "druga-muska-liga": "Seniori (2.HML)",
    # senior women
    "premijer-zenska-liga": "Seniorke (Premijer)",
    "prva-zenska-liga": "Seniorke (1.ŽKL)",
    "druga-zenska-liga": "Seniorke (2.ŽKL)",
    # youth
    "jedinstvena-kadetska-liga": "Kadeti",
    "kadetska-liga": "Kadeti",
    "juniorska-liga": "Juniori",
    "mlade-kategorije": "Mladi",
    "mini-kosarka": "Mini",
}

# ASCII-folded, lower-case words that carry no identifying information in a
# club name and are therefore dropped by name_tokens().
NOISE_TOKENS = {
    "kk",
    "zkk",
    "kosarkaski",
    "klub",
    "udruga",
    "savez",
    "skola",
    "primorsko-goranske",
    "primorsko",
    "goranske",
    "zupanije",
    # too generic on its own; only used if it's the longest token
    "rijeka",
    "rijeke",
}

# Alternation of every known section slug, regex-escaped.
_SECTION_ALTERNATION = '|'.join(map(re.escape, LIGA_SECTIONS))

# Article links in HKS search results.
# group(1) = full URL, group(2) = section slug, group(3) = four-digit year.
ARTICLE_RE = re.compile(
    r'href="(https://www\.hks-cbf\.hr/(' + _SECTION_ALTERNATION + r')/(\d{4})/[^"]+/)"'
)

# FIBA LiveStats links embedded in an article; group(1) = numeric match id.
FIBA_MATCHID_RE = re.compile(
    r'fibalivestats\.dcd\.shared\.geniussports\.com/u/HKS/(\d+)/'
)

# Per-club caps and HTTP pacing.
MAX_ARTICLES_PER_KLUB = 8    # article URLs taken from one search page
MAX_MATCHES_PER_KLUB = 30    # harvested matches per club before stopping
HTTP_TIMEOUT = 15            # seconds, passed to requests as timeout
HTTP_PAUSE_S = 0.4           # seconds slept after a successful fetch
|
|
|
|
|
|
def parse_mm_ss(s):
    """Return the minutes part of an ``M:SS`` / ``MM:SS`` / ``MMM:SS`` string.

    The seconds must be exactly two digits but are discarded, so the result
    is truncated to whole minutes. Surrounding whitespace is ignored.

    Returns None for a non-string, an empty string, or any other format.
    """
    if not isinstance(s, str) or not s:
        return None
    parsed = re.match(r'^(\d{1,3}):(\d{2})$', s.strip())
    if parsed is None:
        return None
    minutes_text = parsed.group(1)
    return int(minutes_text)
|
|
|
|
|
|
def _ascii_lower(s):
    """Lower-case ``s`` and fold the Croatian letters š, č, ć, ž, đ to ASCII.

    A falsy ``s`` (None, empty string) yields an empty string. Characters
    outside those five letters are left untouched apart from lower-casing.
    """
    folding = str.maketrans('ščćžđ', 'scczd')
    lowered = (s or '').lower()
    return lowered.translate(folding)
|
|
|
|
|
|
def name_tokens(naziv):
    """Split a club name into its distinctive, ASCII-folded lower-case words.

    Punctuation other than hyphens is replaced by spaces, so a hyphenated
    word stays a single token. Words of fewer than three characters and
    words listed in NOISE_TOKENS are dropped. Order of appearance is kept and
    duplicates are not removed.
    """
    cleaned = re.sub(r'[^\wšđč枊ĐČĆŽ\s-]', ' ', naziv or '')
    folded = _ascii_lower(cleaned)
    tokens = []
    for word in re.split(r'\s+', folded):
        if not word or len(word) <= 2:
            continue
        if word in NOISE_TOKENS:
            continue
        tokens.append(word)
    return tokens
|
|
|
|
|
|
def name_abbrev(naziv):
    """Build an acronym from the distinctive tokens of a club name.

    Example: 'Flumen Sancti Viti' -> 'fsv'. Returns None when the name has
    fewer than two distinctive tokens, since a one-letter acronym is useless.
    """
    significant = name_tokens(naziv)
    if len(significant) >= 2:
        initials = [tok[0] for tok in significant if tok]
        return ''.join(initials)
    return None
|
|
|
|
|
|
def fuzzy_klub_match(klub_naziv, side_name):
    """Return True when ``klub_naziv`` and ``side_name`` plausibly name one club.

    Checks are tried in this order and the first hit wins:
      1. the two names share at least one distinctive token;
      2. the acronym of one name (3+ letters) occurs inside the other name
         once that name is reduced to folded letters only;
      3. a distinctive token of 4+ characters from one name occurs inside
         the letters-only form of the other.
    """
    def letters_only(text):
        # Drop everything except ASCII and Croatian letters, then fold/lower.
        return _ascii_lower(re.sub(r'[^A-Za-zšđč枊ĐČĆŽ]', '', text or ''))

    klub_tokens = set(name_tokens(klub_naziv))
    side_tokens = set(name_tokens(side_name))

    # 1. direct token overlap
    if not klub_tokens.isdisjoint(side_tokens):
        return True

    klub_flat = letters_only(klub_naziv)
    side_flat = letters_only(side_name)

    # 2. acronym of one name embedded in the other
    klub_abbrev = name_abbrev(klub_naziv) or ''
    side_abbrev = name_abbrev(side_name) or ''
    if len(klub_abbrev) >= 3 and klub_abbrev in side_flat:
        return True
    if len(side_abbrev) >= 3 and side_abbrev in klub_flat:
        return True

    # 3. long token of one name embedded in the other (e.g. 'kvarner')
    if any(len(tok) >= 4 and tok in side_flat for tok in klub_tokens):
        return True
    return any(len(tok) >= 4 and tok in klub_flat for tok in side_tokens)
|
|
|
|
|
|
def best_search_token(naziv):
    """Choose the single word of a club name to use as the HKS search term.

    The longest distinctive token wins (e.g. 'Škrljevo'). If the name consists
    only of noise words, the longest word of at least three characters is used
    instead (e.g. 'KK Rijeka - Rijeka' -> 'Rijeka'). If there is no such word
    either, ``naziv`` itself is returned unchanged.

    The word is returned in its original spelling (case and diacritics) when
    it can be found in ``naziv``; otherwise the folded form is returned.
    """
    words = re.findall(r'\w+', naziv or '')
    distinctive = name_tokens(naziv)

    if distinctive:
        chosen = max(distinctive, key=len)
    else:
        # Noise-only name: fall back to the longest raw word.
        long_words = [w for w in words if len(w) >= 3]
        if not long_words:
            return naziv
        chosen = _ascii_lower(max(long_words, key=len))

    # Map the folded choice back to the first original word that folds to it.
    return next((w for w in words if _ascii_lower(w) == chosen), chosen)
|
|
|
|
|
|
class HKSHarvester(SportHarvester):
    """Basketball harvester: HKS-CBF site search -> recap articles -> FIBA boxscores.

    Per club the flow is:
      1. search hks-cbf.hr for the club's most distinctive name token;
      2. collect recap-article URLs from the result page (ARTICLE_RE);
      3. collect FIBA LiveStats match ids from each article (FIBA_MATCHID_RE);
      4. download the public ``data.json`` boxscore of each match;
      5. for the side whose name fuzzy-matches the club, upsert every listed
         player and that player's stats for the match.

    ``self.conn``, ``self.log``, ``self.stats``, ``self.slugify``,
    ``self.upsert_clan``, ``self.upsert_stats`` and ``run`` are not defined in
    this class; presumably they are provided by ``SportHarvester`` (not
    visible here) — their exact contracts should be confirmed there.
    """

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs of this sport as dict rows.

        Overrides the base selection: every row of
        ``pgz_sport.v_pgz_priority_klubovi`` for ``SPORT`` is eligible, not
        only financed / yearbook-listed clubs; those are merely sorted first.
        """
        from psycopg2.extras import RealDictCursor
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC NULLS LAST,
                         u_godisnjaku DESC NULLS LAST,
                         id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def __init__(self):
        """Set up the shared HTTP session and the per-run bookkeeping."""
        super().__init__()
        self.http = requests.Session()
        self.http.headers.update({
            "User-Agent": UA,
            "Accept-Language": "hr,en;q=0.8",
            "Accept-Encoding": "gzip, deflate",  # avoid brotli — requests' decoder is flaky on chunked br
        })
        # (klub id, match id) pairs already attempted. Keyed per club so that
        # a match reached first through another club's search is still
        # harvested for the club that actually played in it.
        self._seen_matches = set()
        self._klub_match_count = 0  # reset per-klub

    def _get(self, url, retries=1):
        """GET ``url`` and return the response, or None after all attempts fail.

        Only HTTP 200 counts as success. Any other status or any exception
        triggers another attempt, up to ``retries`` extra attempts, with a
        0.6 s pause between attempts. The last failure reason is logged.
        """
        last_err = None
        for attempt in range(retries + 1):
            try:
                r = self.http.get(url, timeout=HTTP_TIMEOUT)
                if r.status_code == 200:
                    return r
                last_err = f"HTTP {r.status_code}"
            except Exception as e:
                last_err = str(e)
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the caller.
            if attempt < retries:
                time.sleep(0.6)
        self.log(f" GET fail {url}: {last_err}")
        return None

    def scrape_klub(self, page, klub):
        """Harvest one club: search HKS, then process each article found.

        ``page`` is not used by this method; it is presumably part of the
        signature expected by the base class. ``klub`` must provide the keys
        ``'id'`` and ``'naziv'``.

        Stops early once MAX_MATCHES_PER_KLUB matches have been harvested.
        Errors inside one article are counted and logged without aborting
        the remaining articles.
        """
        self._klub_match_count = 0
        token = best_search_token(klub['naziv'])
        if not token or len(token) < 3:
            self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → no usable token, skip")
            return
        search_url = f"https://www.hks-cbf.hr/?s={urllib.parse.quote_plus(token)}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → token='{token}'")

        r = self._get(search_url)
        if not r:
            return
        html = r.text
        time.sleep(HTTP_PAUSE_S)

        # Unique article URLs in page order, capped at MAX_ARTICLES_PER_KLUB.
        seen = set()
        articles = []
        for m in ARTICLE_RE.finditer(html):
            url = m.group(1)
            if url in seen:
                continue
            seen.add(url)
            articles.append(url)
            if len(articles) >= MAX_ARTICLES_PER_KLUB:
                break
        self.log(f" {len(articles)} article(s)")
        if not articles:
            return

        # Record the first article as the club's source URL, but only when
        # the club has no source yet or still carries the yearbook placeholder.
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s
                WHERE id = %s AND (source_url IS NULL OR source_url = '' OR source_url = 'godisnjak_zspgz_2025')
            """, (articles[0], klub['id']))

        for art_url in articles:
            try:
                self._scrape_article(klub, art_url)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ article {art_url}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                self.log(f" cap reached ({MAX_MATCHES_PER_KLUB} matches)")
                break

    def scrape_player(self, page, person_id):
        """Helper: scrape an individual player career page from HKS statistika.

        The Genius Sports widget is JS-rendered, so the page is loaded through
        ``page`` (goto / wait_for_timeout / locator are called on it —
        presumably a Playwright page).

        Returns a dict with ``person_id``, ``url`` and the inner text of up to
        the first three ``<table>`` elements, or None when anything raises.
        """
        url = f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fperson%2F{person_id}"
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            page.wait_for_timeout(4000)  # give the widget time to render
            tables = [t.inner_text() for t in page.locator('table').all()[:3]]
            return {"person_id": person_id, "url": url, "tables": tables}
        except Exception as e:
            self.log(f" ❌ scrape_player({person_id}): {e}")
            return None

    def _scrape_article(self, klub, art_url):
        """Fetch one article and harvest every FIBA match linked from it.

        Category and season are derived from the article URL
        (``/<section>/<year>/``): the section is mapped through LIGA_SECTIONS
        and the season is written as ``<year-1>/<year>``.
        NOTE(review): this treats the URL year as the season's closing year;
        if it is the publication year, autumn articles land in the previous
        season — confirm against real URLs.
        """
        r = self._get(art_url)
        if not r:
            return
        time.sleep(HTTP_PAUSE_S)
        section_match = re.search(r'https://www\.hks-cbf\.hr/([^/]+)/(\d{4})/', art_url)
        section = section_match.group(1) if section_match else ""
        year = int(section_match.group(2)) if section_match else None
        kategorija = LIGA_SECTIONS.get(section)
        sezona = f"{year-1}/{year}" if year else None

        # Unique match ids in order of appearance.
        matchids = list(dict.fromkeys(
            m.group(1) for m in FIBA_MATCHID_RE.finditer(r.text)
        ))
        if not matchids:
            return

        for mid in matchids:
            # De-duplicate per club, not globally: a match id alone would hide
            # the game from the second of two target clubs that met each
            # other, or from the real participant when the article was first
            # reached through an unrelated club's search.
            seen_key = (klub['id'], mid)
            if seen_key in self._seen_matches:
                continue
            self._seen_matches.add(seen_key)
            try:
                self._harvest_match(klub, mid, art_url, kategorija, sezona, section)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ match {mid}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                return

    def _harvest_match(self, klub, matchid, art_url, kategorija, sezona, section):
        """Download one boxscore and upsert the club's players and their stats.

        The boxscore's ``tm`` object is expected to hold sides ``'1'`` and
        ``'2'``; the first side whose name fuzzy-matches ``klub['naziv']`` is
        used. When no side matches, the match is logged and skipped.

        For every player entry with a first or family name, ``upsert_clan``
        and then ``upsert_stats`` are called. A failure for one player is
        counted and logged and does not stop the remaining players.
        ``self._klub_match_count`` is incremented once per processed match,
        even when no player row was added.
        """
        url = f"https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json"
        r = self._get(url, retries=2)
        if not r:
            return
        try:
            data = r.json()
        except Exception as e:
            self.log(f" ⚠️ {matchid} JSON parse: {e}")
            return
        time.sleep(HTTP_PAUSE_S)

        tm = data.get('tm') or {}
        if not tm:
            return

        side_key = None
        for side in ('1', '2'):
            # "or {}" also covers a side that is present but null.
            t = tm.get(side) or {}
            tname = t.get('name') or t.get('nameInternational') or ''
            if fuzzy_klub_match(klub['naziv'], tname):
                side_key = side
                break
        if not side_key:
            n1 = (tm.get('1') or {}).get('name')
            n2 = (tm.get('2') or {}).get('name')
            self.log(f" ⚠️ {matchid} no side match for '{klub['naziv']}' (sides: {n1!r}, {n2!r})")
            return

        team = tm[side_key]
        klub_naziv = team.get('name') or klub['naziv']
        natjecanje = kategorija or section or "košarka"
        # The match id is part of the competition label passed to upsert_stats.
        natjecanje_match = f"{natjecanje} match {matchid}"

        # 'pl' may be a mapping or a sequence; normalise to (key, player) pairs.
        players = team.get('pl') or {}
        iter_pairs = players.items() if isinstance(players, dict) else enumerate(players)

        added = 0
        for pkey, p in iter_pairs:
            if not isinstance(p, dict):
                continue
            ime = (p.get('firstName') or p.get('internationalFirstName') or '').strip()
            prezime = (p.get('familyName') or p.get('internationalFamilyName') or '').strip()
            if not (ime or prezime):
                continue
            full_slug = self.slugify(f"{ime} {prezime}")
            # Fall back to a match-local id when the name yields no slug.
            source_id = full_slug or f"m{matchid}p{pkey}"
            photo = p.get('photoT')
            extra = {
                "shirtNumber": p.get('shirtNumber'),
                "playingPosition": p.get('playingPosition'),
                "scoreboardName": p.get('scoreboardName'),
                # photoT is either an object carrying 'url' or a plain value.
                "photoT": photo.get('url') if isinstance(photo, dict) else photo,
                "matchids_seen": [matchid],
            }
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub['id'],
                    source_id=source_id,
                    ime=ime,
                    prezime=prezime,
                    source_url=art_url,
                    kategorija=kategorija,
                    sezona=sezona,
                    extra=extra,
                )
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_clan {ime} {prezime}: {e}")
                continue

            stats = {
                'nastupi': 1,
                'golovi': None,
                'asistencije': p.get('sAssists'),
                'bodovi': p.get('sPoints'),
                'trice': p.get('sThreePointersMade'),
                'skokovi': p.get('sReboundsTotal'),
                'blokade': p.get('sBlocks'),
                'servis_asovi': None,
                'zuti': None,
                # Five or more personal fouls is stored in the 'crveni' field.
                'crveni': 1 if (p.get('sFoulsPersonal') or 0) >= 5 else 0,
                'minute': parse_mm_ss(p.get('sMinutes')),
            }
            try:
                self.upsert_stats(
                    clan_id=clan_id,
                    sezona=sezona,
                    klub_id=klub['id'],
                    klub_naziv=klub_naziv,
                    natjecanje=natjecanje_match,
                    kategorija=kategorija,
                    stats_dict=stats,
                    raw={'matchid': matchid, 'art_url': art_url, 'player': p},
                )
                self.stats['stats'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_stats {ime} {prezime}: {e}")
                continue
            added += 1

        self.stats['players'] += added
        self._klub_match_count += 1
        self.log(f" ✅ {matchid} side={side_key} '{klub_naziv}' → {added} players")
|
|
|
|
|
|
if __name__ == '__main__':
    # The optional first command-line argument is passed to run() as its
    # limit; without it the limit is 50.
    harvester = HKSHarvester()
    cli_args = sys.argv[1:]
    klub_limit = int(cli_args[0]) if cli_args else 50
    harvester.run(limit=klub_limit)
|