Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers

- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
2026-05-05 13:08:11 +02:00
parent 9fb512932a
commit 1d02c0897d
970 changed files with 268354 additions and 434 deletions
+359 -18
View File
@@ -1,32 +1,373 @@
#!/usr/bin/env python3
"""HKS-CBF + FIBA LiveStats basketball harvester."""
import sys, re
# hks_basketball.py — HKS-CBF + FIBA LiveStats košarka harvester
# v2.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Harvests rosters + per-match player stats for PGŽ priority basketball clubs.
# Path: HKS search (?s=naziv) → match recap articles → FIBA LiveStats matchid →
# https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json
# (public JSON boxscore) → upsert clanovi + clan_kategorije + player_stats.
import sys, re, time, json, urllib.parse
import requests
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
LIGA_SECTIONS = {
"supersport-premijer": "Seniori (Premijer)",
"prva-muska-liga": "Seniori (1.HML)",
"druga-muska-liga": "Seniori (2.HML)",
"premijer-zenska-liga": "Seniorke (Premijer)",
"prva-zenska-liga": "Seniorke (1.ŽKL)",
"druga-zenska-liga": "Seniorke (2.ŽKL)",
"jedinstvena-kadetska-liga":"Kadeti",
"kadetska-liga": "Kadeti",
"juniorska-liga": "Juniori",
"mlade-kategorije": "Mladi",
"mini-kosarka": "Mini",
}
NOISE_TOKENS = {
"kk", "zkk", "kosarkaski", "klub", "udruga", "savez", "skola",
"primorsko-goranske", "primorsko", "goranske", "zupanije",
"rijeka", "rijeke", # too generic on its own; only used if it's the longest token
}
ARTICLE_RE = re.compile(
r'href="(https://www\.hks-cbf\.hr/(' + '|'.join(re.escape(k) for k in LIGA_SECTIONS) + r')/(\d{4})/[^"]+/)"'
)
FIBA_MATCHID_RE = re.compile(
r'fibalivestats\.dcd\.shared\.geniussports\.com/u/HKS/(\d+)/'
)
MAX_ARTICLES_PER_KLUB = 8
MAX_MATCHES_PER_KLUB = 30
HTTP_TIMEOUT = 15
HTTP_PAUSE_S = 0.4
def parse_mm_ss(s):
    """Return the whole-minute part of an ``MM:SS`` clock string.

    The two-digit seconds field must be present but is discarded, so
    ``"12:59"`` yields ``12``. Anything that is not a non-empty string shaped
    like ``M:SS`` .. ``MMM:SS`` yields ``None``.
    """
    if not isinstance(s, str) or not s:
        return None
    clock = re.match(r'^(\d{1,3}):(\d{2})$', s.strip())
    return int(clock.group(1)) if clock else None
def _ascii_lower(s):
t = (s or '').lower()
for old, new in [('š','s'),('č','c'),('ć','c'),('ž','z'),('đ','d')]:
t = t.replace(old, new)
return t
def name_tokens(naziv):
    """Return the distinctive tokens of a club name.

    Punctuation (except hyphens) is replaced by spaces, the text is
    lower-cased and ASCII-folded, and tokens that are in ``NOISE_TOKENS`` or
    shorter than three characters are dropped. Order of appearance is kept.
    """
    # `\w` is Unicode-aware for str patterns, so Croatian letters are already
    # covered; the previous explicit (and partly garbled) letter list was
    # redundant.
    cleaned = _ascii_lower(re.sub(r'[^\w\s-]', ' ', naziv or ''))
    return [
        p for p in re.split(r'\s+', cleaned)
        if p and p not in NOISE_TOKENS and len(p) > 2
    ]
def name_abbrev(naziv):
    """Build an acronym from the significant tokens of a club name.

    Example: 'Flumen Sancti Viti' -> 'fsv'. Returns ``None`` when fewer than
    two significant tokens remain, since a one-letter acronym is useless.
    """
    tokens = name_tokens(naziv)
    if len(tokens) >= 2:
        return ''.join(tok[0] for tok in tokens if tok)
    return None
def fuzzy_klub_match(klub_naziv, side_name):
    """Return True if ``klub_naziv`` likely names the same club as ``side_name``.

    Strategies, tried in order:
      1. token overlap (3+ char tokens, after the noise filter);
      2. acronym match (e.g. 'FSV' = 'Flumen Sancti Viti'), acronym >= 3 chars;
      3. a token of >= 4 chars from one name occurring inside the
         letters-only, ASCII-folded form of the other name.
    """
    a = set(name_tokens(klub_naziv))
    b = set(name_tokens(side_name))
    if a & b:
        return True

    # Fold diacritics BEFORE stripping non-letters. The old order stripped
    # with a character class that had lost some Croatian letters, so e.g.
    # 'Škrljevo' was reduced to 'krljevo' and could never match.
    side_clean = re.sub(r'[^a-z]', '', _ascii_lower(side_name))
    klub_clean = re.sub(r'[^a-z]', '', _ascii_lower(klub_naziv))

    abb_a = name_abbrev(klub_naziv) or ''
    abb_b = name_abbrev(side_name) or ''
    if len(abb_a) >= 3 and abb_a in side_clean:
        return True
    if len(abb_b) >= 3 and abb_b in klub_clean:
        return True

    # Any 4+ char token contained in the other name counts (e.g. 'kvarner').
    if any(len(tok) >= 4 and tok in side_clean for tok in a):
        return True
    return any(len(tok) >= 4 and tok in klub_clean for tok in b)
def best_search_token(naziv):
    """Pick the single most distinctive word of a club name for HKS search.

    The longest noise-filtered token wins (e.g. 'Škrljevo'). If the name
    consists only of noise words, the longest word of >= 3 chars is used
    instead (e.g. 'KK Rijeka - Rijeka' -> 'Rijeka'). The word is returned in
    its original spelling when it can be found in ``naziv``; otherwise the
    folded token is returned. A name with no usable word is returned as-is.
    """
    words = re.findall(r'\w+', naziv or '')
    tokens = name_tokens(naziv)
    if tokens:
        chosen = max(tokens, key=len)
    else:
        long_words = [w for w in words if len(w) >= 3]
        if not long_words:
            return naziv
        chosen = _ascii_lower(max(long_words, key=len))
    # Map the folded token back to the original (diacritic) spelling.
    return next((w for w in words if _ascii_lower(w) == chosen), chosen)
class HKSHarvester(SportHarvester):
SPORT = 'košarka'
SOURCE = 'hks_cbf'
def get_target_klubovi(self, limit=999):
    """Return up to ``limit`` priority clubs for this harvester's sport.

    Overrides the base implementation: rows come from
    ``pgz_sport.v_pgz_priority_klubovi`` filtered only by sport, ordered so
    that financed clubs and clubs listed in the yearbook come first.
    Rows are returned as dict-like records (RealDictCursor).
    """
    from psycopg2.extras import RealDictCursor

    query = """
        SELECT * FROM pgz_sport.v_pgz_priority_klubovi
        WHERE sport = %s
        ORDER BY financiran DESC NULLS LAST,
        u_godisnjaku DESC NULLS LAST,
        id
        LIMIT %s
    """
    with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, (self.SPORT, limit))
        rows = cur.fetchall()
    return rows
def __init__(self):
    """Set up the shared HTTP session and per-run bookkeeping."""
    super().__init__()
    session = requests.Session()
    session.headers.update({
        "User-Agent": UA,
        "Accept-Language": "hr,en;q=0.8",
        # avoid brotli — requests' decoder is flaky on chunked br
        "Accept-Encoding": "gzip, deflate",
    })
    self.http = session
    self._seen_matches = set()   # match ids already processed in this run
    self._klub_match_count = 0   # matches harvested for the current club
def _get(self, url, retries=1):
    """GET ``url`` through the shared session, retrying on failure.

    Returns the response object on HTTP 200. After ``retries + 1`` failed
    attempts (non-200 status or any exception) the last error is logged and
    ``None`` is returned. The call is best-effort and never raises.
    """
    last_err = None
    attempts = retries + 1
    for attempt in range(attempts):
        try:
            r = self.http.get(url, timeout=HTTP_TIMEOUT)
            if r.status_code == 200:
                return r
            last_err = f"HTTP {r.status_code}"
        except Exception as e:  # best-effort fetch: report, never crash the run
            last_err = str(e)
        # Back off only when another attempt follows; sleeping after the
        # final failure just slowed the whole harvest down.
        if attempt < attempts - 1:
            time.sleep(0.6)
    self.log(f" GET fail {url}: {last_err}")
    return None
def scrape_klub(self, page, klub):
    """Harvest rosters and match stats for one club via HKS search.

    Searches hks-cbf.hr for the club's most distinctive name token, collects
    up to ``MAX_ARTICLES_PER_KLUB`` league article URLs from the result page,
    stores the first one as the club's ``source_url`` (only when that field is
    empty or still holds the yearbook placeholder) and scrapes each article
    until ``MAX_MATCHES_PER_KLUB`` matches have been harvested.

    ``page`` is accepted for interface compatibility and is not used here.
    """
    # Stale lines from the previous version (a search URL built from the full
    # club name and a duplicate log call) were removed; only the token-based
    # search below is used.
    self._klub_match_count = 0
    token = best_search_token(klub['naziv'])
    if not token or len(token) < 3:
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → no usable token, skip")
        return
    search_url = f"https://www.hks-cbf.hr/?s={urllib.parse.quote_plus(token)}"
    self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → token='{token}'")
    r = self._get(search_url)
    if not r:
        return
    html = r.text
    time.sleep(HTTP_PAUSE_S)

    # Unique article URLs in page order, capped.
    seen = set()
    articles = []
    for m in ARTICLE_RE.finditer(html):
        url = m.group(1)
        if url in seen:
            continue
        seen.add(url)
        articles.append(url)
        if len(articles) >= MAX_ARTICLES_PER_KLUB:
            break
    self.log(f" {len(articles)} article(s)")
    if not articles:
        return

    with self.conn.cursor() as cur:
        cur.execute("""
            UPDATE pgz_sport.klubovi
            SET source_url = %s
            WHERE id = %s AND (source_url IS NULL OR source_url = '' OR source_url = 'godisnjak_zspgz_2025')
        """, (articles[0], klub['id']))

    for art_url in articles:
        try:
            self._scrape_article(klub, art_url)
        except Exception as e:  # one bad article must not stop the club
            self.stats['errors'] += 1
            self.log(f" ❌ article {art_url}: {e}")
        if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
            self.log(f" cap reached ({MAX_MATCHES_PER_KLUB} matches)")
            break
def scrape_player(self, page, person_id):
    """Scrape a player's career page from the HKS statistics section.

    The Genius Sports widget is JS-rendered, so a Playwright ``page`` is
    required. Returns ``{"person_id", "url", "tables"}`` where ``tables``
    holds the inner text of at most the first three ``<table>`` elements, or
    ``None`` if navigation or extraction fails (the error is logged).
    """
    url = f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fperson%2F{person_id}"
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=20000)
        # Give the embedded widget time to render its tables.
        page.wait_for_timeout(4000)
        tables = [t.inner_text() for t in page.locator('table').all()[:3]]
        return {"person_id": person_id, "url": url, "tables": tables}
    except Exception as e:
        # Stale club-link code that referenced an undefined `klub` (and a
        # duplicate log call) was removed from this method.
        self.log(f" scrape_player({person_id}): {e}")
        return None
def _scrape_article(self, klub, art_url):
    """Harvest every FIBA LiveStats match referenced by one HKS article.

    The league section and year are taken from the article URL; the section
    maps to a category via ``LIGA_SECTIONS`` and the year to a season label
    ``"{year-1}/{year}"``. Each distinct match id found in the article HTML is
    passed to ``_harvest_match`` until the per-club match cap is reached.
    """
    r = self._get(art_url)
    if not r:
        return
    time.sleep(HTTP_PAUSE_S)

    section_match = re.search(r'https://www\.hks-cbf\.hr/([^/]+)/(\d{4})/', art_url)
    section = section_match.group(1) if section_match else ""
    year = int(section_match.group(2)) if section_match else None
    kategorija = LIGA_SECTIONS.get(section)
    # NOTE(review): the URL carries only a year, so autumn articles may belong
    # to the season starting that year — confirm against the HKS URL scheme.
    sezona = f"{year-1}/{year}" if year else None

    # Distinct match ids in order of first appearance.
    matchids = list(dict.fromkeys(FIBA_MATCHID_RE.findall(r.text)))
    if not matchids:
        return

    for mid in matchids:
        # De-duplicate per (club, match). Keying on the match id alone meant
        # a match between two target clubs — or one where side matching
        # failed for the first club — was never harvested for the second.
        seen_key = (klub['id'], mid)
        if seen_key in self._seen_matches:
            continue
        self._seen_matches.add(seen_key)
        try:
            self._harvest_match(klub, mid, art_url, kategorija, sezona, section)
        except Exception as e:  # keep going with the remaining matches
            self.stats['errors'] += 1
            self.log(f" ❌ match {mid}: {e}")
        if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
            return
def _harvest_match(self, klub, matchid, art_url, kategorija, sezona, section):
    """Fetch one FIBA LiveStats box score and upsert the club's players.

    Downloads ``data.json`` for ``matchid``, picks the side ('1' or '2')
    whose team name fuzzy-matches ``klub['naziv']``, and for every named
    player on that side upserts the member record and a one-appearance stats
    row. Returns ``None``; counters in ``self.stats`` and
    ``self._klub_match_count`` are updated as a side effect.
    """
    url = f"https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json"
    r = self._get(url, retries=2)
    if not r:
        return
    try:
        data = r.json()
    except Exception as e:
        self.log(f" ⚠️ {matchid} JSON parse: {e}")
        return
    time.sleep(HTTP_PAUSE_S)

    tm = data.get('tm') or {}
    if not tm:
        return

    side_key = None
    for side in ('1', '2'):
        # `or {}` also covers a side that is present but null in the JSON;
        # `tm.get(side, {})` would have returned None and crashed on .get().
        t = tm.get(side) or {}
        tname = t.get('name') or t.get('nameInternational') or ''
        if fuzzy_klub_match(klub['naziv'], tname):
            side_key = side
            break
    if not side_key:
        n1 = (tm.get('1') or {}).get('name')
        n2 = (tm.get('2') or {}).get('name')
        self.log(f" ⚠️ {matchid} no side match for '{klub['naziv']}' (sides: {n1!r}, {n2!r})")
        return

    team = tm[side_key]
    klub_naziv = team.get('name') or klub['naziv']
    natjecanje = kategorija or section or "košarka"
    # One stats row per match: the match id is part of the competition label.
    natjecanje_match = f"{natjecanje} match {matchid}"

    # 'pl' may be a mapping keyed by player slot or a plain list.
    players = team.get('pl') or {}
    iter_pairs = list(players.items()) if isinstance(players, dict) else list(enumerate(players))

    added = 0
    for pkey, p in iter_pairs:
        if not isinstance(p, dict):
            continue
        ime = (p.get('firstName') or p.get('internationalFirstName') or '').strip()
        prezime = (p.get('familyName') or p.get('internationalFamilyName') or '').strip()
        if not (ime or prezime):
            continue
        full_slug = self.slugify(f"{ime} {prezime}")
        source_id = full_slug or f"m{matchid}p{pkey}"
        photo = p.get('photoT')
        extra = {
            "shirtNumber": p.get('shirtNumber'),
            "playingPosition": p.get('playingPosition'),
            "scoreboardName": p.get('scoreboardName'),
            "photoT": (photo or {}).get('url') if isinstance(photo, dict) else photo,
            "matchids_seen": [matchid],
        }
        try:
            clan_id = self.upsert_clan(
                klub_id=klub['id'],
                source_id=source_id,
                ime=ime,
                prezime=prezime,
                source_url=art_url,
                kategorija=kategorija,
                sezona=sezona,
                extra=extra,
            )
        except Exception as e:
            self.stats['errors'] += 1
            self.log(f" ❌ upsert_clan {ime} {prezime}: {e}")
            continue

        stats = {
            'nastupi': 1,
            'golovi': None,
            'asistencije': p.get('sAssists'),
            'bodovi': p.get('sPoints'),
            'trice': p.get('sThreePointersMade'),
            'skokovi': p.get('sReboundsTotal'),
            'blokade': p.get('sBlocks'),
            'servis_asovi': None,
            'zuti': None,
            # Five personal fouls (fouled out) are recorded in the 'crveni' column.
            'crveni': 1 if (p.get('sFoulsPersonal') or 0) >= 5 else 0,
            'minute': parse_mm_ss(p.get('sMinutes')),
        }
        try:
            self.upsert_stats(
                clan_id=clan_id,
                sezona=sezona,
                klub_id=klub['id'],
                klub_naziv=klub_naziv,
                natjecanje=natjecanje_match,
                kategorija=kategorija,
                stats_dict=stats,
                raw={'matchid': matchid, 'art_url': art_url, 'player': p},
            )
            self.stats['stats'] += 1
        except Exception as e:
            self.stats['errors'] += 1
            self.log(f" ❌ upsert_stats {ime} {prezime}: {e}")
            continue
        added += 1

    self.stats['players'] += added
    self._klub_match_count += 1
    self.log(f"{matchid} side={side_key} '{klub_naziv}'{added} players")
if __name__ == '__main__':
HKSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
+405 -10
View File
@@ -1,21 +1,416 @@
#!/usr/bin/env python3
"""HOS volleyball harvester."""
import sys
# hos_volleyball.py
# v1.0.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# HOS odbojka harvester: hos-cvf.hr natjecanja + standings, hos-web.dataproject.com match stats.
# Targets all 77 PGŽ odbojka klubova.
import sys, re, json, time
import html as ihtml
from datetime import datetime
import requests
from psycopg2.extras import RealDictCursor
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}
def _http_get(url, retries=1):
    """GET ``url`` and return the response body as text.

    Makes up to ``retries + 1`` attempts with a 2 s pause between them. A
    response counts as a success only if it is HTTP 200 with a non-empty body.

    Raises:
        RuntimeError: when every attempt failed; the message carries the last
            error seen.
    """
    last = None
    attempts = retries + 1
    for i in range(attempts):
        try:
            r = requests.get(url, headers=HDR, timeout=25)
            if r.status_code == 200 and r.text:
                return r.text
            # Distinguish an empty 200 from a real HTTP error in the message.
            last = "HTTP 200 with empty body" if r.status_code == 200 else f"HTTP {r.status_code}"
        except Exception as e:
            last = str(e)
        # No point in waiting after the final attempt.
        if i < attempts - 1:
            time.sleep(2)
    raise RuntimeError(f"GET {url} failed: {last}")
def _strip_tags(s):
return ihtml.unescape(re.sub(r'<[^>]+>', '', s or '')).strip()
def _parse_standings(html):
"""Return list of {poz, klub, uk, pob, por, bod} from first plausible table."""
tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL | re.IGNORECASE)
for tbl in tables:
rows = re.findall(r'<tr[^>]*>(.+?)</tr>', tbl, re.DOTALL | re.IGNORECASE)
out = []
for row in rows:
cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
clean = [_strip_tags(c) for c in cells]
if not clean or not clean[0]:
continue
if clean[0] in ('', '#', 'Pos', 'Poz', 'R'):
continue
try:
m = re.match(r'(\d+)\.?', clean[0])
if not m:
continue
poz = int(m.group(1))
if len(clean) < 5:
continue
klub = clean[2] if (len(clean) >= 7 and not re.match(r'^\d+$', clean[2])) else clean[1]
if not klub or re.match(r'^\d+$', klub):
continue
numcells = [c for c in clean if re.match(r'^-?\d+$', c)]
if len(numcells) < 4:
continue
tail = numcells[1:]
uk = int(tail[0])
pob = int(tail[1])
por = int(tail[2])
bod = int(tail[-1])
out.append({'poz': poz, 'klub': klub, 'uk': uk, 'pob': pob, 'por': por, 'bod': bod})
except Exception:
continue
if out and len(out) >= 2:
return out
return []
def _parse_title(html):
    """Return a page title: the first ``<h1>`` if longer than 4 chars, else ``<title>``.

    Returns ``None`` when neither element is present. A ``<title>`` is
    returned even if it is empty after tag stripping.
    """
    flags = re.DOTALL | re.IGNORECASE
    heading = re.search(r'<h1[^>]*>(.*?)</h1>', html, flags)
    if heading:
        text = _strip_tags(heading.group(1))
        if text and len(text) > 4:
            return text
    doc_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    return _strip_tags(doc_title.group(1)) if doc_title else None
def _detect_razina_spol(title):
t = (title or '').lower()
razina = None
for key, lab in [
('superliga 2', 'Superliga 2'),
('superliga', 'Superliga'),
('1. liga', '1.liga'), ('1.liga', '1.liga'),
('2. liga', '2.liga'), ('2.liga', '2.liga'),
('3. liga', '3.liga'), ('3.liga', '3.liga'),
('kup', 'Kup'),
('kadeti', 'Kadeti'), ('kadetkinje', 'Kadetkinje'),
('juniori', 'Juniori'), ('juniorke', 'Juniorke'),
('mini', 'Mini'),
('beach', 'Beach'), ('pijesku', 'Beach'),
]:
if key in t:
razina = lab
break
spol = None
if re.search(r'\(\s*[mM]\s*\)|\bmu[šs]ki\b|\bmuska\b|\bjuniori\b|\bkadeti\b', t):
spol = 'M'
elif re.search(r'\(\s*[ŽzZ]\s*\)|\bžen|\bjuniorke\b|\bkadetkinje\b', t):
spol = 'Ž'
return razina, spol
class HOSHarvester(SportHarvester):
SPORT = 'odbojka'
SOURCE = 'hos'
def scrape_klub(self, page, klub):
# HOS-CVF.hr search
self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
BASE_CVF = 'https://hos-cvf.hr'
BASE_DP = 'https://hos-web.dataproject.com'
SEZONA = '2025/26'
MAX_NATJ = 80
MAX_MATCHES_PER_KLUB = 5
MAX_MATCHES_TOTAL = 120
def __init__(self):
    """Initialise the per-run caches and the extra stats counters."""
    super().__init__()
    self._natj_by_klub = {}            # klub_id -> list of standings entries
    self._matches_for_klub = {}        # klub_id -> list of dataproject match refs
    self._dp_match_seen = set()        # dataproject match ids already opened
    self._matches_scraped_total = 0    # checked against MAX_MATCHES_TOTAL
    # Counters specific to this harvester; existing values are kept.
    for counter in ('natjecanja', 'tablice', 'matches'):
        self.stats.setdefault(counter, 0)
def get_target_klubovi(self, limit=999):
    """Return up to ``limit`` priority volleyball clubs as dict-like rows.

    Clubs that are financed or listed in the yearbook sort first, then by id.
    """
    query = """
        SELECT * FROM pgz_sport.v_pgz_priority_klubovi
        WHERE sport = 'odbojka'
        ORDER BY (financiran OR u_godisnjaku) DESC, id
        LIMIT %s
    """
    with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, (limit,))
        rows = cur.fetchall()
    return rows
def _discover_natjecanje_ids(self):
    """Return the competition ids linked from the hos-cvf.hr front page.

    Ids are taken from ``natjecanje.php?id=N`` links, de-duplicated, sorted
    ascending and capped at ``MAX_NATJ``. Returns ``[]`` (after logging) if
    the page cannot be fetched.
    """
    # Stale Playwright lines from the previous version were removed: they
    # used an undefined `page`, so the resulting NameError was swallowed by
    # the handler below and discovery always came back empty.
    try:
        html = _http_get(self.BASE_CVF + '/')
    except Exception as e:
        self.log(f"⚠ failed to load hos-cvf.hr: {e}")
        return []
    ids = sorted({int(m) for m in re.findall(r'natjecanje\.php\?id=(\d+)', html)})
    self.log(f" found {len(ids)} natjecanje ids on hos-cvf.hr")
    return ids[:self.MAX_NATJ]
def _upsert_natjecanje(self, nid, naziv, razina, spol, source_url):
with self.conn.cursor() as cur:
cur.execute("""
INSERT INTO pgz_sport.natjecanja
(sport, naziv, razina, sezona, spol, source, external_id, external_url,
source_id, source_url, status, updated_at)
VALUES ('odbojka', %s, %s, %s, %s, 'hos_cvf', %s, %s, %s, %s, 'aktivno', now())
ON CONFLICT (source, external_id) DO UPDATE
SET naziv = EXCLUDED.naziv,
razina = COALESCE(EXCLUDED.razina, pgz_sport.natjecanja.razina),
spol = COALESCE(EXCLUDED.spol, pgz_sport.natjecanja.spol),
sezona = EXCLUDED.sezona,
source_url = EXCLUDED.source_url,
external_url = EXCLUDED.external_url,
updated_at = now()
RETURNING id
""", (naziv, razina, self.SEZONA, spol, str(nid), source_url, str(nid), source_url))
return cur.fetchone()[0]
def _find_klub_id(self, klub_naziv):
with self.conn.cursor() as cur:
cur.execute("""
SELECT id, region FROM pgz_sport.klubovi
WHERE sport = 'odbojka'
AND (LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s))
ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
id
LIMIT 1
""", (klub_naziv, f"%{klub_naziv}%"))
r = cur.fetchone()
if r:
return r[0]
target = self.slugify(klub_naziv)
toks = [t for t in target.split('-') if len(t) > 3]
if not toks:
return None
with self.conn.cursor() as cur:
cur.execute("""
SELECT id, naziv FROM pgz_sport.klubovi
WHERE sport='odbojka' AND aktivan
""")
best = None
best_score = 0
for kid, knaz in cur.fetchall():
kslug = self.slugify(knaz)
score = sum(1 for t in toks if t in kslug)
if score > best_score:
best_score = score
best = kid
if best_score >= max(1, len(toks) - 1):
return best
return None
def _replace_tablice(self, natj_id, source_url, rows, spol):
with self.conn.cursor() as cur:
cur.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
for r in rows:
klub_id = self._find_klub_id(r['klub'])
cur.execute("""
INSERT INTO pgz_sport.natjecanja_tablice
(natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi,
source, source_url, updated_at, extra_data)
VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
'hos_cvf', %s, now(), %s::jsonb)
ON CONFLICT (natjecanje_id, klub_naziv) DO UPDATE SET
pozicija = EXCLUDED.pozicija,
odigrano = EXCLUDED.odigrano,
pobjede = EXCLUDED.pobjede,
porazi = EXCLUDED.porazi,
bodovi = EXCLUDED.bodovi,
klub_id = COALESCE(EXCLUDED.klub_id, pgz_sport.natjecanja_tablice.klub_id),
source_url = EXCLUDED.source_url,
updated_at = now()
""", (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['por'],
r['bod'], source_url, json.dumps({'spol': spol})))
if klub_id:
self._natj_by_klub.setdefault(klub_id, []).append({
'natj_id': natj_id,
'natj_naziv': None,
'url': source_url,
'klub_naziv': r['klub'],
'pozicija': r['poz'],
})
def _harvest_natjecanje(self, nid):
    """Fetch one HOS competition page and store its metadata and standings.

    Upserts the competition, replaces its standings when a table is found,
    back-fills the competition name on the cached per-club entries and
    queues the dataproject match ids found on the page for every club that
    appears in this competition. A fetch failure is logged and counted.
    """
    url = f"{self.BASE_CVF}/natjecanje.php?id={nid}"
    try:
        html = _http_get(url)
    except Exception as e:
        self.log(f" ⚠ natj {nid}: {e}")
        self.stats['errors'] += 1
        return

    title = _parse_title(html) or f"HOS natjecanje #{nid}"
    razina, spol = _detect_razina_spol(title)
    natj_id = self._upsert_natjecanje(nid, title, razina, spol, url)

    standings = _parse_standings(html)
    if standings:
        self._replace_tablice(natj_id, url, standings, spol)
        self.stats['tablice'] += len(standings)
        # _replace_tablice leaves natj_naziv unset; fill it in now.
        for entries in self._natj_by_klub.values():
            for entry in entries:
                if entry['natj_id'] == natj_id and entry.get('natj_naziv') is None:
                    entry['natj_naziv'] = title

    mids = sorted({int(m) for m in re.findall(r'MatchStatistics\.aspx\?mID=(\d+)', html, re.IGNORECASE)})
    if mids:
        clubs_here = [
            kid for kid, entries in self._natj_by_klub.items()
            if any(entry['natj_id'] == natj_id for entry in entries)
        ]
        for kid in clubs_here:
            self._matches_for_klub.setdefault(kid, []).extend(
                {'mid': mid, 'natj_id': natj_id, 'natj_naziv': title} for mid in mids
            )
    self.stats['natjecanja'] += 1
def _harvest_federation(self):
self.log("📋 preflight: hos-cvf.hr natjecanja discovery")
ids = self._discover_natjecanje_ids()
for nid in ids:
self._harvest_natjecanje(nid)
self.log(f" preflight done: natjecanja={self.stats['natjecanja']}, "
f"tablice={self.stats['tablice']}, klubova_with_match={len(self._natj_by_klub)}")
def _scrape_dp_match(self, page, mid, klub_id, klub_naziv, natj_naziv):
if mid in self._dp_match_seen:
return 0
if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
return 0
url = f"{self.BASE_DP}/MatchStatistics.aspx?mID={mid}"
added = 0
try:
page.goto(url, wait_until='domcontentloaded', timeout=30000)
try:
page.wait_for_load_state('networkidle', timeout=10000)
except Exception:
pass
self._dp_match_seen.add(mid)
self._matches_scraped_total += 1
self.stats['matches'] += 1
rows = []
for sel in ['table.statTbl tr', 'table.report tr', 'table tr']:
try:
txts = page.locator(sel).all_inner_texts()
except Exception:
txts = []
if txts:
rows = txts
break
for txt in rows:
line = re.sub(r'\s+', ' ', txt.replace('\t', ' ')).strip()
if not line:
continue
m = re.match(r'^(\d{1,3})\s+([A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+(?:\s+[A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+)+)\b(.*)$', line)
if not m:
continue
jersey = m.group(1)
fullname = m.group(2).strip()
tail = m.group(3).strip()
nums = [int(x) for x in re.findall(r'-?\d+', tail)]
if not nums:
continue
pts = nums[0] if len(nums) >= 1 else None
aces = nums[5] if len(nums) > 5 else None
blocks = nums[7] if len(nums) > 7 else None
parts = fullname.split()
if parts[0].isupper() and len(parts) >= 2:
prezime = parts[0].title()
ime = ' '.join(parts[1:])
else:
ime = parts[0]
prezime = ' '.join(parts[1:]) if len(parts) > 1 else ''
slug_key = self.slugify(fullname)
source_id = f"dp:{mid}:{jersey}:{slug_key}"
try:
clan_id = self.upsert_clan(
klub_id=klub_id, source_id=source_id,
ime=ime, prezime=prezime,
source_url=url, kategorija='senior', sezona=self.SEZONA,
extra={'dp_match_id': mid, 'jersey': jersey},
)
self.upsert_stats(
clan_id=clan_id, sezona=self.SEZONA,
klub_id=klub_id, klub_naziv=klub_naziv,
natjecanje=natj_naziv, kategorija='senior',
stats_dict={
'nastupi': 1,
'bodovi': pts,
'servis_asovi': aces,
'blokade': blocks,
},
raw={'mid': mid, 'jersey': jersey, 'name': fullname, 'tail_nums': nums},
)
self.stats['players'] += 1
self.stats['stats'] += 1
added += 1
except Exception as e:
self.log(f" ⚠ upsert player '{fullname}': {e}")
except Exception as e:
self.log(f" ⚠ dp match {mid}: {e}")
self.stats['errors'] += 1
return added
def scrape_klub(self, page, klub):
    """Attach cached HOS competition data to one club and scrape its matches.

    Uses the caches filled by the federation preflight: looks up the club's
    standings entries (by id, else by slug-token similarity to any cached
    club name), records the first competition URL on the club row, then
    scrapes queued dataproject matches until ``MAX_MATCHES_PER_KLUB`` matches
    have yielded players or ``MAX_MATCHES_TOTAL`` is reached.
    """
    self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")

    def similar_entry():
        # First cached entry whose club slug contains all but at most one of
        # this club's slug tokens (tokens longer than 3 chars).
        tokens = [t for t in self.slugify(klub['naziv']).split('-') if len(t) > 3]
        if not tokens:
            return None
        needed = max(1, len(tokens) - 1)
        for ents in list(self._natj_by_klub.values()):
            for entry in ents:
                slug = self.slugify(entry['klub_naziv'])
                if sum(1 for t in tokens if t in slug) >= needed:
                    return entry
        return None

    entries = list(self._natj_by_klub.get(klub['id'], []))
    if not entries:
        fallback = similar_entry()
        if fallback is not None:
            entries.append(fallback)

    if entries:
        first = entries[0]
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = COALESCE(NULLIF(source_url, ''), %s),
                source = COALESCE(source, 'hos_cvf'),
                last_scraped_at = now()
                WHERE id = %s
            """, (first['url'], klub['id']))
        naz_list = ', '.join(sorted({(e.get('natj_naziv') or '?') for e in entries}))[:120]
        self.log(f"{len(entries)} natjecanja: {naz_list}")
    else:
        self.log(f" · no HOS natjecanje hit")

    match_bucket = self._matches_for_klub.get(klub['id'], [])
    if not match_bucket and entries:
        # Nothing queued under this id: borrow the queue of a club that
        # shares the first competition.
        wanted_natj = entries[0]['natj_id']
        for kid, ents in self._natj_by_klub.items():
            if any(e['natj_id'] == wanted_natj for e in ents):
                match_bucket = self._matches_for_klub.get(kid, [])
                if match_bucket:
                    break

    scraped_for_klub = 0
    for ref in match_bucket:
        if scraped_for_klub >= self.MAX_MATCHES_PER_KLUB:
            break
        if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
            break
        stored = self._scrape_dp_match(page, ref['mid'], klub['id'], klub['naziv'], ref['natj_naziv'] or 'HOS')
        if stored > 0:
            scraped_for_klub += 1
    if scraped_for_klub:
        self.log(f" ↳ scraped {scraped_for_klub} match(es) from dataproject")
def run(self, limit=999):
    """Run the federation preflight, then the base per-club harvest loop.

    The preflight must come first: ``scrape_klub`` only reads the caches that
    ``_harvest_federation`` fills.
    """
    self._harvest_federation()
    super().run(limit)
if __name__ == '__main__':
HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 999)
+484 -22
View File
@@ -1,27 +1,489 @@
#!/usr/bin/env python3
"""HRS handball harvester."""
import sys
"""
hrs_handball.py — HRS Rukomet harvester v1.0
Authors: dradulic@outlook.com / damir@rinet.one
Date: 2026-05-05
Description:
Scrapes Hrvatski rukometni savez (HRS) competition data via the
sportinfocentar2.com JSON endpoints (no HTML rendering needed):
- https://www.sportinfocentar2.com/coman/natjecanje{LID}.js
→ league JSON: lige[].utakmice[] {broj, e1, e2, k1, k2, d, ...}
- https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={MID}
→ per-match player roster + box-score stats
Fuzzy-matches HRS team names to PGŽ priority handball clubs (~71) in
pgz_sport.klubovi, then aggregates each player's per-(klub, natjecanje, sezona)
totals into pgz_sport.player_stats; upserts pgz_sport.clanovi + clan_kategorije.
Run:
python3 /opt/pgz-sport/scripts/sport_harvesters/hrs_handball.py [LIMIT_NATJECANJA]
"""
import os, sys, re, json, time, unicodedata
import urllib.request
import urllib.error
from datetime import datetime, date
from collections import defaultdict
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
from __base import SportHarvester # noqa: E402
class HRSHarvester(SportHarvester):
SPORT = 'rukomet'
SOURCE = 'hrs'
def scrape_klub(self, page, klub):
url = f"https://hrs.hr/?s={klub['naziv'].replace(' ','+')}"
self.log(f" 🤾 Klub {klub['id']} {klub['naziv']}")
import psycopg2
from psycopg2.extras import RealDictCursor
# ─── HRS league IDs (HRS top menu, 2025/26) ────────────────────────────────
HRS_NATJECANJA = [
# Seniori M
1632, # Paket24 Premijer liga (M)
1633, # 1. HRL Sjever - M
1634, # 1. HRL Jug - M
1639, # 2. HRL Istok - M
1641, # 2. HRL Zapad - M ★ PGŽ
1642, # 2. HRL Sjever - M
1643, # 2. HRL Jug - M
1675, # 3. HRL Istok - M
1676, # 3. HRL Sjever - M
1677, # 3. HRL Središte - M
1678, # 3. HRL Zapad - M ★ PGŽ
1384, # Međužupanijska liga
# Seniori Ž
1629, # 1. HRL Žene
1637, # 2. HRL Sjever - Ž
1638, # 2. HRL Zapad - Ž ★ PGŽ
1644, # 2. HRL Jug - Ž
1671, # 3. HRL Sjever - Ž
1672, # 3. HRL Zapad - Ž ★ PGŽ
1673, # 3. HRL Središte - Ž
1674, # 3. HRL Istok - Ž
# Mladi M
1389, # 1. HRL U18 - M
1705, # 1. HRL U17 - M
1763, # 2. HRL U17 - M
1706, # 1. HRL U15 - M
1716, # 2. HRL U15 - M
1707, # 1. HRL U13 - M
1717, # 2. HRL U13 - M
1746, # 1. HRL U12 - M
1709, # 1. HRL U11 - M
# Dodatno (linkovi iz sidebara — ako vrate natjecanjeobjekt)
1620, 1622, 1625, 1626, 1645, 1646,
1761, 1762, 1773, 1753,
1774, 1776, 1777, 1783, 1784, 1785, 1786, 1787, 1788,
1796, 1797, 1818, 1834,
1765, 1766,
# Kupovi
1092, 1302, 1303, 1441,
]
UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
def http_text(url, timeout=20, retries=2):
    """GET ``url`` and return the body decoded as text, retrying transient errors.

    sportinfocentar2 files are mostly UTF-8 but occasionally contain stray
    cp1250 bytes (e.g. typographic quotes from Word), so a strict UTF-8
    decode can fail. Strategy: strict UTF-8 first; on failure fall back to
    UTF-8 with replacement characters, which keeps the bulk of the file
    Unicode-correct rather than re-decoding everything as latin-1.

    Raises:
        RuntimeError: when all ``retries + 1`` attempts failed with a
            URL/HTTP/timeout error.
    """
    # Stale Playwright lines from the old scrape_klub (page.goto, self.log)
    # were interleaved into this retry loop; neither `page` nor `self`
    # exists here, so they were removed.
    last = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
        except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
            last = e
            if attempt < retries:
                time.sleep(1.5 * (attempt + 1))  # linear back-off
            continue
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return raw.decode("utf-8", errors="replace")
    raise RuntimeError(f"GET {url} failed: {last}")
if __name__ == '__main__':
HRSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
_UNQUOTED_KEY_RE = re.compile(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
_TRAIL_COMMA_RE = re.compile(r',(\s*[}\]])')
_LEADING_ZERO_RE = re.compile(r'([\s:,\[])0+(\d)')
# A complete double-quoted string literal, including escaped characters.
_JSON_STRING_RE = re.compile(r'("(?:\\.|[^"\\])*")')


def parse_var_json(body, var_prefix):
    """Parse the lazy-JSON payload that sportinfocentar2 serves as JavaScript.

    Strips an optional ``var <var_prefix> =`` / ``<var_prefix> =`` wrapper and
    a trailing semicolon, then normalises the dialect (unquoted keys, numbers
    with leading zeros, trailing commas) and returns ``json.loads`` of the
    result.

    Raises:
        json.JSONDecodeError: if the normalised text is still not valid JSON.
    """
    body = body.strip()
    # Both forms occur: `var foo = ...` (coman/) and bare `foo = ...` (ziceri/).
    m = re.match(rf"^\s*(?:var\s+)?{re.escape(var_prefix)}\s*=\s*", body, re.I)
    if m:
        body = body[m.end():]
    body = body.rstrip().rstrip(";").rstrip()

    # Normalise only the text BETWEEN string literals. Applied to the whole
    # body, the regexes also rewrote string contents: "18:05" became "18:5"
    # and "Liga, grupa: A" gained a bogus quoted key.
    pieces = _JSON_STRING_RE.split(body)
    for i in range(0, len(pieces), 2):  # even indexes are non-string text
        chunk = _UNQUOTED_KEY_RE.sub(r'\1"\2":', pieces[i])
        # JSON rejects JS-style leading-zero numbers such as `018`.
        chunk = _LEADING_ZERO_RE.sub(r'\1\2', chunk)
        pieces[i] = _TRAIL_COMMA_RE.sub(r'\1', chunk)
    return json.loads("".join(pieces))
def derive_sezona(d):
    """Return the Croatian sport season label for a date.

    Jul–Dec belongs to ``YYYY/YYYY+1``; Jan–Jun to ``YYYY-1/YYYY``. ``d`` may
    be a date-like object or a string starting with ``YYYY-MM-DD``. Falsy
    input or an unparseable string yields ``None``.
    """
    if not d:
        return None
    if isinstance(d, str):
        try:
            d = datetime.strptime(d[:10], "%Y-%m-%d").date()
        except Exception:
            return None
    start_year = d.year if d.month >= 7 else d.year - 1
    return f"{start_year}/{start_year + 1}"
def derive_kategorija(naziv):
    """Map a competition name to a handball age category.

    Looks for ``uNN`` / ``u-NN`` (case-insensitive) and returns the label of
    the first age group found, checking youngest first; names without an age
    marker are ``"seniori"``.
    """
    name = (naziv or "").lower()
    age_groups = (
        ("11", "mini U11"),
        ("12", "mini U12"),
        ("13", "dječaci U13"),
        ("15", "mlađi kadeti U15"),
        ("17", "kadeti U17"),
        ("18", "juniori U18"),
    )
    for age, label in age_groups:
        if f"u{age}" in name or f"u-{age}" in name:
            return label
    return "seniori"
# ─── Klub-name normalization for fuzzy match ──────────────────────────────
_DIA = str.maketrans("čćžšđČĆŽŠĐ", "cczsdcczsd")
_PREFIX_RE = re.compile(
    r"^(?:"
    r"hrvatski\s+|muski\s+|zenski\s+|"
    r"rukometni\s+(?:klub|savez)\s+|"
    # \b: strip the abbreviation only as a whole word, so names that merely
    # START with these letters (e.g. 'Mrkopalj') are left intact.
    r"(?:hrk|mrk|zrk|rk)\b"
    r")\s*",
    re.I,
)
_TRAIL_LOC_RE = re.compile(r"\s*-\s*[a-z][a-z\s]*$", re.I)
_SUFFIX_2_RE = re.compile(r"\s+(?:ii|2)\s*$", re.I)
_NUMERIC_LIGA_RE = re.compile(r"\d+\.\s+u\s+.*$", re.I)
_PAREN_RE = re.compile(r"\([^)]*\)")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_klub_name(name):
    """Reduce a Croatian handball club name to a comparable lower-case key.

    Steps, in order: drop parenthesised text, drop a trailing
    ``"<n>. u ..."`` league note, fold diacritics, lower-case, strip leading
    generic prefixes repeatedly (hrvatski / muski / zenski / rukometni klub /
    RK, ŽRK, MRK, HRK), strip a trailing second-team marker (II / 2), strip a
    trailing ``- <place>`` part, and collapse everything that is not
    ``[a-z0-9]`` to single spaces. Falsy input yields ``""``.
    """
    if not name:
        return ""
    s = str(name).strip()
    s = _PAREN_RE.sub(" ", s)
    s = _NUMERIC_LIGA_RE.sub("", s)
    s = s.translate(_DIA).lower()
    # Prefixes can stack ('Hrvatski rukometni klub ...'): strip until stable.
    while True:
        stripped = _PREFIX_RE.sub("", s)
        if stripped == s:
            break
        s = stripped
    s = _SUFFIX_2_RE.sub("", s)
    s = _TRAIL_LOC_RE.sub("", s)
    return _NON_ALNUM_RE.sub(" ", s).strip()
def is_team_2nd(name):
    """Return True if the name ends with a second-team marker (' II' or ' 2').

    The marker must be preceded by whitespace; matching is case-insensitive.
    """
    lowered = (name or "").strip().lower()
    marker = re.search(r"\s(?:ii|2)\s*$", lowered)
    return marker is not None
def is_pgz_klub_candidate(naziv):
    """Return False for rows that are clearly not a real club.

    Rejected: names containing 'savez', 'udruga', 'zbor' or 'trener', and
    junk rows that consist only of a club-type abbreviation, optionally
    followed by one of a few known placeholder bodies (e.g. 'RK RK', 'RK PŠR').
    """
    n = (naziv or "").lower()
    if any(word in n for word in ("savez", "udruga", "zbor", "trener")):
        return False
    # The junk pattern is written in ASCII, so fold diacritics first;
    # otherwise 'RK PŠR', 'ŽRK' or 'RK Omišalj' slip through.
    folded = n.translate(str.maketrans("čćžšđ", "cczsd"))
    junk = re.fullmatch(
        r"\s*(rk|zrk|mrk|hrk)\s*(rk|psr|psr selce|liburnija|mornar|omisalj)?\s*",
        folded,
    )
    return junk is None
def is_zenska_klub(naziv):
    """Return True if the club name marks a women's club.

    Recognised: a leading 'ženski'/'zenski', 'ž ', 'ž.', 'ŽRK '/'ZRK ', or
    ' žene'/' zene' anywhere in the name (case-insensitive).
    """
    n = (naziv or "").strip().lower()
    # 'žrk ' was missing: only the ASCII spelling 'zrk ' was recognised, so
    # the usual 'ŽRK <name>' form was classified as a men's club.
    prefixes = ("ženski", "zenski", "ž ", "ž.", "zrk ", "žrk ")
    return n.startswith(prefixes) or " žene" in n or " zene" in n
# ─── Harvester ─────────────────────────────────────────────────────────────
class HRSHarvester(SportHarvester):
    """Handball (rukomet) harvester for the HRS / sportinfocentar2 data feeds.

    Maps league team names onto PGŽ priority clubs, downloads per-match
    player rows, aggregates them per (player, club, competition, season) and
    upserts rosters, members and stats through the base-class helpers.

    NOTE(review): the block nesting in this class was reconstructed from a
    listing that had lost its indentation; the places where more than one
    nesting was plausible are marked below.
    """

    SPORT = "rukomet"
    SOURCE = "hrs"

    def __init__(self):
        super().__init__()
        # normalized club name -> (klub_id, naziv), split by men / women
        self.team_to_klub_m = {}
        self.team_to_klub_z = {}
        # "<team name> [M|Ž]" labels that matched no PGŽ club
        self.unmatched_teams = set()

    # Override base — base filters financiran||u_godisnjaku (only 3 rukomet rows).
    # Brief mandates ALL 71 PGŽ priority rukomet klubova.
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority-club rows for this sport, by id."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, naziv, sport
                FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def build_klub_maps(self):
        """Fill the men's / women's normalized-name → (id, naziv) maps."""
        rows = self.get_target_klubovi(999)
        for r in rows:
            naziv = r["naziv"]
            if not is_pgz_klub_candidate(naziv):
                continue
            norm = normalize_klub_name(naziv)
            if not norm:
                continue
            target = self.team_to_klub_z if is_zenska_klub(naziv) else self.team_to_klub_m
            cur = target.get(norm)
            # On a normalized-name collision keep the lowest klub id.
            if cur is None or r["id"] < cur[0]:
                target[norm] = (r["id"], naziv)
        self.log(f"klub maps: men={len(self.team_to_klub_m)} women={len(self.team_to_klub_z)}")

    def match_team(self, hrs_team_name, is_zenska_liga):
        """Direct → token-subset → fallback. Tokens come from normalize_klub_name.

        Args:
            hrs_team_name: Team name as published in the league data.
            is_zenska_liga: True to search the women's map, False the men's.

        Returns:
            ``(klub_id, naziv)`` of the matched club, or None.
        """
        if not hrs_team_name:
            return None
        candidates = [hrs_team_name]
        if is_team_2nd(hrs_team_name):
            # Detection is case-insensitive, so the strip must be as well;
            # _SUFFIX_2_RE is the same pattern compiled with re.I.
            candidates.append(_SUFFIX_2_RE.sub("", hrs_team_name).strip())
        m = self.team_to_klub_z if is_zenska_liga else self.team_to_klub_m
        for c in candidates:
            n = normalize_klub_name(c)
            if not n:
                continue
            if n in m:
                return m[n]
            n_tokens = set(n.split())
            if not n_tokens:
                continue
            best = None
            for k_norm, (kid, kname) in m.items():
                k_tokens = set(k_norm.split())
                if not k_tokens:
                    continue
                # token-subset match in either direction
                if not (n_tokens <= k_tokens or k_tokens <= n_tokens):
                    continue
                shared = n_tokens & k_tokens
                # Require at least one shared token of length ≥ 4 to avoid noise like {"rk"}
                if not any(len(t) >= 4 for t in shared):
                    continue
                # Prefer lowest klub_id (canonical row, not godišnjak duplicate)
                if best is None or kid < best[0]:
                    best = (kid, kname)
            if best:
                return best
        return None

    # ─── HRS endpoints ─────────────────────────────────────────────────────
    def fetch_natjecanje(self, lid):
        """Download and parse the competition object ``lid``; None on any error."""
        url = f"https://www.sportinfocentar2.com/coman/natjecanje{lid}.js"
        try:
            body = http_text(url, timeout=20)
            return parse_var_json(body, "natjecanjeobjekt")
        except Exception as e:
            self.log(f" ⚠ fetch_natjecanje({lid}): {e}")
            return None

    def fetch_match_stats(self, mid):
        """Download the per-player rows of match ``mid``.

        Returns None when the response contains "not authorized", starts with
        "//", or when fetching/parsing raises.
        """
        url = f"https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={mid}"
        try:
            body = http_text(url, timeout=15)
            stripped = body.strip()
            if "not authorized" in stripped.lower() or stripped.startswith("//"):
                return None
            return parse_var_json(body, "tab128")
        except Exception as e:
            self.log(f" ⚠ fetch_match({mid}): {e}")
            return None

    # ─── Aggregation & upserts ─────────────────────────────────────────────
    @staticmethod
    def _aggregate_player_stats(rows):
        """Sum one player's match rows into season totals.

        Every row counts as one appearance; missing or falsy numeric fields
        count as 0.
        """
        out = defaultdict(int)
        for r in rows:
            out["nastupi"] += 1
            out["golovi"] += int(r.get("sutd") or 0)
            out["asistencije"] += int(r.get("asistencija") or 0)
            out["zuti"] += int(r.get("zutih") or 0)
            out["crveni"] += int(r.get("crvenih") or 0)
        return dict(out)

    def upsert_klub_roster(self, klub_id, hrs_team_id, ekipa, sezona, raw):
        """Insert or refresh one klub_roster row; errors are logged, not raised."""
        # NOTE(review): if self.conn is not in autocommit mode, a failed
        # statement may leave the transaction aborted — confirm in the base class.
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO pgz_sport.klub_roster
                    (klub_id, sport, source, source_id, source_url, ekipa, sezona, raw_data)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                    ON CONFLICT (klub_id, source, source_id, ekipa, sezona) DO UPDATE
                    SET raw_data = EXCLUDED.raw_data, scraped_at = now()
                """, (klub_id, self.SPORT, self.SOURCE, str(hrs_team_id),
                      f"https://hrs.hr/natjecanje/?ekipa={hrs_team_id}",
                      ekipa, sezona, json.dumps(raw)))
        except Exception as e:
            self.log(f" ⚠ upsert_klub_roster: {e}")

    def _notify_telegram(self, text):
        """Best-effort Telegram notification via curl.

        Credentials are read from the environment (PGZ_TELEGRAM_BOT_TOKEN,
        PGZ_TELEGRAM_CHAT_ID) instead of being committed to the source tree.
        When either is missing the notification is skipped with a log line.
        """
        import os
        import subprocess

        token = os.environ.get("PGZ_TELEGRAM_BOT_TOKEN")
        chat_id = os.environ.get("PGZ_TELEGRAM_CHAT_ID")
        if not token or not chat_id:
            self.log(" telegram notify skipped: PGZ_TELEGRAM_BOT_TOKEN / "
                     "PGZ_TELEGRAM_CHAT_ID not set")
            return
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat_id}",
                            "--data-urlencode", f"text={text}"],
                           timeout=8, capture_output=True)
        except (OSError, subprocess.SubprocessError) as e:
            # A failed notification must never fail the harvest.
            self.log(f" telegram notify failed: {e}")

    # ─── Main run ──────────────────────────────────────────────────────────
    def run(self, limit=999):
        """Harvest the first ``limit`` competitions of HRS_NATJECANJA.

        Args:
            limit: Number of competitions to process; a falsy value means all.
        """
        self.build_klub_maps()
        nat_ids = HRS_NATJECANJA[: int(limit)] if limit else HRS_NATJECANJA
        self.log(f"🤾 Starting HRS harvest. Natjecanja: {len(nat_ids)}")
        # (igrac, klub_id, natjecanje, sezona) -> list of match rows
        agg = defaultdict(list)
        # same key -> descriptive data captured from the first row seen
        clan_meta = {}
        for lid in nat_ids:
            nat = self.fetch_natjecanje(lid)
            if not nat:
                continue
            naziv = nat.get("naziv") or f"natjecanje {lid}"
            spol_int = nat.get("spol", 0)
            is_zenska = ("- ž" in naziv.lower()) or ("žene" in naziv.lower()) or (spol_int == 1)
            kategorija = derive_kategorija(naziv)
            self.log(f"━━ Liga {lid}: {naziv} ({'Ž' if is_zenska else 'M'}, {kategorija})")

            # Collect every match plus a team-id -> team-name index.
            team_idx = {}
            matches = []
            for liga in (nat.get("lige") or []):
                for u in (liga.get("utakmice") or []):
                    mid = u.get("broj")
                    k1, k2 = u.get("k1"), u.get("k2")
                    e1, e2 = u.get("e1") or "", u.get("e2") or ""
                    d = u.get("d") or u.get("pc")
                    if not mid or not k1 or not k2:
                        continue
                    if k1 and e1:
                        team_idx[k1] = e1
                    if k2 and e2:
                        team_idx[k2] = e2
                    matches.append((mid, k1, e1, k2, e2, d))

            # Keep only the teams that resolve to a PGŽ club.
            pgz_team_ids = {}
            for tid, tname in team_idx.items():
                m = self.match_team(tname, is_zenska)
                if m:
                    pgz_team_ids[tid] = m
                else:
                    self.unmatched_teams.add(f"{tname} [{ 'Ž' if is_zenska else 'M' }]")
            if not pgz_team_ids:
                self.log(" · no PGŽ teams in this league")
                continue
            self.log(" ✓ PGŽ teams: " + ", ".join(
                f"{tid}:{team_idx[tid]} → klub#{kid}"
                for tid, (kid, _) in pgz_team_ids.items()))

            roster_seen = {}
            for (mid, k1, e1, k2, e2, mdate) in matches:
                if k1 not in pgz_team_ids and k2 not in pgz_team_ids:
                    continue
                sezona = derive_sezona(mdate) or "2025/2026"
                rows = self.fetch_match_stats(mid)
                if not rows:
                    continue
                for r in rows:
                    # rbekipa says which side of the match the row belongs to.
                    rb = r.get("rbekipa")
                    if rb == 1:
                        hrs_team_id, ekipa_name = k1, e1
                    elif rb == 2:
                        hrs_team_id, ekipa_name = k2, e2
                    else:
                        continue
                    if hrs_team_id not in pgz_team_ids:
                        continue
                    klub_id, klub_naziv = pgz_team_ids[hrs_team_id]
                    igrac = r.get("igrac")
                    if not igrac:
                        continue
                    ime = (r.get("ime") or "").strip()
                    prezime = (r.get("prezime") or "").strip()
                    rkey = (klub_id, hrs_team_id, sezona)
                    if rkey not in roster_seen:
                        roster_seen[rkey] = (ekipa_name,
                                             {"hrs_team_id": hrs_team_id, "natjecanje": naziv})
                    pkey = (igrac, klub_id, naziv, sezona)
                    agg[pkey].append(r)
                    if pkey not in clan_meta:
                        clan_meta[pkey] = {
                            "ime": ime, "prezime": prezime,
                            "hrs_team_id": hrs_team_id, "ekipa": ekipa_name,
                            "kategorija": kategorija,
                            "spol": "Ž" if is_zenska else "M",
                            "natjecanje": naziv, "lid": lid,
                        }
                    # NOTE(review): counted once per accepted player row;
                    # confirm it was not meant to be once per match.
                    self.stats["stats"] += 1
                # NOTE(review): pause once per fetched match — confirm nesting.
                time.sleep(0.05)

            for (klub_id, hrs_team_id, sezona), (ekipa_name, raw) in roster_seen.items():
                self.upsert_klub_roster(klub_id, hrs_team_id, ekipa_name, sezona, raw)

        self.log(f"━━ Aggregated keys: {len(agg)}, unique players: {len({k[0] for k in agg})}")
        upserted = 0
        for (igrac, klub_id, naziv, sezona), match_rows in agg.items():
            meta = clan_meta[(igrac, klub_id, naziv, sezona)]
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub_id,
                    source_id=igrac,
                    ime=meta["ime"], prezime=meta["prezime"],
                    source_url=f"https://hrs.hr/natjecanje/?igrac={igrac}",
                    kategorija=meta["kategorija"],
                    sezona=sezona,
                    extra={"hrs_team_id": meta["hrs_team_id"],
                           "ekipa": meta["ekipa"], "spol": meta["spol"]},
                )
                self.stats["players"] += 1
                stats_dict = self._aggregate_player_stats(match_rows)
                self.upsert_stats(
                    clan_id=clan_id, sezona=sezona,
                    klub_id=klub_id, klub_naziv=meta["ekipa"],
                    natjecanje=naziv, kategorija=meta["kategorija"],
                    stats_dict=stats_dict,
                    raw={"matches": len(match_rows), "lid": meta["lid"]},
                )
                upserted += 1
            except Exception as e:
                self.stats["errors"] += 1
                self.log(f" ❌ upsert clan {igrac}: {e}")

        self.log(f"✅ Done. {upserted} player_stats rows. "
                 f"Stats: {self.stats}. Unmatched HRS teams: {len(self.unmatched_teams)}")
        for t in sorted(self.unmatched_teams)[:30]:
            self.log(f" unmatched: {t}")
        self._notify_telegram(
            f"🤾 HRS rukomet harvest done. Players: {self.stats['players']}, "
            f"stats rows: {upserted}, unmatched HRS teams: {len(self.unmatched_teams)}")
if __name__ == "__main__":
    # Optional first CLI argument: how many competitions to harvest.
    cli_args = sys.argv[1:]
    HRSHarvester().run(limit=int(cli_args[0]) if cli_args else 999)
+262 -42
View File
@@ -1,54 +1,274 @@
#!/usr/bin/env python3
"""HVS waterpolo harvester."""
import sys, re
# Name: hvs_waterpolo.py
# Version: 2.0
# Author: Damir Radulić <dradulic@outlook.com> / damir@rinet.one
# Date: 2026-05-05
# Description: HVS (hvs.hr) waterpolo harvester.
# Brutalno iskreno: HVS web NE objavljuje rosters/stats po klubu
# kroz indeksabilan kanal — /klub-{slug}/ vraća 404, /klubovi/{id}/
# vraća "Pojavila se kritična greška", /igrac-{slug}/ vraća 404, a
# /kategorija/{id}/ prikazuje samo sezonsku navigaciju. Jedino što
# je upotrebljivo je wp-json REST API:
# /wp/v2/klubovi (20 klubova + ACF.history)
# /wp/v2/clanovi (37 federation officials s biografijama u kojima
# se najčešće spominje klupska karijera)
# Ovaj harvester:
# 1. Čita wp-json klubovi → mapira PGŽ klubove (source_url + meta)
# 2. Čita wp-json clanovi → upsertira sve, plus dodatno povezuje
# one čija biografija sadrži ime PGŽ kluba (heuristika).
# 3. Pokušava Playwright fallback na /klub-{slug}/ samo ako stranica
# stvarno ima ".profile-header__name" u DOM-u (gracefully skipa
# kad HVS vrati 404/error).
import sys, re, json, time, urllib.request
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
from psycopg2.extras import RealDictCursor
# wp-json collection of HVS clubs (read by HVSHarvester._fetch_hvs_klubovi).
WP_KLUB_API = "https://hvs.hr/wp-json/wp/v2/klubovi"
# wp-json collection of HVS members (read by HVSHarvester._fetch_hvs_clanovi).
WP_CLANOV_API = "https://hvs.hr/wp-json/wp/v2/clanovi"
# Site root; not referenced by the code visible in this part of the file.
HVS_BASE = "https://hvs.hr"
# Distinctive ASCII tokens of club names.  Used in two places: _match_klub
# rejects an HVS club whose title carries a keyword the PGŽ name lacks, and
# scrape_klub links a member only when one of the club's keyword tokens occurs
# in the member's bio or name.
# NOTE(review): presumably self.slugify folds diacritics, which is why the
# entries are plain ASCII — confirm against the base class.
KEYWORDS = [
"primorje", "opatija", "jadran", "losinj", "palada",
"silo", "crikvenica", "orka", "bura", "posk", "victoria",
"kostrena", "njivice", "rijeka",
]
def _fetch_paginated(url, log):
"""Fetch all pages of a wp-json collection."""
out = []
for page in range(1, 20):
u = f"{url}?per_page=100&page={page}"
try:
req = urllib.request.Request(u, headers={"User-Agent": "Mozilla/5.0 PGZ-Sport"})
with urllib.request.urlopen(req, timeout=15) as r:
data = json.loads(r.read().decode("utf-8"))
except Exception as e:
log(f" wp-json {u} err: {e}")
break
if not data:
break
out.extend(data)
if len(data) < 100:
break
return out
class HVSHarvester(SportHarvester):
    """Waterpolo (vaterpolo) harvester built on the hvs.hr wp-json API.

    Links PGŽ clubs to their HVS wp-json entries and upserts HVS members
    whose bio or name mentions a distinctive token of a PGŽ club.  No browser
    page is used: run() calls scrape_klub() with ``page=None``.

    NOTE(review): the block nesting in this class was reconstructed from a
    listing that had lost its indentation.
    """

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # ------------- target list (override base, return all 28) ----------------
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs, financed/yearbook ones first."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    # ------------- normalize / match helpers ---------------------------------
    def _core_slug(self, name):
        """Reduce a club name to a slug without generic club/sponsor words."""
        if not name:
            return ""
        s = name.lower()
        s = re.sub(r"\[merged[^\]]*\]", " ", s)
        s = re.sub(r"\([^)]*\)", " ", s)
        s = re.sub(r"\b(vaterpolo|vaterpolski|amaterski|žvk|zvk|vk|hšk|hsk)\b", " ", s)
        s = re.sub(r"\bklub\b", " ", s)
        s = re.sub(r"\bsavez\b.*$", " ", s)
        s = re.sub(r"-(muška|ženska)\s*ekipa", " ", s)
        s = re.sub(r"-erste\s*banka?a?", " ", s)
        s = self.slugify(s)
        return s.strip("-")

    def _tokens(self, name):
        """Return the set of core-slug parts longer than two characters."""
        s = self._core_slug(name)
        return set(t for t in s.split("-") if len(t) > 2)

    def _match_klub(self, pgz_naziv, hvs_list):
        """Find the HVS club entry that corresponds to a PGŽ club name.

        An exact core-slug match wins.  Otherwise the entry sharing the most
        tokens (at least two) is chosen, skipping entries whose title carries
        a KEYWORDS token that the PGŽ name does not have.

        Returns:
            The matching dict from ``hvs_list``, or None.
        """
        target_core = self._core_slug(pgz_naziv)
        target_tokens = self._tokens(pgz_naziv)
        if not target_tokens:
            return None
        for h in hvs_list:
            if self._core_slug(h["title"]) == target_core:
                return h
        keywords = set(KEYWORDS)
        best, best_score = None, 0
        for h in hvs_list:
            ht = self._tokens(h["title"])
            shared = target_tokens & ht
            if len(shared) < 2:
                continue
            # An extra distinctive keyword on the HVS side means another club.
            extra_candidate = (ht - target_tokens) & keywords
            if extra_candidate:
                continue
            if len(shared) > best_score:
                best_score = len(shared)
                best = h
        return best

    # ------------- wp-json fetch + simplify ----------------------------------
    @staticmethod
    def _clean_title(item):
        """Return the rendered title with the HTML entities seen in titles resolved."""
        title = (item.get("title") or {}).get("rendered", "").strip()
        return (title.replace("&#8211;", "").replace("&#8217;", "'")
                .replace("&amp;", "&"))

    def _fetch_hvs_klubovi(self):
        """Download all HVS clubs and keep only the fields used here."""
        out = []
        raw = _fetch_paginated(WP_KLUB_API, self.log)
        for k in raw:
            acf = k.get("ACF") or {}
            out.append({
                "wp_id": k.get("id"),
                "club_id": acf.get("club_id"),
                "title": self._clean_title(k),
                "link": k.get("link"),
                "slug": k.get("slug"),
                "history": acf.get("history") or "",
            })
        return out

    def _fetch_hvs_clanovi(self):
        """Download all HVS members and keep only the fields used here."""
        out = []
        raw = _fetch_paginated(WP_CLANOV_API, self.log)
        for c in raw:
            acf = c.get("ACF") or {}
            out.append({
                "wp_id": c.get("id"),
                "name": acf.get("name") or self._clean_title(c),
                "image": acf.get("image") or "",
                "birth_date": acf.get("birth_date") or "",
                "birth_place": acf.get("birth_place") or "",
                "position": acf.get("position") or "",
                "bio": acf.get("bio") or "",
                "slug": c.get("slug"),
                "link": c.get("link"),
            })
        return out

    # ------------- DB persistence helpers ------------------------------------
    def _persist_klub_link(self, klub_id, hvs):
        """Store the HVS link and ids on the klubovi row ``klub_id``."""
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s,
                    metadata = COALESCE(metadata, '{}'::jsonb) ||
                               jsonb_build_object('hvs_wp_id', %s::int,
                                                  'hvs_club_id', %s,
                                                  'hvs_title', %s),
                    updated_at = now()
                WHERE id = %s
            """, (hvs["link"], hvs["wp_id"], hvs.get("club_id"), hvs["title"], klub_id))

    def _split_name(self, full):
        """Split a full name at the first space into (ime, prezime)."""
        full = re.sub(r"\s+", " ", (full or "")).strip()
        if not full:
            return "", ""
        parts = full.split(" ", 1)
        return parts[0], (parts[1] if len(parts) > 1 else "")

    def _insert_official(self, clan_data, klub_id):
        """Upsert one HVS member under ``klub_id``.

        Returns:
            Whatever upsert_clan returns, or None when the member has no name.
        """
        ime, prezime = self._split_name(clan_data["name"])
        if not ime:
            return None
        extra = {
            "image": clan_data.get("image", ""),
            "birth_date": clan_data.get("birth_date", ""),
            "birth_place": clan_data.get("birth_place", ""),
            "position": clan_data.get("position", ""),
            "bio": (clan_data.get("bio") or "")[:8000],
            "hvs_role": "federation_official_or_staff",
        }
        kategorija = clan_data.get("position") or "stručna funkcija"
        return self.upsert_clan(
            klub_id=klub_id,
            source_id=str(clan_data["wp_id"]),
            ime=ime, prezime=prezime,
            source_url=clan_data["link"],
            kategorija=kategorija,
            sezona=None,
            extra=extra,
        )

    # ------------- klub-level orchestration ----------------------------------
    def scrape_klub(self, page, klub):
        """Process one PGŽ club using wp-json data only.

        Args:
            page: Kept for signature compatibility; not used here (run()
                passes None).
            klub: Row with at least ``id`` and ``naziv``.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        # Both collections are fetched once and cached on the instance.
        if not hasattr(self, "_hvs_klubovi"):
            self._hvs_klubovi = self._fetch_hvs_klubovi()
            self.log(f" 📡 wp-json klubovi loaded: {len(self._hvs_klubovi)}")
        if not hasattr(self, "_hvs_clanovi"):
            self._hvs_clanovi = self._fetch_hvs_clanovi()
            self.log(f" 📡 wp-json clanovi loaded: {len(self._hvs_clanovi)}")

        match = self._match_klub(klub['naziv'], self._hvs_klubovi)
        if match:
            self.log(f" ✅ wp-json match → {match['title']} ({match['link']})")
            self._persist_klub_link(klub['id'], match)
        else:
            self.log(" 🟡 no wp-json klub match (HVS exposes only 20 klubova)")

        # Insert federation officials whose bio mentions any distinctive token
        # of this PGŽ klub. This is the only way HVS surfaces person-level data.
        klub_tokens = [t for t in self._tokens(klub['naziv']) if t in KEYWORDS]
        if not klub_tokens:
            self.log(f" 🟡 no distinctive tokens for {klub['naziv']}, skip clanovi link")
            return
        linked = 0
        for c in self._hvs_clanovi:
            blob = ((c.get("bio") or "") + " " + (c.get("name") or "")).lower()
            blob = self.slugify(blob).replace("-", " ")
            # Whole-word comparison: split once, then test every club token.
            words = set(blob.split())
            if not words.isdisjoint(klub_tokens):
                try:
                    cid = self._insert_official(c, klub['id'])
                    if cid:
                        self.stats['players'] += 1
                        linked += 1
                except Exception as e:
                    self.log(f" official upsert err: {e}")
        if linked:
            self.log(f" 🧑 {linked} clanovi linked via bio match")

        # Heartbeat
        try:
            import subprocess
            subprocess.run(["redis-cli", "SET", "cc:pgz-sport:heartbeat",
                            str(int(time.time()))], timeout=3, capture_output=True)
        except Exception:
            # Best-effort only: a missing redis-cli must not fail the club.
            pass

    def _notify_telegram(self, text):
        """Best-effort Telegram notification via curl.

        Credentials are read from the environment (PGZ_TELEGRAM_BOT_TOKEN,
        PGZ_TELEGRAM_CHAT_ID) instead of being committed to the source tree.
        When either is missing the notification is skipped with a log line.
        """
        import os
        import subprocess

        token = os.environ.get("PGZ_TELEGRAM_BOT_TOKEN")
        chat_id = os.environ.get("PGZ_TELEGRAM_CHAT_ID")
        if not token or not chat_id:
            self.log(" telegram notify skipped: PGZ_TELEGRAM_BOT_TOKEN / "
                     "PGZ_TELEGRAM_CHAT_ID not set")
            return
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat_id}",
                            "--data-urlencode", f"text={text}"],
                           timeout=8, capture_output=True)
        except (OSError, subprocess.SubprocessError) as e:
            # A failed notification must never fail the harvest.
            self.log(f" telegram notify failed: {e}")

    # We override run() to skip Playwright entirely (HVS site is broken for it).
    def run(self, limit=999):
        """Harvest up to ``limit`` clubs; per-club errors are counted and logged."""
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova "
                 f"(wp-json only — site SPA is broken for /klub/, /igrac/, /kategorija/)")
        for klub in klubovi:
            try:
                self.scrape_klub(None, klub)  # no Playwright page
                self.stats['klubova'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"VATERPOLO harvest done: {self.stats}")
if __name__ == '__main__':
    # Optional first CLI argument: maximum number of clubs to process.
    # Only one run is started; a leftover second call (default limit 50)
    # would have executed the whole harvest twice.
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 28
    HVSHarvester().run(limit=limit)