Files
pgz-sport/scripts/sport_harvesters/hrs_handball.py
T
damir 1d02c0897d Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
2026-05-05 13:08:11 +02:00

490 lines
19 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
hrs_handball.py — HRS Rukomet harvester v1.0
Authors: dradulic@outlook.com / damir@rinet.one
Date: 2026-05-05
Description:
Scrapes Hrvatski rukometni savez (HRS) competition data via the
sportinfocentar2.com JSON endpoints (no HTML rendering needed):
- https://www.sportinfocentar2.com/coman/natjecanje{LID}.js
→ league JSON: lige[].utakmice[] {broj, e1, e2, k1, k2, d, ...}
- https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={MID}
→ per-match player roster + box-score stats
Fuzzy-matches HRS team names to PGŽ priority handball clubs (~71) in
pgz_sport.klubovi, then aggregates each player's per-(klub, natjecanje, sezona)
totals into pgz_sport.player_stats; upserts pgz_sport.clanovi + clan_kategorije.
Run:
python3 /opt/pgz-sport/scripts/sport_harvesters/hrs_handball.py [LIMIT_NATJECANJA]
"""
import os, sys, re, json, time, unicodedata
import urllib.request
import urllib.error
from datetime import datetime, date
from collections import defaultdict
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester # noqa: E402
import psycopg2
from psycopg2.extras import RealDictCursor
# ─── HRS league IDs (HRS top menu, 2025/26) ────────────────────────────────
# The groups below are concatenated in exactly this order; the order matters
# because the harvester's optional limit takes a prefix of HRS_NATJECANJA.

# Senior men
_SENIOR_MEN_IDS = (
    1632,  # Paket24 Premijer liga (M)
    1633,  # 1. HRL Sjever - M
    1634,  # 1. HRL Jug - M
    1639,  # 2. HRL Istok - M
    1641,  # 2. HRL Zapad - M ★ PGŽ
    1642,  # 2. HRL Sjever - M
    1643,  # 2. HRL Jug - M
    1675,  # 3. HRL Istok - M
    1676,  # 3. HRL Sjever - M
    1677,  # 3. HRL Središte - M
    1678,  # 3. HRL Zapad - M ★ PGŽ
    1384,  # Međužupanijska liga (inter-county league)
)

# Senior women
_SENIOR_WOMEN_IDS = (
    1629,  # 1. HRL Žene
    1637,  # 2. HRL Sjever - Ž
    1638,  # 2. HRL Zapad - Ž ★ PGŽ
    1644,  # 2. HRL Jug - Ž
    1671,  # 3. HRL Sjever - Ž
    1672,  # 3. HRL Zapad - Ž ★ PGŽ
    1673,  # 3. HRL Središte - Ž
    1674,  # 3. HRL Istok - Ž
)

# Youth men
_YOUTH_MEN_IDS = (
    1389,  # 1. HRL U18 - M
    1705,  # 1. HRL U17 - M
    1763,  # 2. HRL U17 - M
    1706,  # 1. HRL U15 - M
    1716,  # 2. HRL U15 - M
    1707,  # 1. HRL U13 - M
    1717,  # 2. HRL U13 - M
    1746,  # 1. HRL U12 - M
    1709,  # 1. HRL U11 - M
)

# Extra (links from the sidebar — kept in case they return a natjecanjeobjekt)
_EXTRA_IDS = (
    1620, 1622, 1625, 1626, 1645, 1646,
    1761, 1762, 1773, 1753,
    1774, 1776, 1777, 1783, 1784, 1785, 1786, 1787, 1788,
    1796, 1797, 1818, 1834,
    1765, 1766,
)

# Cups
_CUP_IDS = (1092, 1302, 1303, 1441)

HRS_NATJECANJA = [
    *_SENIOR_MEN_IDS,
    *_SENIOR_WOMEN_IDS,
    *_YOUTH_MEN_IDS,
    *_EXTRA_IDS,
    *_CUP_IDS,
]
UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")


def http_text(url, timeout=20, retries=2):
    """Plain HTTP GET → text, with a small retry on transient errors.

    sportinfocentar2 files are mostly UTF-8 but occasionally contain stray
    cp1250 bytes (e.g. typographic quotes from Word). Decoding with
    ``errors="replace"`` yields exactly the strict result for valid UTF-8 and
    turns only the stray bytes into U+FFFD, which keeps the bulk of the file
    Unicode-correct (rather than re-decoding everything as latin-1).

    Args:
        url: URL to fetch.
        timeout: Per-attempt timeout in seconds, passed to ``urlopen``.
        retries: Number of additional attempts after the first one.

    Returns:
        The response body decoded as text.

    Raises:
        RuntimeError: when every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    # Local import: only the exception base class is needed, and only here.
    from http.client import HTTPException

    last = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
        except (OSError, HTTPException) as e:
            # OSError covers URLError/HTTPError, TimeoutError, socket.timeout
            # and connection resets; HTTPException covers truncated bodies
            # (IncompleteRead) raised while reading the response.
            last = e
            if attempt < retries:
                # Linear back-off: 1.5 s, 3.0 s, ...
                time.sleep(1.5 * (attempt + 1))
            continue
        return raw.decode("utf-8", errors="replace")
    raise RuntimeError(f"GET {url} failed: {last}") from last
_UNQUOTED_KEY_RE = re.compile(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
_TRAIL_COMMA_RE = re.compile(r',(\s*[}\]])')
_LEADING_ZERO_RE = re.compile(r'([\s:,\[])0+(\d)')
# One complete double-quoted string literal, honouring backslash escapes.
_QUOTED_STRING_RE = re.compile(r'"(?:\\.|[^"\\])*"')


def _normalize_lazy_json(fragment):
    """Normalize one stretch of lazy JSON that contains no string literal."""
    # Strip JS leading-zero numbers (e.g. `mu: 018,`) that JSON rejects.
    fragment = _LEADING_ZERO_RE.sub(r'\1\2', fragment)
    # Drop trailing commas before a closing brace/bracket.
    fragment = _TRAIL_COMMA_RE.sub(r'\1', fragment)
    # Quote bare property names.
    return _UNQUOTED_KEY_RE.sub(r'\1"\2":', fragment)


def parse_var_json(body, var_prefix):
    """Strip the 'var <name> = ' wrapper and parse sportinfocentar2's lazy JSON.

    The dialect uses unquoted keys, leading zeros in numbers and trailing
    commas. The normalizing substitutions are applied only to the text
    *between* double-quoted string literals, so string values such as
    "2025-09-05 18:00" or "RK Zamet, ekipa: B" are passed through untouched
    (a whole-body substitution would rewrite "18:00" to "18:0").

    Args:
        body: Raw response text.
        var_prefix: Variable name of the wrapper (matched case-insensitively).

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: if the text is still not valid JSON after
            normalization.
    """
    body = body.strip()
    # Both forms occur: `var foo = ...` (coman/) and bare `foo = ...` (ziceri/).
    m = re.match(rf"^\s*(?:var\s+)?{re.escape(var_prefix)}\s*=\s*", body, re.I)
    if m:
        body = body[m.end():]
    body = body.rstrip().rstrip(";").rstrip()

    pieces = []
    cursor = 0
    for literal in _QUOTED_STRING_RE.finditer(body):
        pieces.append(_normalize_lazy_json(body[cursor:literal.start()]))
        pieces.append(literal.group(0))  # string literal: copy verbatim
        cursor = literal.end()
    pieces.append(_normalize_lazy_json(body[cursor:]))
    return json.loads("".join(pieces))
def derive_sezona(d):
    """Croatian sport season from a calendar date.

    Jul–Dec → "YYYY/YYYY+1"; Jan–Jun → "YYYY-1/YYYY".

    Args:
        d: A date/datetime-like object (anything exposing ``year`` and
            ``month``), or a string whose first 10 characters are
            ``YYYY-MM-DD``.

    Returns:
        The season label, or None when ``d`` is empty, unparseable, or of an
        unsupported type (e.g. a bare number coming out of the lazy JSON).
    """
    if not d:
        return None
    if isinstance(d, str):
        try:
            d = datetime.strptime(d[:10], "%Y-%m-%d").date()
        except ValueError:
            return None
    year = getattr(d, "year", None)
    month = getattr(d, "month", None)
    if year is None or month is None:
        # Not date-like: report "unknown" instead of raising AttributeError.
        return None
    if month >= 7:
        return f"{year}/{year + 1}"
    return f"{year - 1}/{year}"
# (age marker, kategorija) pairs, checked in this order; first hit wins.
_KATEGORIJA_BY_AGE = (
    ("11", "mini U11"),
    ("12", "mini U12"),
    ("13", "dječaci U13"),
    ("15", "mlađi kadeti U15"),
    ("17", "kadeti U17"),
    ("18", "juniori U18"),
)


def derive_kategorija(naziv):
    """Return the handball age-group kategorija implied by a natjecanje name.

    The name is searched case-insensitively for "uNN" or "u-NN"; when no
    known age marker is present (or the name is empty) the result is
    "seniori".
    """
    lowered = (naziv or "").lower()
    for age, label in _KATEGORIJA_BY_AGE:
        if f"u{age}" in lowered or f"u-{age}" in lowered:
            return label
    return "seniori"
# ─── Klub-name normalization for fuzzy match ──────────────────────────────
_DIA = str.maketrans("čćžšđČĆŽŠĐ", "cczsdcczsd")
# Leading words/abbreviations that carry no identity. The abbreviations are
# followed by \b so they only match as whole words: the prefix is stripped
# repeatedly, and without the boundary "rk mrkopalj" → "mrkopalj" → "opalj".
_PREFIX_RE = re.compile(
    r"^(?:"
    r"hrvatski\s+|muski\s+|zenski\s+|"
    r"rukometni\s+(?:klub|savez)\s+|"
    r"(?:hrk|mrk|zrk|rk)\b"
    r")\s*",
    re.I,
)
_TRAIL_LOC_RE = re.compile(r"\s*-\s*[a-z][a-z\s]*$", re.I)
_SUFFIX_2_RE = re.compile(r"\s+(?:ii|2)\s*$", re.I)
_NUMERIC_LIGA_RE = re.compile(r"\d+\.\s+u\s+.*$", re.I)
_PAREN_RE = re.compile(r"\([^)]*\)")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_klub_name(name):
    """Aggressively normalize a Croatian handball club name to a comparable token.

    Steps, in order: drop parenthesised text, drop a trailing "<n>. u ..."
    fragment, transliterate čćžšđ to ASCII, lower-case, strip leading
    prefixes (repeatedly), strip a trailing "II"/"2" second-team marker,
    strip a trailing "- <location>" part, and finally collapse every run of
    non-alphanumerics to a single space.

    Returns:
        The normalized name, or "" for empty input.
    """
    if not name:
        return ""
    s = str(name).strip()
    s = _PAREN_RE.sub(" ", s)
    s = _NUMERIC_LIGA_RE.sub("", s)
    s = s.translate(_DIA)
    s = s.lower()
    # Prefixes can stack ("hrvatski rukometni klub ..."), so strip until stable.
    while True:
        new = _PREFIX_RE.sub("", s)
        if new == s:
            break
        s = new
    s = _SUFFIX_2_RE.sub("", s)
    s = _TRAIL_LOC_RE.sub("", s)
    s = _NON_ALNUM_RE.sub(" ", s).strip()
    return s
def is_team_2nd(name):
    """Return True when the team name ends in a second-team marker.

    The marker is "II" or "2" (case-insensitive) preceded by whitespace, so a
    name consisting solely of the marker does not count.
    """
    cleaned = (name or "").strip().lower()
    return re.search(r"\s(?:ii|2)\s*$", cleaned) is not None
# Lower-case Croatian diacritics → ASCII, so the ASCII-only junk pattern below
# also recognises names written with diacritics.
_CANDIDATE_DIA = str.maketrans("čćžšđ", "cczsd")


def is_pgz_klub_candidate(naziv):
    """Filter out savezi / udruge / zborovi / clearly non-club rows.

    Returns:
        False when the name contains one of the organisation keywords, or
        when it is nothing but a club abbreviation optionally followed by one
        of the known junk bodies; True otherwise (including for empty input).
    """
    n = (naziv or "").lower()
    bad = ("savez", "udruga", "zbor", "trener")
    if any(b in n for b in bad):
        return False
    # Junk like 'RK RK' or 'RK PŠR' (no real name body). The pattern is ASCII,
    # so transliterate first — otherwise 'rk pšr' never matches 'psr'.
    ascii_n = n.translate(_CANDIDATE_DIA)
    if re.fullmatch(r"\s*(rk|zrk|mrk|hrk)\s*(rk|psr|psr selce|liburnija|mornar|omisalj)?\s*", ascii_n):
        return False
    return True
def is_zenska_klub(naziv):
    """Return True when a club name marks a women's club.

    Recognised markers (case-insensitive): a leading "ženski"/"zenski",
    "ž ", "ž.", "žrk " or "zrk ", or " žene"/" zene" anywhere in the name.
    "žrk " is listed explicitly because "ŽRK" lower-cases to "žrk", which the
    ASCII "zrk " prefix alone does not match.
    """
    n = (naziv or "").strip().lower()
    if n.startswith(("ženski", "zenski", "ž ", "ž.", "žrk ", "zrk ")):
        return True
    return " žene" in n or " zene" in n
# ─── Harvester ─────────────────────────────────────────────────────────────
class HRSHarvester(SportHarvester):
    """Harvest HRS handball rosters and per-match box scores for PGŽ clubs.

    This class uses ``self.conn``, ``self.log``, ``self.stats``,
    ``self.upsert_clan`` and ``self.upsert_stats`` without defining them —
    presumably they are provided by ``SportHarvester`` (verify against
    ``__base``).
    """

    SPORT = "rukomet"
    SOURCE = "hrs"

    def __init__(self):
        super().__init__()
        # normalized klub name → (klub_id, naziv), split by men's / women's.
        self.team_to_klub_m = {}
        self.team_to_klub_z = {}
        # "<HRS team name> [M|Ž]" labels that could not be matched.
        self.unmatched_teams = set()

    # Override base — base filters financiran||u_godisnjaku (only 3 rukomet rows).
    # Brief mandates ALL 71 PGŽ priority rukomet klubova.
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` rows (id, naziv, sport) for this sport from
        pgz_sport.v_pgz_priority_klubovi, ordered by id."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, naziv, sport
                FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def build_klub_maps(self):
        """Fill the men's / women's normalized-name → (id, naziv) maps.

        When several rows normalize to the same name, the lowest id wins.
        """
        rows = self.get_target_klubovi(999)
        for r in rows:
            naziv = r["naziv"]
            if not is_pgz_klub_candidate(naziv):
                continue
            norm = normalize_klub_name(naziv)
            if not norm:
                continue
            target = self.team_to_klub_z if is_zenska_klub(naziv) else self.team_to_klub_m
            cur = target.get(norm)
            if cur is None or r["id"] < cur[0]:
                target[norm] = (r["id"], naziv)
        self.log(f"klub maps: men={len(self.team_to_klub_m)} women={len(self.team_to_klub_z)}")

    def match_team(self, hrs_team_name, is_zenska_liga):
        """Direct → token-subset → fallback. Tokens come from normalize_klub_name.

        Returns:
            ``(klub_id, naziv)`` of the matched klub, or None.
        """
        if not hrs_team_name:
            return None
        candidates = [hrs_team_name]
        if is_team_2nd(hrs_team_name):
            candidates.append(re.sub(r"\s+(?:ii|2)\s*$", "", hrs_team_name).strip())
        m = self.team_to_klub_z if is_zenska_liga else self.team_to_klub_m
        for c in candidates:
            n = normalize_klub_name(c)
            if not n:
                continue
            if n in m:
                return m[n]
            n_tokens = set(n.split())
            if not n_tokens:
                continue
            best = None
            for k_norm, (kid, kname) in m.items():
                k_tokens = set(k_norm.split())
                if not k_tokens:
                    continue
                # token-subset match in either direction
                if not (n_tokens <= k_tokens or k_tokens <= n_tokens):
                    continue
                shared = n_tokens & k_tokens
                # Require at least one shared token of length ≥ 4 to avoid noise like {"rk"}
                if not any(len(t) >= 4 for t in shared):
                    continue
                # Prefer lowest klub_id (canonical row, not godišnjak duplicate)
                if best is None or kid < best[0]:
                    best = (kid, kname)
            if best:
                return best
        return None

    # ─── HRS endpoints ─────────────────────────────────────────────────────
    def fetch_natjecanje(self, lid):
        """Fetch and parse the league JSON for ``lid``; None (and a log line)
        on any failure."""
        url = f"https://www.sportinfocentar2.com/coman/natjecanje{lid}.js"
        try:
            body = http_text(url, timeout=20)
            return parse_var_json(body, "natjecanjeobjekt")
        except Exception as e:
            self.log(f" ⚠ fetch_natjecanje({lid}): {e}")
            return None

    def fetch_match_stats(self, mid):
        """Fetch and parse the per-match player rows for ``mid``.

        Returns None when the response says "not authorized", starts with
        ``//``, or when fetching/parsing fails (failures are logged).
        """
        url = f"https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={mid}"
        try:
            body = http_text(url, timeout=15)
            stripped = body.strip()
            if "not authorized" in stripped.lower() or stripped.startswith("//"):
                return None
            return parse_var_json(body, "tab128")
        except Exception as e:
            self.log(f" ⚠ fetch_match({mid}): {e}")
            return None

    # ─── Aggregation & upserts ─────────────────────────────────────────────
    @staticmethod
    def _aggregate_player_stats(rows):
        """Sum one player's per-match rows into season totals.

        Each row counts as one appearance ("nastupi"); "sutd", "asistencija",
        "zutih" and "crvenih" are summed into "golovi", "asistencije", "zuti"
        and "crveni" (missing/empty values count as 0).
        """
        out = defaultdict(int)
        for r in rows:
            out["nastupi"] += 1
            out["golovi"] += int(r.get("sutd") or 0)
            out["asistencije"] += int(r.get("asistencija") or 0)
            out["zuti"] += int(r.get("zutih") or 0)
            out["crveni"] += int(r.get("crvenih") or 0)
        return dict(out)

    def upsert_klub_roster(self, klub_id, hrs_team_id, ekipa, sezona, raw):
        """Insert or refresh one pgz_sport.klub_roster row; errors are logged,
        not raised."""
        # NOTE(review): a failed statement is only logged here; whether the
        # connection needs a rollback afterwards depends on how the base
        # class configures self.conn (autocommit?) — confirm.
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO pgz_sport.klub_roster
                    (klub_id, sport, source, source_id, source_url, ekipa, sezona, raw_data)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                    ON CONFLICT (klub_id, source, source_id, ekipa, sezona) DO UPDATE
                    SET raw_data = EXCLUDED.raw_data, scraped_at = now()
                """, (klub_id, self.SPORT, self.SOURCE, str(hrs_team_id),
                      f"https://hrs.hr/natjecanje/?ekipa={hrs_team_id}",
                      ekipa, sezona, json.dumps(raw)))
        except Exception as e:
            self.log(f" ⚠ upsert_klub_roster: {e}")

    # ─── Main run ──────────────────────────────────────────────────────────
    def run(self, limit=999):
        """Harvest the first ``limit`` leagues of HRS_NATJECANJA (all of them
        when ``limit`` is falsy), upsert players and stats, then notify."""
        self.build_klub_maps()
        nat_ids = HRS_NATJECANJA[: int(limit)] if limit else HRS_NATJECANJA
        self.log(f"🤾 Starting HRS harvest. Natjecanja: {len(nat_ids)}")
        # (igrac, klub_id, natjecanje, sezona) → list of per-match rows
        agg = defaultdict(list)
        # same key → metadata captured the first time the player was seen
        clan_meta = {}
        for lid in nat_ids:
            self._harvest_league(lid, agg, clan_meta)
        self.log(f"━━ Aggregated keys: {len(agg)}, unique players: {len({k[0] for k in agg})}")
        upserted = self._persist_players(agg, clan_meta)
        self.log(f"✅ Done. {upserted} player_stats rows. "
                 f"Stats: {self.stats}. Unmatched HRS teams: {len(self.unmatched_teams)}")
        for t in sorted(self.unmatched_teams)[:30]:
            self.log(f" unmatched: {t}")
        self._notify_telegram(upserted)

    @staticmethod
    def _collect_league_matches(nat):
        """Return (team id → team name, list of match tuples) for one league.

        Matches lacking a number or either team id are skipped. Each match
        tuple is ``(mid, k1, e1, k2, e2, d)``.
        """
        team_idx = {}
        matches = []
        for liga in (nat.get("lige") or []):
            for u in (liga.get("utakmice") or []):
                mid = u.get("broj")
                k1, k2 = u.get("k1"), u.get("k2")
                e1, e2 = u.get("e1") or "", u.get("e2") or ""
                d = u.get("d") or u.get("pc")
                if not mid or not k1 or not k2:
                    continue
                if e1:
                    team_idx[k1] = e1
                if e2:
                    team_idx[k2] = e2
                matches.append((mid, k1, e1, k2, e2, d))
        return team_idx, matches

    def _harvest_league(self, lid, agg, clan_meta):
        """Fetch one league, collect player rows of PGŽ teams into ``agg`` /
        ``clan_meta`` (both mutated in place) and upsert the klub rosters."""
        nat = self.fetch_natjecanje(lid)
        if not nat:
            return
        naziv = nat.get("naziv") or f"natjecanje {lid}"
        spol_int = nat.get("spol", 0)
        naziv_lower = naziv.lower()
        is_zenska = ("- ž" in naziv_lower) or ("žene" in naziv_lower) or (spol_int == 1)
        spol = "Ž" if is_zenska else "M"
        kategorija = derive_kategorija(naziv)
        self.log(f"━━ Liga {lid}: {naziv} ({spol}, {kategorija})")

        team_idx, matches = self._collect_league_matches(nat)

        # HRS team id → (klub_id, naziv) for the teams that map to a PGŽ klub.
        pgz_team_ids = {}
        for tid, tname in team_idx.items():
            m = self.match_team(tname, is_zenska)
            if m:
                pgz_team_ids[tid] = m
            else:
                self.unmatched_teams.add(f"{tname} [{spol}]")
        if not pgz_team_ids:
            self.log(" · no PGŽ teams in this league")
            return
        self.log(" ✓ PGŽ teams: " + ", ".join(
            f"{tid}:{team_idx[tid]} → klub#{kid}"
            for tid, (kid, _) in pgz_team_ids.items()))

        roster_seen = {}
        for (mid, k1, e1, k2, e2, mdate) in matches:
            if k1 not in pgz_team_ids and k2 not in pgz_team_ids:
                continue
            # Fall back to the current season when the match date is unusable.
            sezona = derive_sezona(mdate) or "2025/2026"
            rows = self.fetch_match_stats(mid)
            if not rows:
                continue
            for r in rows:
                # rbekipa selects the side of the match: 1 → k1/e1, 2 → k2/e2.
                rb = r.get("rbekipa")
                if rb == 1:
                    hrs_team_id, ekipa_name = k1, e1
                elif rb == 2:
                    hrs_team_id, ekipa_name = k2, e2
                else:
                    continue
                if hrs_team_id not in pgz_team_ids:
                    continue
                klub_id, _ = pgz_team_ids[hrs_team_id]
                igrac = r.get("igrac")
                if not igrac:
                    continue
                ime = (r.get("ime") or "").strip()
                prezime = (r.get("prezime") or "").strip()
                rkey = (klub_id, hrs_team_id, sezona)
                if rkey not in roster_seen:
                    roster_seen[rkey] = (ekipa_name,
                                         {"hrs_team_id": hrs_team_id, "natjecanje": naziv})
                pkey = (igrac, klub_id, naziv, sezona)
                agg[pkey].append(r)
                if pkey not in clan_meta:
                    clan_meta[pkey] = {
                        "ime": ime, "prezime": prezime,
                        "hrs_team_id": hrs_team_id, "ekipa": ekipa_name,
                        "kategorija": kategorija,
                        "spol": spol,
                        "natjecanje": naziv, "lid": lid,
                    }
                self.stats["stats"] += 1
            # Small pause between per-match requests.
            time.sleep(0.05)
        for (klub_id, hrs_team_id, sezona), (ekipa_name, raw) in roster_seen.items():
            self.upsert_klub_roster(klub_id, hrs_team_id, ekipa_name, sezona, raw)

    def _persist_players(self, agg, clan_meta):
        """Upsert one clan and one aggregated stats row per key in ``agg``.

        Per-player failures are counted in ``self.stats["errors"]`` and
        logged; the loop continues.

        Returns:
            Number of stats rows upserted successfully.
        """
        upserted = 0
        for pkey, match_rows in agg.items():
            igrac, klub_id, naziv, sezona = pkey
            meta = clan_meta[pkey]
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub_id,
                    source_id=igrac,
                    ime=meta["ime"], prezime=meta["prezime"],
                    source_url=f"https://hrs.hr/natjecanje/?igrac={igrac}",
                    kategorija=meta["kategorija"],
                    sezona=sezona,
                    extra={"hrs_team_id": meta["hrs_team_id"],
                           "ekipa": meta["ekipa"], "spol": meta["spol"]},
                )
                self.stats["players"] += 1
                stats_dict = self._aggregate_player_stats(match_rows)
                self.upsert_stats(
                    clan_id=clan_id, sezona=sezona,
                    klub_id=klub_id, klub_naziv=meta["ekipa"],
                    natjecanje=naziv, kategorija=meta["kategorija"],
                    stats_dict=stats_dict,
                    raw={"matches": len(match_rows), "lid": meta["lid"]},
                )
                upserted += 1
            except Exception as e:
                self.stats["errors"] += 1
                self.log(f" ❌ upsert clan {igrac}: {e}")
        return upserted

    def _notify_telegram(self, upserted):
        """Best-effort Telegram summary; never raises."""
        import subprocess

        # SECURITY(review): the bot token below is a credential committed to
        # source control. Set HRS_TELEGRAM_BOT_TOKEN / HRS_TELEGRAM_CHAT_ID in
        # the environment, rotate this token, and then delete the literals.
        token = os.environ.get(
            "HRS_TELEGRAM_BOT_TOKEN",
            "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
        chat_id = os.environ.get("HRS_TELEGRAM_CHAT_ID", "7969491558")
        try:
            result = subprocess.run(
                ["curl", "-s", "-X", "POST",
                 f"https://api.telegram.org/bot{token}/sendMessage",
                 "-d", f"chat_id={chat_id}",
                 "--data-urlencode",
                 f"text=🤾 HRS rukomet harvest done. Players: {self.stats['players']}, "
                 f"stats rows: {upserted}, unmatched HRS teams: {len(self.unmatched_teams)}"],
                timeout=8, capture_output=True)
        except Exception as e:
            # Log the exception type only: subprocess errors embed the full
            # command line, which contains the bot token.
            self.log(f" ⚠ telegram notify failed: {type(e).__name__}")
            return
        if result.returncode != 0:
            self.log(f" ⚠ telegram notify: curl exited with {result.returncode}")
if __name__ == "__main__":
    # The optional first CLI argument caps how many natjecanja are processed.
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else 999
    HRSHarvester().run(limit=limit)