1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
490 lines
19 KiB
Python
Executable File
490 lines
19 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
hrs_handball.py — HRS Rukomet harvester v1.0
|
||
Authors: dradulic@outlook.com / damir@rinet.one
|
||
Date: 2026-05-05
|
||
Description:
|
||
Scrapes Hrvatski rukometni savez (HRS) competition data via the
|
||
sportinfocentar2.com JSON endpoints (no HTML rendering needed):
|
||
- https://www.sportinfocentar2.com/coman/natjecanje{LID}.js
|
||
→ league JSON: lige[].utakmice[] {broj, e1, e2, k1, k2, d, ...}
|
||
- https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={MID}
|
||
→ per-match player roster + box-score stats
|
||
Fuzzy-matches HRS team names to PGŽ priority handball clubs (~71) in
|
||
pgz_sport.klubovi, then aggregates each player's per-(klub, natjecanje, sezona)
|
||
totals into pgz_sport.player_stats; upserts pgz_sport.clanovi + clan_kategorije.
|
||
|
||
Run:
|
||
python3 /opt/pgz-sport/scripts/sport_harvesters/hrs_handball.py [LIMIT_NATJECANJA]
|
||
"""
|
||
import os, sys, re, json, time, unicodedata
|
||
import urllib.request
|
||
import urllib.error
|
||
from datetime import datetime, date
|
||
from collections import defaultdict
|
||
|
||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||
from __base import SportHarvester # noqa: E402
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
|
||
|
||
# ─── HRS league IDs (HRS top menu, 2025/26) ────────────────────────────────
|
||
# League (natjecanje) IDs to harvest. Each ID is substituted into the
# `coman/natjecanje{LID}.js` URL by HRSHarvester.fetch_natjecanje, and
# HRSHarvester.run() takes a prefix of this list when given a limit, so the
# order below is also the harvest priority order.
# "★ PGŽ" is the original author's marker on the "Zapad" (West) leagues.
HRS_NATJECANJA = [
    # Senior men
    1632,  # Paket24 Premijer liga (M)
    1633,  # 1. HRL Sjever - M
    1634,  # 1. HRL Jug - M
    1639,  # 2. HRL Istok - M
    1641,  # 2. HRL Zapad - M ★ PGŽ
    1642,  # 2. HRL Sjever - M
    1643,  # 2. HRL Jug - M
    1675,  # 3. HRL Istok - M
    1676,  # 3. HRL Sjever - M
    1677,  # 3. HRL Središte - M
    1678,  # 3. HRL Zapad - M ★ PGŽ
    1384,  # Međužupanijska liga (inter-county league)
    # Senior women
    1629,  # 1. HRL Žene
    1637,  # 2. HRL Sjever - Ž
    1638,  # 2. HRL Zapad - Ž ★ PGŽ
    1644,  # 2. HRL Jug - Ž
    1671,  # 3. HRL Sjever - Ž
    1672,  # 3. HRL Zapad - Ž ★ PGŽ
    1673,  # 3. HRL Središte - Ž
    1674,  # 3. HRL Istok - Ž
    # Youth men
    1389,  # 1. HRL U18 - M
    1705,  # 1. HRL U17 - M
    1763,  # 2. HRL U17 - M
    1706,  # 1. HRL U15 - M
    1716,  # 2. HRL U15 - M
    1707,  # 1. HRL U13 - M
    1717,  # 2. HRL U13 - M
    1746,  # 1. HRL U12 - M
    1709,  # 1. HRL U11 - M
    # Additional (links from the sidebar — used if they return a natjecanjeobjekt)
    1620, 1622, 1625, 1626, 1645, 1646,
    1761, 1762, 1773, 1753,
    1774, 1776, 1777, 1783, 1784, 1785, 1786, 1787, 1788,
    1796, 1797, 1818, 1834,
    1765, 1766,
    # Cups
    1092, 1302, 1303, 1441,
]
|
||
|
||
# Browser-like User-Agent sent with every request made by http_text().
UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")


def http_text(url, timeout=20, retries=2):
    """Plain HTTP GET → text, with a small retry on transient errors.

    Decoding: strict UTF-8 first; if the payload contains stray non-UTF-8
    bytes (the original author notes occasional cp1250 typographic quotes),
    fall back to UTF-8 with replacement characters so the bulk of the text
    stays Unicode-correct instead of being re-decoded as latin-1.

    Retry policy: network-level failures and 5xx/408/429 responses are
    retried up to ``retries`` times with a linear back-off (1.5 s, 3 s, ...).
    Other 4xx responses (e.g. 404 for a league ID that no longer exists) are
    permanent, so they fail immediately instead of sleeping through retries.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds passed to ``urlopen``.
        retries: Number of additional attempts after the first one.

    Returns:
        The response body decoded to ``str``.

    Raises:
        RuntimeError: when every attempt failed; the last underlying error
            is attached as ``__cause__``.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list (urllib.request already loads http.client).
    import http.client

    last = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
        except urllib.error.HTTPError as e:
            last = e
            # Client errors will not heal on retry (except timeout / rate limit).
            if 400 <= e.code < 500 and e.code not in (408, 429):
                break
        except (OSError, http.client.HTTPException) as e:
            # OSError covers URLError, TimeoutError, socket.timeout and
            # connection resets; HTTPException covers truncated reads.
            last = e
        else:
            try:
                return raw.decode("utf-8")
            except UnicodeDecodeError:
                return raw.decode("utf-8", errors="replace")
        if attempt < retries:
            time.sleep(1.5 * (attempt + 1))
    raise RuntimeError(f"GET {url} failed: {last}") from last
|
||
|
||
|
||
# Fix-ups for the lazy-JSON dialect. They are applied ONLY to text outside
# double-quoted string literals (see _STRING_OR_CODE_RE below).
_UNQUOTED_KEY_RE = re.compile(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
_TRAIL_COMMA_RE = re.compile(r',(\s*[}\]])')
_LEADING_ZERO_RE = re.compile(r'([\s:,\[])0+(\d)')
# Tokenizer: either one complete double-quoted string (with backslash
# escapes) or a run of characters containing no double quote.
_STRING_OR_CODE_RE = re.compile(r'"(?:\\.|[^"\\])*"|[^"]+', re.S)


def _normalize_lazy_json_token(match):
    """Return a string literal untouched; normalize any other segment."""
    token = match.group(0)
    if token.startswith('"'):
        # Quoted value or already-quoted key: must be preserved verbatim,
        # otherwise "18:00" would become "18:0" and "a, b: c" would get a
        # bogus quoted key injected.
        return token
    # Quote unquoted property names.
    token = _UNQUOTED_KEY_RE.sub(r'\1"\2":', token)
    # Strip JS leading-zero numbers (e.g. `mu: 018,`) that JSON rejects.
    token = _LEADING_ZERO_RE.sub(r'\1\2', token)
    # Drop trailing commas before a closing brace/bracket.
    return _TRAIL_COMMA_RE.sub(r'\1', token)


def parse_var_json(body, var_prefix):
    """Parse a ``var <name> = {...};`` JavaScript payload as JSON.

    Strips the optional ``var <name> =`` / ``<name> =`` wrapper and the
    trailing semicolon, then normalizes the lazy-JSON dialect (unquoted keys,
    leading zeros in numbers, trailing commas) before handing the text to
    ``json.loads``. Normalization is string-aware: the contents of
    double-quoted strings are never modified.

    Args:
        body: Raw response text.
        var_prefix: JavaScript variable name to strip (matched
            case-insensitively); if it is absent the body is parsed as-is.

    Returns:
        The decoded JSON value (dict, list, ...).

    Raises:
        json.JSONDecodeError: if the text is still not valid JSON after
            normalization.
    """
    body = body.strip()
    # Both forms occur: `var foo = ...` (coman/) and bare `foo = ...` (ziceri/).
    m = re.match(rf"^\s*(?:var\s+)?{re.escape(var_prefix)}\s*=\s*", body, re.I)
    if m:
        body = body[m.end():]
    body = body.rstrip().rstrip(";").rstrip()
    body = _STRING_OR_CODE_RE.sub(_normalize_lazy_json_token, body)
    return json.loads(body)
|
||
|
||
|
||
def derive_sezona(d):
    """Derive the Croatian sport season label from a calendar date.

    Jul–Dec of year Y belongs to season ``Y/Y+1``; Jan–Jun of year Y belongs
    to season ``Y-1/Y``.

    Args:
        d: A ``date``/``datetime``, or a string whose first 10 characters
            are an ISO date (``YYYY-MM-DD``). Anything else is treated as
            "unknown".

    Returns:
        The season as ``"YYYY/YYYY"``, or ``None`` when ``d`` is empty,
        unparseable, or of an unsupported type.
    """
    if not d:
        return None
    if isinstance(d, str):
        try:
            d = datetime.strptime(d[:10], "%Y-%m-%d").date()
        except ValueError:
            return None
    # datetime is a subclass of date, so both are accepted here. Other
    # truthy values (e.g. a numeric field) previously crashed on `.year`.
    if not isinstance(d, date):
        return None
    y = d.year
    if d.month >= 7:
        return f"{y}/{y + 1}"
    return f"{y - 1}/{y}"
|
||
|
||
|
||
def derive_kategorija(naziv):
    """Translate a competition name into a handball age-group label.

    The lowercased name is scanned for age markers in ascending order
    (U11, U12, U13, U15, U17, U18; both ``uNN`` and ``u-NN`` spellings) and
    the first marker found decides the label. A name without any marker —
    including ``None`` or an empty string — is classified as ``"seniori"``.
    """
    lowered = (naziv or "").lower()
    # Checked top to bottom; the first hit wins.
    age_groups = (
        (("u11", "u-11"), "mini U11"),
        (("u12", "u-12"), "mini U12"),
        (("u13", "u-13"), "dječaci U13"),
        (("u15", "u-15"), "mlađi kadeti U15"),
        (("u17", "u-17"), "kadeti U17"),
        (("u18", "u-18"), "juniori U18"),
    )
    for markers, label in age_groups:
        if any(marker in lowered for marker in markers):
            return label
    return "seniori"
|
||
|
||
|
||
# ─── Klub-name normalization for fuzzy match ──────────────────────────────
|
||
# Croatian diacritics → ASCII (upper-case variants map straight to lower case).
_DIA = str.maketrans("čćžšđČĆŽŠĐ", "cczsdcczsd")

# Leading organisational words / abbreviations that carry no identity.
# The abbreviations require a word boundary so that a real name which merely
# starts with the same letters (e.g. "Mrkopalj") is not truncated.
_PREFIX_RE = re.compile(
    r"^(?:"
    r"hrvatski\s+|muski\s+|zenski\s+|"
    r"rukometni\s+(?:klub|savez)\s+|"
    r"(?:hrk|mrk|zrk|rk)\b"
    r")\s*",
    re.I,
)
# Trailing " - <letters>" part (e.g. a location appended after a dash).
_TRAIL_LOC_RE = re.compile(r"\s*-\s*[a-z][a-z\s]*$", re.I)
# Second-team suffix: trailing "II" or "2".
_SUFFIX_2_RE = re.compile(r"\s+(?:ii|2)\s*$", re.I)
# Everything from "<digits>. u ..." to the end of the name.
_NUMERIC_LIGA_RE = re.compile(r"\d+\.\s+u\s+.*$", re.I)
# Parenthesised remarks.
_PAREN_RE = re.compile(r"\([^)]*\)")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_klub_name(name):
    """Aggressively normalize a Croatian handball club name to a comparable token.

    Steps, in order: drop parenthesised remarks and a trailing
    "<n>. u ..." fragment, fold diacritics, lowercase, repeatedly strip
    leading organisational prefixes ("rk", "zrk", "rukometni klub", ...),
    drop a second-team suffix ("II"/"2") and a trailing " - <letters>" part,
    then collapse every non-alphanumeric run to a single space.

    Args:
        name: Raw club/team name; any falsy value yields ``""``.

    Returns:
        A lowercase ASCII string of space-separated tokens (possibly empty).
    """
    if not name:
        return ""
    s = str(name).strip()
    s = _PAREN_RE.sub(" ", s)
    s = _NUMERIC_LIGA_RE.sub("", s)
    s = s.translate(_DIA)
    # Re-strip: removing a leading "(...)" leaves whitespace that would
    # otherwise defeat the ^-anchored prefix pattern.
    s = s.lower().strip()
    # Prefixes can stack ("hrvatski rukometni klub ..."), so strip to a fixpoint.
    while True:
        new = _PREFIX_RE.sub("", s)
        if new == s:
            break
        s = new
    s = _SUFFIX_2_RE.sub("", s)
    s = _TRAIL_LOC_RE.sub("", s)
    s = _NON_ALNUM_RE.sub(" ", s).strip()
    return s
|
||
|
||
|
||
def is_team_2nd(name):
    """Tell whether a team name ends with a second-team marker.

    The name is stripped and lowercased first; it counts as a second team
    when it ends in "ii" or "2" preceded by whitespace. ``None`` and empty
    names are never second teams.
    """
    cleaned = (name or "").strip().lower()
    suffix_hit = re.search(r"\s(?:ii|2)\s*$", cleaned)
    return suffix_hit is not None
|
||
|
||
|
||
def is_pgz_klub_candidate(naziv):
    """Decide whether a klubovi row looks like a real club.

    Rejected are names containing "savez", "udruga", "zbor" or "trener"
    (federations, associations, ...), and names that consist only of a club
    abbreviation optionally followed by one of a fixed list of junk bodies
    (e.g. 'RK RK').

    NOTE(review): the junk pattern is matched against the lowercased name
    without diacritic folding, so "rk pšr" does NOT match the "psr"
    alternative — confirm whether that is intended.
    """
    lowered = (naziv or "").lower()
    for marker in ("savez", "udruga", "zbor", "trener"):
        if marker in lowered:
            return False
    # Abbreviation-only rows with no real name body.
    junk = re.fullmatch(
        r"\s*(rk|zrk|mrk|hrk)\s*(rk|psr|psr selce|liburnija|mornar|omisalj)?\s*",
        lowered,
    )
    return junk is None
|
||
|
||
|
||
def is_zenska_klub(naziv):
    """Tell whether a club name denotes a women's club.

    The stripped, lowercased name is a women's club when it starts with
    "ženski"/"zenski", "ž ", "ž.", or the abbreviation "žrk "/"zrk ", or
    when it contains " žene"/" zene". Both the diacritic and the ASCII
    spelling of every marker are accepted; "žrk " was previously missing,
    so correctly spelled "ŽRK ..." names were treated as men's clubs.

    Args:
        naziv: Club name; ``None``/empty is treated as not a women's club.

    Returns:
        ``True`` for a women's club, otherwise ``False``.
    """
    n = (naziv or "").strip().lower()
    prefixes = ("ženski", "zenski", "ž ", "žrk ", "zrk ", "ž.")
    return n.startswith(prefixes) or " žene" in n or " zene" in n
|
||
|
||
|
||
# ─── Harvester ─────────────────────────────────────────────────────────────
|
||
class HRSHarvester(SportHarvester):
    """Harvest HRS handball rosters and per-player stats for PGŽ clubs.

    Flow (see ``run``): build name→club lookup maps from the priority-club
    view, fetch every league in ``HRS_NATJECANJA``, fuzzy-match the league's
    teams to those clubs, fetch the per-match stats rows for matches that
    involve a matched team, and finally upsert one member + one aggregated
    stats row per (player, club, competition, season).

    ``self.conn``, ``self.log``, ``self.stats``, ``self.upsert_clan`` and
    ``self.upsert_stats`` are not defined here — presumably provided by
    ``SportHarvester``; verify against the base class.
    """

    SPORT = "rukomet"
    SOURCE = "hrs"

    def __init__(self):
        super().__init__()
        # normalized club name -> (klub_id, naziv), split by gender
        self.team_to_klub_m = {}
        self.team_to_klub_z = {}
        # "<team name> [M|Ž]" labels of HRS teams that matched no club
        self.unmatched_teams = set()

    # Override base — per the original author's note the base implementation
    # filters on financiran||u_godisnjaku (only 3 rukomet rows), while the
    # brief mandates ALL 71 PGŽ priority rukomet clubs.
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs of this sport, ordered by id.

        Rows are dict-like (``RealDictCursor``) with keys id, naziv, sport.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, naziv, sport
                FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def build_klub_maps(self):
        """Fill the men's/women's normalized-name → (id, naziv) maps.

        Non-club rows and names that normalize to nothing are skipped. When
        two rows normalize to the same name, the lower id wins.
        """
        rows = self.get_target_klubovi(999)
        for r in rows:
            naziv = r["naziv"]
            if not is_pgz_klub_candidate(naziv):
                continue
            norm = normalize_klub_name(naziv)
            if not norm:
                continue
            target = self.team_to_klub_z if is_zenska_klub(naziv) else self.team_to_klub_m
            cur = target.get(norm)
            if cur is None or r["id"] < cur[0]:
                target[norm] = (r["id"], naziv)
        self.log(f"klub maps: men={len(self.team_to_klub_m)} women={len(self.team_to_klub_z)}")

    def match_team(self, hrs_team_name, is_zenska_liga):
        """Match an HRS team name to a club: direct hit, then token-subset.

        Tokens come from ``normalize_klub_name``. For a second team
        ("... II" / "... 2") the name without the suffix is tried as well.

        Args:
            hrs_team_name: Team name as published by HRS.
            is_zenska_liga: Selects the women's map when true, else men's.

        Returns:
            ``(klub_id, naziv)`` of the matched club, or ``None``.
        """
        if not hrs_team_name:
            return None
        candidates = [hrs_team_name]
        if is_team_2nd(hrs_team_name):
            # Case-insensitive, to agree with is_team_2nd (which lowercases);
            # without the flag an upper-case "II" was never removed here.
            candidates.append(
                re.sub(r"\s+(?:ii|2)\s*$", "", hrs_team_name, flags=re.I).strip())
        m = self.team_to_klub_z if is_zenska_liga else self.team_to_klub_m
        for c in candidates:
            n = normalize_klub_name(c)
            if not n:
                continue
            if n in m:
                return m[n]
            n_tokens = set(n.split())
            if not n_tokens:
                continue
            best = None
            for k_norm, (kid, kname) in m.items():
                k_tokens = set(k_norm.split())
                if not k_tokens:
                    continue
                # token-subset match in either direction
                if not (n_tokens <= k_tokens or k_tokens <= n_tokens):
                    continue
                shared = n_tokens & k_tokens
                # Require at least one shared token of length ≥ 4 to avoid noise like {"rk"}
                if not any(len(t) >= 4 for t in shared):
                    continue
                # Prefer lowest klub_id (canonical row, not a yearbook duplicate)
                if best is None or kid < best[0]:
                    best = (kid, kname)
            if best:
                return best
        return None

    # ─── HRS endpoints ─────────────────────────────────────────────────────
    def fetch_natjecanje(self, lid):
        """Fetch and parse one league file; log and return ``None`` on any error."""
        url = f"https://www.sportinfocentar2.com/coman/natjecanje{lid}.js"
        try:
            body = http_text(url, timeout=20)
            return parse_var_json(body, "natjecanjeobjekt")
        except Exception as e:
            self.log(f" ⚠ fetch_natjecanje({lid}): {e}")
            return None

    def fetch_match_stats(self, mid):
        """Fetch and parse the per-match stats payload for match ``mid``.

        Returns ``None`` when the endpoint answers "not authorized", when the
        body starts with ``//``, or on any fetch/parse error (which is logged).
        """
        url = f"https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={mid}"
        try:
            body = http_text(url, timeout=15)
            stripped = body.strip()
            if "not authorized" in stripped.lower() or stripped.startswith("//"):
                return None
            return parse_var_json(body, "tab128")
        except Exception as e:
            self.log(f" ⚠ fetch_match({mid}): {e}")
            return None

    # ─── Aggregation & upserts ─────────────────────────────────────────────
    @staticmethod
    def _aggregate_player_stats(rows):
        """Sum one player's per-match rows into season totals.

        Every row counts as one appearance ("nastupi"); the source fields
        sutd / asistencija / zutih / crvenih are summed into golovi /
        asistencije / zuti / crveni. Missing or falsy values count as 0.
        """
        out = defaultdict(int)
        for r in rows:
            out["nastupi"] += 1
            out["golovi"] += int(r.get("sutd") or 0)
            out["asistencije"] += int(r.get("asistencija") or 0)
            out["zuti"] += int(r.get("zutih") or 0)
            out["crveni"] += int(r.get("crvenih") or 0)
        return dict(out)

    def upsert_klub_roster(self, klub_id, hrs_team_id, ekipa, sezona, raw):
        """Insert or refresh one klub_roster row; errors are logged, not raised.

        NOTE(review): nothing here rolls back after a failed statement; if
        ``self.conn`` is not in autocommit mode a failure may leave the
        transaction aborted — confirm against the base class.
        """
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO pgz_sport.klub_roster
                    (klub_id, sport, source, source_id, source_url, ekipa, sezona, raw_data)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                    ON CONFLICT (klub_id, source, source_id, ekipa, sezona) DO UPDATE
                    SET raw_data = EXCLUDED.raw_data, scraped_at = now()
                """, (klub_id, self.SPORT, self.SOURCE, str(hrs_team_id),
                      f"https://hrs.hr/natjecanje/?ekipa={hrs_team_id}",
                      ekipa, sezona, json.dumps(raw)))
        except Exception as e:
            self.log(f" ⚠ upsert_klub_roster: {e}")

    # ─── Main run ──────────────────────────────────────────────────────────
    @staticmethod
    def _collect_matches(nat):
        """Extract (team_id → team name, match tuples) from a league payload.

        Matches lacking a match number or either team id are skipped, as are
        malformed (non-dict) entries. Each match tuple is
        ``(mid, k1, e1, k2, e2, date)``, where the date is field ``d`` or,
        failing that, ``pc``.
        """
        team_idx = {}
        matches = []
        for liga in (nat.get("lige") or []):
            if not isinstance(liga, dict):
                continue
            for u in (liga.get("utakmice") or []):
                if not isinstance(u, dict):
                    continue
                mid = u.get("broj")
                k1, k2 = u.get("k1"), u.get("k2")
                e1, e2 = u.get("e1") or "", u.get("e2") or ""
                d = u.get("d") or u.get("pc")
                if not mid or not k1 or not k2:
                    continue
                if e1:
                    team_idx[k1] = e1
                if e2:
                    team_idx[k2] = e2
                matches.append((mid, k1, e1, k2, e2, d))
        return team_idx, matches

    def _notify_telegram(self, upserted):
        """Best-effort Telegram summary; never raises.

        Credentials are read from the environment variables
        TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID; the notification is skipped
        when either is unset. SECURITY: this method used to embed a bot
        token and chat id in the source and pass the token on a curl command
        line — that token must be considered leaked and should be revoked.
        """
        token = os.environ.get("TELEGRAM_BOT_TOKEN")
        chat_id = os.environ.get("TELEGRAM_CHAT_ID")
        if not token or not chat_id:
            self.log(" · telegram notification skipped "
                     "(TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID not set)")
            return
        try:
            from urllib.parse import urlencode

            text = (f"🤾 HRS rukomet harvest done. Players: {self.stats['players']}, "
                    f"stats rows: {upserted}, unmatched HRS teams: {len(self.unmatched_teams)}")
            payload = urlencode({"chat_id": chat_id, "text": text}).encode("utf-8")
            req = urllib.request.Request(
                f"https://api.telegram.org/bot{token}/sendMessage", data=payload)
            with urllib.request.urlopen(req, timeout=8):
                pass
        except Exception as e:
            # Log only the error type so the token-bearing URL cannot leak.
            self.log(f" ⚠ telegram notify failed: {type(e).__name__}")

    def run(self, limit=999):
        """Harvest up to ``limit`` leagues and upsert rosters, members and stats.

        Args:
            limit: Number of leading entries of ``HRS_NATJECANJA`` to process;
                a falsy value processes all of them.
        """
        self.build_klub_maps()
        nat_ids = HRS_NATJECANJA[: int(limit)] if limit else HRS_NATJECANJA
        self.log(f"🤾 Starting HRS harvest. Natjecanja: {len(nat_ids)}")

        # (igrac, klub_id, natjecanje, sezona) -> list of per-match stat rows
        agg = defaultdict(list)
        # same key -> descriptive metadata captured at first sighting
        clan_meta = {}

        for lid in nat_ids:
            nat = self.fetch_natjecanje(lid)
            if not nat:
                continue
            if not isinstance(nat, dict):
                # Parsed fine but is not the expected object; skip the league
                # instead of crashing the whole run on `.get`.
                self.log(f" ⚠ natjecanje {lid}: unexpected payload type {type(nat).__name__}")
                continue
            naziv = nat.get("naziv") or f"natjecanje {lid}"
            spol_int = nat.get("spol", 0)
            is_zenska = ("- ž" in naziv.lower()) or ("žene" in naziv.lower()) or (spol_int == 1)
            kategorija = derive_kategorija(naziv)
            self.log(f"━━ Liga {lid}: {naziv} ({'Ž' if is_zenska else 'M'}, {kategorija})")

            team_idx, matches = self._collect_matches(nat)

            # HRS team id -> (klub_id, naziv) for teams that map to a PGŽ club
            pgz_team_ids = {}
            for tid, tname in team_idx.items():
                m = self.match_team(tname, is_zenska)
                if m:
                    pgz_team_ids[tid] = m
                else:
                    self.unmatched_teams.add(f"{tname} [{ 'Ž' if is_zenska else 'M' }]")

            if not pgz_team_ids:
                self.log(" · no PGŽ teams in this league")
                continue
            self.log(" ✓ PGŽ teams: " + ", ".join(
                f"{tid}:{team_idx[tid]} → klub#{kid}"
                for tid, (kid, _) in pgz_team_ids.items()))

            # (klub_id, hrs_team_id, sezona) -> (team name, raw payload)
            roster_seen = {}

            for (mid, k1, e1, k2, e2, mdate) in matches:
                if k1 not in pgz_team_ids and k2 not in pgz_team_ids:
                    continue
                # Falls back to the current season when the date is unusable.
                sezona = derive_sezona(mdate) or "2025/2026"
                rows = self.fetch_match_stats(mid)
                if not rows:
                    continue
                if not isinstance(rows, list):
                    self.log(f" ⚠ match {mid}: unexpected stats payload type {type(rows).__name__}")
                    continue
                for r in rows:
                    if not isinstance(r, dict):
                        continue
                    # rbekipa tells which side of the match the row belongs to.
                    rb = r.get("rbekipa")
                    if rb == 1:
                        hrs_team_id, ekipa_name = k1, e1
                    elif rb == 2:
                        hrs_team_id, ekipa_name = k2, e2
                    else:
                        continue
                    if hrs_team_id not in pgz_team_ids:
                        continue
                    klub_id, klub_naziv = pgz_team_ids[hrs_team_id]
                    igrac = r.get("igrac")
                    if not igrac:
                        continue
                    ime = (r.get("ime") or "").strip()
                    prezime = (r.get("prezime") or "").strip()
                    rkey = (klub_id, hrs_team_id, sezona)
                    if rkey not in roster_seen:
                        roster_seen[rkey] = (ekipa_name,
                                             {"hrs_team_id": hrs_team_id, "natjecanje": naziv})
                    pkey = (igrac, klub_id, naziv, sezona)
                    agg[pkey].append(r)
                    if pkey not in clan_meta:
                        clan_meta[pkey] = {
                            "ime": ime, "prezime": prezime,
                            "hrs_team_id": hrs_team_id, "ekipa": ekipa_name,
                            "kategorija": kategorija,
                            "spol": "Ž" if is_zenska else "M",
                            "natjecanje": naziv, "lid": lid,
                        }
                    self.stats["stats"] += 1
                # Small politeness delay between match requests.
                time.sleep(0.05)

            for (klub_id, hrs_team_id, sezona), (ekipa_name, raw) in roster_seen.items():
                self.upsert_klub_roster(klub_id, hrs_team_id, ekipa_name, sezona, raw)

        self.log(f"━━ Aggregated keys: {len(agg)}, unique players: {len({k[0] for k in agg})}")
        upserted = 0
        for (igrac, klub_id, naziv, sezona), match_rows in agg.items():
            meta = clan_meta[(igrac, klub_id, naziv, sezona)]
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub_id,
                    source_id=igrac,
                    ime=meta["ime"], prezime=meta["prezime"],
                    source_url=f"https://hrs.hr/natjecanje/?igrac={igrac}",
                    kategorija=meta["kategorija"],
                    sezona=sezona,
                    extra={"hrs_team_id": meta["hrs_team_id"],
                           "ekipa": meta["ekipa"], "spol": meta["spol"]},
                )
                self.stats["players"] += 1
                stats_dict = self._aggregate_player_stats(match_rows)
                self.upsert_stats(
                    clan_id=clan_id, sezona=sezona,
                    klub_id=klub_id, klub_naziv=meta["ekipa"],
                    natjecanje=naziv, kategorija=meta["kategorija"],
                    stats_dict=stats_dict,
                    raw={"matches": len(match_rows), "lid": meta["lid"]},
                )
                upserted += 1
            except Exception as e:
                # One bad player must not abort the remaining upserts.
                self.stats["errors"] += 1
                self.log(f" ❌ upsert clan {igrac}: {e}")

        self.log(f"✅ Done. {upserted} player_stats rows. "
                 f"Stats: {self.stats}. Unmatched HRS teams: {len(self.unmatched_teams)}")
        for t in sorted(self.unmatched_teams)[:30]:
            self.log(f" unmatched: {t}")

        self._notify_telegram(upserted)
|
||
|
||
|
||
if __name__ == "__main__":
    # Optional first CLI argument: how many leagues (from the start of
    # HRS_NATJECANJA) to harvest; defaults to 999.
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else 999
    HRSHarvester().run(limit=limit)
|