#!/usr/bin/env python3
"""
hrs_handball.py — HRS Rukomet harvester v1.0
Authors: dradulic@outlook.com / damir@rinet.one
Date:    2026-05-05

Description:
  Scrapes Hrvatski rukometni savez (HRS) competition data via the
  sportinfocentar2.com JSON endpoints (no HTML rendering needed):
    - https://www.sportinfocentar2.com/coman/natjecanje{LID}.js
        → league JSON: lige[].utakmice[] {broj, e1, e2, k1, k2, d, ...}
    - https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={MID}
        → per-match player roster + box-score stats

  Fuzzy-matches HRS team names to PGŽ priority handball clubs (~71) in
  pgz_sport.klubovi, then aggregates each player's per-(klub, natjecanje,
  sezona) totals into pgz_sport.player_stats; upserts pgz_sport.clanovi +
  clan_kategorije.

Run:
  python3 /opt/pgz-sport/scripts/sport_harvesters/hrs_handball.py [LIMIT_NATJECANJA]
"""

import json
import os
import re
import sys
import time
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
from collections import defaultdict
from datetime import datetime, date

sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester  # noqa: E402

import psycopg2  # noqa: E402
from psycopg2.extras import RealDictCursor  # noqa: E402

# ─── HRS league IDs (HRS top menu, 2025/26) ────────────────────────────────
HRS_NATJECANJA = [
    # Seniors, men
    1632,  # Paket24 Premijer liga (M)
    1633,  # 1. HRL Sjever - M
    1634,  # 1. HRL Jug - M
    1639,  # 2. HRL Istok - M
    1641,  # 2. HRL Zapad - M ★ PGŽ
    1642,  # 2. HRL Sjever - M
    1643,  # 2. HRL Jug - M
    1675,  # 3. HRL Istok - M
    1676,  # 3. HRL Sjever - M
    1677,  # 3. HRL Središte - M
    1678,  # 3. HRL Zapad - M ★ PGŽ
    1384,  # Međužupanijska liga
    # Seniors, women
    1629,  # 1. HRL Žene
    1637,  # 2. HRL Sjever - Ž
    1638,  # 2. HRL Zapad - Ž ★ PGŽ
    1644,  # 2. HRL Jug - Ž
    1671,  # 3. HRL Sjever - Ž
    1672,  # 3. HRL Zapad - Ž ★ PGŽ
    1673,  # 3. HRL Središte - Ž
    1674,  # 3. HRL Istok - Ž
    # Youth, men
    1389,  # 1. HRL U18 - M
    1705,  # 1. HRL U17 - M
    1763,  # 2. HRL U17 - M
    1706,  # 1. HRL U15 - M
    1716,  # 2. HRL U15 - M
    1707,  # 1. HRL U13 - M
    1717,  # 2. HRL U13 - M
    1746,  # 1. HRL U12 - M
    1709,  # 1. HRL U11 - M
    # Additional (links from the sidebar — kept in case they return a natjecanjeobjekt)
    1620, 1622, 1625, 1626, 1645, 1646, 1761, 1762, 1773, 1753, 1774, 1776,
    1777, 1783, 1784, 1785, 1786, 1787, 1788, 1796, 1797, 1818, 1834, 1765,
    1766,
    # Cups
    1092, 1302, 1303, 1441,
]

UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

# Season label used when a match date is missing or cannot be parsed.
DEFAULT_SEZONA = "2025/2026"

# SECURITY(review): these credentials used to be hard-coded inside run(). They
# are kept as fallbacks so existing deployments keep sending the completion
# message, but the token should be rotated and then supplied only through the
# environment (HRS_TELEGRAM_BOT_TOKEN / HRS_TELEGRAM_CHAT_ID).
TELEGRAM_BOT_TOKEN = os.environ.get(
    "HRS_TELEGRAM_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TELEGRAM_CHAT_ID = os.environ.get("HRS_TELEGRAM_CHAT_ID", "7969491558")

# 4xx statuses that are still worth retrying (request timeout, rate limiting).
_RETRYABLE_4XX = frozenset({408, 429})


def http_text(url, timeout=20, retries=2):
    """Plain HTTP GET → text; small retry on transient errors.

    sportinfocentar2 files are mostly UTF-8 but occasionally contain stray
    cp1250 bytes (e.g. typographic quotes from Word), so a strict-utf8 decode
    can fail. Strategy: strict utf-8 first; on failure fall back to
    utf-8/replace (keeps the bulk of the file Unicode-correct rather than
    re-decoding as latin-1).

    Args:
        url: Absolute URL to fetch.
        timeout: Per-request timeout in seconds.
        retries: Number of additional attempts after the first one.

    Returns:
        The decoded response body.

    Raises:
        RuntimeError: When every attempt failed; the last underlying error is
            attached as the cause.
    """
    last = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
            try:
                return raw.decode("utf-8")
            except UnicodeDecodeError:
                return raw.decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            last = e
            # A 404/403/... will not change on retry; fail fast instead of
            # sleeping through the back-off for every missing league id.
            if 400 <= e.code < 500 and e.code not in _RETRYABLE_4XX:
                break
        except (urllib.error.URLError, TimeoutError) as e:
            last = e
        if attempt < retries:
            time.sleep(1.5 * (attempt + 1))
    raise RuntimeError(f"GET {url} failed: {last}") from last


_UNQUOTED_KEY_RE = re.compile(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
_TRAIL_COMMA_RE = re.compile(r',(\s*[}\]])')
_LEADING_ZERO_RE = re.compile(r'([\s:,\[])0+(\d)')
# A double-quoted string literal, including backslash escapes. The capturing
# group makes re.split() keep the literals (at the odd indices of the result).
_JSON_STRING_RE = re.compile(r'("(?:\\.|[^"\\])*")', re.S)


def _normalize_lazy_json(fragment):
    """Normalize one piece of lazy-JSON text that lies outside string literals."""
    # Quote unquoted property names (already-quoted keys never reach here).
    fragment = _UNQUOTED_KEY_RE.sub(r'\1"\2":', fragment)
    # Strip JS leading-zero numbers (e.g. `mu: 018,`) that JSON rejects.
    fragment = _LEADING_ZERO_RE.sub(r'\1\2', fragment)
    # Drop trailing commas before a closing brace/bracket.
    return _TRAIL_COMMA_RE.sub(r'\1', fragment)


def parse_var_json(body, var_prefix):
    """Strip the ``var NAME =`` wrapper and parse the lazy-JSON payload.

    sportinfocentar2 emits a relaxed dialect (unquoted keys, leading zeros in
    numbers, trailing commas). The fix-ups are applied only to the text
    *between* double-quoted string literals, so values such as ``"18:00"`` or
    ``"Zamet 07"`` are left untouched.

    Args:
        body: Raw response text.
        var_prefix: Variable name the payload is assigned to; matched
            case-insensitively, with or without a leading ``var``.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the normalized text is still not valid JSON.
    """
    body = body.strip()
    # Both forms occur: `var foo = ...` (coman/) and bare `foo = ...` (ziceri/).
    m = re.match(rf"^\s*(?:var\s+)?{re.escape(var_prefix)}\s*=\s*", body, re.I)
    if m:
        body = body[m.end():]
    body = body.rstrip().rstrip(";").rstrip()
    pieces = _JSON_STRING_RE.split(body)
    body = "".join(
        piece if idx % 2 else _normalize_lazy_json(piece)
        for idx, piece in enumerate(pieces)
    )
    return json.loads(body)


def derive_sezona(d):
    """Croatian sport season from a calendar date: Jul–Dec → YYYY/YYYY+1.

    Args:
        d: A ``date``/``datetime`` (or any object exposing ``year`` and
            ``month``), or a string whose first ten characters are
            ``YYYY-MM-DD``.

    Returns:
        ``"YYYY/YYYY"`` season label, or ``None`` when *d* is empty, is not
        parseable, or is of an unsupported type.
    """
    if not d:
        return None
    if isinstance(d, str):
        try:
            d = datetime.strptime(d[:10], "%Y-%m-%d").date()
        except ValueError:
            return None
    try:
        y, month = d.year, d.month
    except AttributeError:
        # e.g. a bare number from the feed — treat as "unknown date".
        return None
    if month >= 7:
        return f"{y}/{y + 1}"
    return f"{y - 1}/{y}"


# Checked in this order; the first age tag found in the league name wins.
_KATEGORIJE = (
    ("11", "mini U11"),
    ("12", "mini U12"),
    ("13", "dječaci U13"),
    ("15", "mlađi kadeti U15"),
    ("17", "kadeti U17"),
    ("18", "juniori U18"),
)


def derive_kategorija(naziv):
    """Map natjecanje naziv → kategorija (handball age groups).

    Looks for ``uNN`` / ``u-NN`` in the lower-cased name; anything without an
    age tag is classified as ``"seniori"``.
    """
    n = (naziv or "").lower()
    for age, label in _KATEGORIJE:
        if f"u{age}" in n or f"u-{age}" in n:
            return label
    return "seniori"


# ─── Klub-name normalization for fuzzy match ──────────────────────────────
_DIA = str.maketrans("čćžšđČĆŽŠĐ", "cczsdcczsd")
_PREFIX_RE = re.compile(
    r"^(?:"
    r"hrvatski\s+|muski\s+|zenski\s+|"
    r"rukometni\s+(?:klub|savez)\s+|"
    # \b keeps the short abbreviations from eating into a real name that
    # merely starts with the same letters (e.g. "mrkopalj").
    r"(?:hrk|mrk|zrk|rk)\b"
    r")\s*",
    re.I,
)
_TRAIL_LOC_RE = re.compile(r"\s*-\s*[a-z][a-z\s]*$", re.I)
_SUFFIX_2_RE = re.compile(r"\s+(?:ii|2)\s*$", re.I)
_NUMERIC_LIGA_RE = re.compile(r"\d+\.\s+u\s+.*$", re.I)
_PAREN_RE = re.compile(r"\([^)]*\)")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")
_JUNK_KLUB_RE = re.compile(
    r"\s*(rk|zrk|mrk|hrk)\s*(rk|psr|psr selce|liburnija|mornar|omisalj)?\s*")


def normalize_klub_name(name):
    """Aggressively normalize a Croatian handball club name to a comparable token.

    Steps, in order: drop parenthesised text, drop a trailing ``N. u ...``
    league fragment, fold Croatian diacritics, lower-case, repeatedly strip
    club-type prefixes, strip a second-team suffix (``II`` / ``2``), strip a
    trailing ``- location`` part, and collapse every non-alphanumeric run to a
    single space.

    Returns:
        The normalized name, or ``""`` for empty input.
    """
    if not name:
        return ""
    s = str(name).strip()
    s = _PAREN_RE.sub(" ", s)
    s = _NUMERIC_LIGA_RE.sub("", s)
    s = s.translate(_DIA)
    s = s.lower()
    # Prefixes can be stacked ("hrvatski rukometni klub ..."), so strip until
    # nothing changes.
    while True:
        stripped = _PREFIX_RE.sub("", s)
        if stripped == s:
            break
        s = stripped
    s = _SUFFIX_2_RE.sub("", s)
    s = _TRAIL_LOC_RE.sub("", s)
    s = _NON_ALNUM_RE.sub(" ", s).strip()
    return s


def is_team_2nd(name):
    """Return True when *name* ends in a second-team marker (``II`` or ``2``)."""
    n = (name or "").strip().lower()
    return bool(_SUFFIX_2_RE.search(n))


def is_pgz_klub_candidate(naziv):
    """Filter out savezi / udruge / zborovi / clearly non-club rows."""
    n = (naziv or "").lower()
    bad = ("savez", "udruga", "zbor", "trener")
    if any(b in n for b in bad):
        return False
    # Junk like 'RK RK' or 'RK PŠR' (no real name body).
    # NOTE(review): n is lower-cased but diacritics are not folded, so the
    # pattern matches 'rk psr' but not 'rk pšr' / 'rk omišalj' — confirm which
    # spelling the junk rows actually use before changing this.
    if _JUNK_KLUB_RE.fullmatch(n):
        return False
    return True


def is_zenska_klub(naziv):
    """Return True when the club name marks a women's club."""
    n = (naziv or "").strip().lower()
    if n.startswith(("ženski", "zenski", "ž ", "zrk ", "ž.")):
        return True
    return " žene" in n or " zene" in n


# ─── Harvester ─────────────────────────────────────────────────────────────
class HRSHarvester(SportHarvester):
    """Harvest HRS handball rosters and per-player stats for PGŽ clubs."""

    SPORT = "rukomet"
    SOURCE = "hrs"

    def __init__(self):
        super().__init__()
        # normalized club name → (klub_id, naziv), split by men's / women's clubs
        self.team_to_klub_m = {}
        self.team_to_klub_z = {}
        # "<HRS team name> [M|Ž]" labels that could not be matched to a klub
        self.unmatched_teams = set()

    # Override base — base filters financiran||u_godisnjaku (only 3 rukomet rows).
    # Brief mandates ALL 71 PGŽ priority rukomet klubova.
    def get_target_klubovi(self, limit=999):
        """Return up to *limit* priority-klub rows (id, naziv, sport) for this sport."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, naziv, sport
                FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def build_klub_maps(self):
        """Fill the men's / women's normalized-name → (klub_id, naziv) maps."""
        rows = self.get_target_klubovi(999)
        for r in rows:
            naziv = r["naziv"]
            if not is_pgz_klub_candidate(naziv):
                continue
            norm = normalize_klub_name(naziv)
            if not norm:
                continue
            target = self.team_to_klub_z if is_zenska_klub(naziv) else self.team_to_klub_m
            existing = target.get(norm)
            # When several rows normalize to the same name keep the lowest id.
            if existing is None or r["id"] < existing[0]:
                target[norm] = (r["id"], naziv)
        self.log(f"klub maps: men={len(self.team_to_klub_m)} women={len(self.team_to_klub_z)}")

    def match_team(self, hrs_team_name, is_zenska_liga):
        """Direct → token-subset → fallback. Tokens come from normalize_klub_name.

        Args:
            hrs_team_name: Team name as it appears in the HRS feed.
            is_zenska_liga: Selects the women's map when true, else the men's.

        Returns:
            ``(klub_id, naziv)`` of the matched klub, or ``None``.
        """
        if not hrs_team_name:
            return None
        candidates = [hrs_team_name]
        if is_team_2nd(hrs_team_name):
            candidates.append(_SUFFIX_2_RE.sub("", hrs_team_name).strip())
        m = self.team_to_klub_z if is_zenska_liga else self.team_to_klub_m
        for c in candidates:
            n = normalize_klub_name(c)
            if not n:
                continue
            if n in m:
                return m[n]
            n_tokens = set(n.split())
            if not n_tokens:
                continue
            best = None
            for k_norm, (kid, kname) in m.items():
                k_tokens = set(k_norm.split())
                if not k_tokens:
                    continue
                # token-subset match in either direction
                if not (n_tokens <= k_tokens or k_tokens <= n_tokens):
                    continue
                shared = n_tokens & k_tokens
                # Require at least one shared token of length ≥ 4 to avoid noise like {"rk"}
                if not any(len(t) >= 4 for t in shared):
                    continue
                # Prefer lowest klub_id (canonical row, not godišnjak duplicate)
                if best is None or kid < best[0]:
                    best = (kid, kname)
            if best:
                return best
        return None

    # ─── HRS endpoints ─────────────────────────────────────────────────────
    def fetch_natjecanje(self, lid):
        """Fetch and parse the league JSON for *lid*; log and return None on any error."""
        url = f"https://www.sportinfocentar2.com/coman/natjecanje{lid}.js"
        try:
            body = http_text(url, timeout=20)
            return parse_var_json(body, "natjecanjeobjekt")
        except Exception as e:
            self.log(f" ⚠ fetch_natjecanje({lid}): {e}")
            return None

    def fetch_match_stats(self, mid):
        """Fetch the tab128 per-player rows for match *mid*.

        Returns None when the endpoint answers "not authorized", when the body
        starts with ``//``, or on any fetch/parse error (which is logged).
        """
        url = f"https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={mid}"
        try:
            body = http_text(url, timeout=15)
            stripped = body.strip()
            if "not authorized" in stripped.lower() or stripped.startswith("//"):
                return None
            return parse_var_json(body, "tab128")
        except Exception as e:
            self.log(f" ⚠ fetch_match({mid}): {e}")
            return None

    # ─── Aggregation & upserts ─────────────────────────────────────────────
    @staticmethod
    def _aggregate_player_stats(rows):
        """Sum one player's per-match rows into season totals.

        Each row counts as one appearance (``nastupi``). Feed fields map to
        output keys as: sutd → golovi, asistencija → asistencije,
        zutih → zuti, crvenih → crveni. Missing or empty values count as 0.
        """
        out = defaultdict(int)
        for r in rows:
            out["nastupi"] += 1
            out["golovi"] += int(r.get("sutd") or 0)
            out["asistencije"] += int(r.get("asistencija") or 0)
            out["zuti"] += int(r.get("zutih") or 0)
            out["crveni"] += int(r.get("crvenih") or 0)
        return dict(out)

    def upsert_klub_roster(self, klub_id, hrs_team_id, ekipa, sezona, raw):
        """Insert or refresh one klub_roster row; errors are logged, not raised."""
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO pgz_sport.klub_roster
                        (klub_id, sport, source, source_id, source_url, ekipa, sezona, raw_data)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                    ON CONFLICT (klub_id, source, source_id, ekipa, sezona)
                    DO UPDATE SET raw_data = EXCLUDED.raw_data, scraped_at = now()
                """, (klub_id, self.SPORT, self.SOURCE, str(hrs_team_id),
                      f"https://hrs.hr/natjecanje/?ekipa={hrs_team_id}",
                      ekipa, sezona, json.dumps(raw)))
        except Exception as e:
            self.log(f" ⚠ upsert_klub_roster: {e}")

    # ─── Main run ──────────────────────────────────────────────────────────
    @staticmethod
    def _index_matches(nat):
        """Collect teams and matches from a league object.

        Returns:
            ``(team_idx, matches)`` where ``team_idx`` maps HRS team id → team
            name and ``matches`` is a list of ``(mid, k1, e1, k2, e2, date)``.
            Matches lacking an id or either team id are skipped.
        """
        team_idx = {}
        matches = []
        for liga in (nat.get("lige") or []):
            for u in (liga.get("utakmice") or []):
                mid = u.get("broj")
                k1, k2 = u.get("k1"), u.get("k2")
                e1, e2 = u.get("e1") or "", u.get("e2") or ""
                d = u.get("d") or u.get("pc")
                if not mid or not k1 or not k2:
                    continue
                if e1:
                    team_idx[k1] = e1
                if e2:
                    team_idx[k2] = e2
                matches.append((mid, k1, e1, k2, e2, d))
        return team_idx, matches

    def _harvest_league(self, lid, nat, agg, clan_meta):
        """Process one league: match teams, pull match stats, upsert rosters.

        Player rows of PGŽ teams are appended to *agg* and first-seen metadata
        is stored in *clan_meta*; both are keyed by
        ``(igrac, klub_id, naziv, sezona)`` and mutated in place.
        """
        naziv = nat.get("naziv") or f"natjecanje {lid}"
        naziv_lower = naziv.lower()
        spol_int = nat.get("spol", 0)
        is_zenska = ("- ž" in naziv_lower) or ("žene" in naziv_lower) or (spol_int == 1)
        spol = "Ž" if is_zenska else "M"
        kategorija = derive_kategorija(naziv)
        self.log(f"━━ Liga {lid}: {naziv} ({spol}, {kategorija})")

        team_idx, matches = self._index_matches(nat)

        pgz_team_ids = {}
        for tid, tname in team_idx.items():
            matched = self.match_team(tname, is_zenska)
            if matched:
                pgz_team_ids[tid] = matched
            else:
                self.unmatched_teams.add(f"{tname} [{spol}]")

        if not pgz_team_ids:
            self.log(" · no PGŽ teams in this league")
            return
        self.log(" ✓ PGŽ teams: " + ", ".join(
            f"{tid}:{team_idx[tid]} → klub#{kid}"
            for tid, (kid, _) in pgz_team_ids.items()))

        roster_seen = {}
        for (mid, k1, e1, k2, e2, mdate) in matches:
            if k1 not in pgz_team_ids and k2 not in pgz_team_ids:
                continue
            sezona = derive_sezona(mdate) or DEFAULT_SEZONA
            rows = self.fetch_match_stats(mid)
            if not rows:
                continue
            for r in rows:
                # rbekipa selects which side of the match the row belongs to.
                rb = r.get("rbekipa")
                if rb == 1:
                    hrs_team_id, ekipa_name = k1, e1
                elif rb == 2:
                    hrs_team_id, ekipa_name = k2, e2
                else:
                    continue
                if hrs_team_id not in pgz_team_ids:
                    continue
                klub_id, _ = pgz_team_ids[hrs_team_id]
                igrac = r.get("igrac")
                if not igrac:
                    continue
                ime = (r.get("ime") or "").strip()
                prezime = (r.get("prezime") or "").strip()

                rkey = (klub_id, hrs_team_id, sezona)
                if rkey not in roster_seen:
                    roster_seen[rkey] = (ekipa_name, {"hrs_team_id": hrs_team_id,
                                                      "natjecanje": naziv})

                pkey = (igrac, klub_id, naziv, sezona)
                agg[pkey].append(r)
                if pkey not in clan_meta:
                    clan_meta[pkey] = {
                        "ime": ime,
                        "prezime": prezime,
                        "hrs_team_id": hrs_team_id,
                        "ekipa": ekipa_name,
                        "kategorija": kategorija,
                        "spol": spol,
                        "natjecanje": naziv,
                        "lid": lid,
                    }
                self.stats["stats"] += 1
            # Small pause between match requests.
            time.sleep(0.05)

        for (klub_id, hrs_team_id, sezona), (ekipa_name, raw) in roster_seen.items():
            self.upsert_klub_roster(klub_id, hrs_team_id, ekipa_name, sezona, raw)

    def _persist_players(self, agg, clan_meta):
        """Upsert every aggregated player and stats row; return the rows written.

        A failure for one player is counted in ``self.stats["errors"]`` and
        logged; the remaining players are still processed.
        """
        upserted = 0
        for pkey, match_rows in agg.items():
            igrac, klub_id, naziv, sezona = pkey
            meta = clan_meta[pkey]
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub_id,
                    source_id=igrac,
                    ime=meta["ime"],
                    prezime=meta["prezime"],
                    source_url=f"https://hrs.hr/natjecanje/?igrac={igrac}",
                    kategorija=meta["kategorija"],
                    sezona=sezona,
                    extra={"hrs_team_id": meta["hrs_team_id"],
                           "ekipa": meta["ekipa"],
                           "spol": meta["spol"]},
                )
                self.stats["players"] += 1
                stats_dict = self._aggregate_player_stats(match_rows)
                self.upsert_stats(
                    clan_id=clan_id,
                    sezona=sezona,
                    klub_id=klub_id,
                    klub_naziv=meta["ekipa"],
                    natjecanje=naziv,
                    kategorija=meta["kategorija"],
                    stats_dict=stats_dict,
                    raw={"matches": len(match_rows), "lid": meta["lid"]},
                )
                upserted += 1
            except Exception as e:
                self.stats["errors"] += 1
                self.log(f" ❌ upsert clan {igrac}: {e}")
        return upserted

    def _notify_telegram(self, text):
        """Best-effort Telegram message; a failure is logged and never raised."""
        if not (TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID):
            return
        url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        payload = urllib.parse.urlencode(
            {"chat_id": TELEGRAM_CHAT_ID, "text": text}).encode("utf-8")
        try:
            # Supplying `data` makes this a form-encoded POST.
            req = urllib.request.Request(url, data=payload)
            with urllib.request.urlopen(req, timeout=8):
                pass
        except Exception as e:
            # Log only the error type: the request URL embeds the bot token.
            self.log(f" ⚠ telegram notify failed: {type(e).__name__}")

    def run(self, limit=999):
        """Harvest the first *limit* leagues of HRS_NATJECANJA and persist results.

        Args:
            limit: Number of league ids to process; a falsy value means all.
        """
        self.build_klub_maps()
        nat_ids = HRS_NATJECANJA[: int(limit)] if limit else HRS_NATJECANJA
        self.log(f"🤾 Starting HRS harvest. Natjecanja: {len(nat_ids)}")

        # (igrac, klub_id, naziv, sezona) → list of per-match rows / metadata
        agg = defaultdict(list)
        clan_meta = {}

        for lid in nat_ids:
            nat = self.fetch_natjecanje(lid)
            if not nat:
                continue
            self._harvest_league(lid, nat, agg, clan_meta)

        self.log(f"━━ Aggregated keys: {len(agg)}, unique players: {len({k[0] for k in agg})}")

        upserted = self._persist_players(agg, clan_meta)

        self.log(f"✅ Done. {upserted} player_stats rows. "
                 f"Stats: {self.stats}. Unmatched HRS teams: {len(self.unmatched_teams)}")
        for t in sorted(self.unmatched_teams)[:30]:
            self.log(f" unmatched: {t}")

        self._notify_telegram(
            f"🤾 HRS rukomet harvest done. Players: {self.stats['players']}, "
            f"stats rows: {upserted}, unmatched HRS teams: {len(self.unmatched_teams)}")


if __name__ == "__main__":
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 999
    HRSHarvester().run(limit=limit)