Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers

- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
2026-05-05 13:08:11 +02:00
parent 9fb512932a
commit 1d02c0897d
970 changed files with 268354 additions and 434 deletions
+484 -22
View File
@@ -1,27 +1,489 @@
#!/usr/bin/env python3
"""HRS handball harvester."""
import sys
"""
hrs_handball.py — HRS Rukomet harvester v1.0
Authors: dradulic@outlook.com / damir@rinet.one
Date: 2026-05-05
Description:
Scrapes Hrvatski rukometni savez (HRS) competition data via the
sportinfocentar2.com JSON endpoints (no HTML rendering needed):
- https://www.sportinfocentar2.com/coman/natjecanje{LID}.js
→ league JSON: lige[].utakmice[] {broj, e1, e2, k1, k2, d, ...}
- https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={MID}
→ per-match player roster + box-score stats
Fuzzy-matches HRS team names to PGŽ priority handball clubs (~71) in
pgz_sport.klubovi, then aggregates each player's per-(klub, natjecanje, sezona)
totals into pgz_sport.player_stats; upserts pgz_sport.clanovi + clan_kategorije.
Run:
python3 /opt/pgz-sport/scripts/sport_harvesters/hrs_handball.py [LIMIT_NATJECANJA]
"""
import os, sys, re, json, time, unicodedata
import urllib.request
import urllib.error
from datetime import datetime, date
from collections import defaultdict
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
from __base import SportHarvester # noqa: E402
# NOTE(review): a second ``HRSHarvester`` class is defined later in this file and
# rebinds the name, so this earlier definition looks like a superseded leftover
# of a previous implementation — confirm and remove.
class HRSHarvester(SportHarvester):
    # Sport / source identifiers; presumably consumed by the SportHarvester base
    # class (not visible here) — verify against the base.
    SPORT = 'rukomet'
    SOURCE = 'hrs'

    def scrape_klub(self, page, klub):
        # Build a hrs.hr search URL from the club name (spaces replaced by '+').
        # The value is assigned but not used within the lines visible here.
        url = f"https://hrs.hr/?s={klub['naziv'].replace(' ','+')}"
        # Log which club (id + name) is being processed.
        self.log(f" 🤾 Klub {klub['id']} {klub['naziv']}")
import psycopg2
from psycopg2.extras import RealDictCursor
# ─── HRS league IDs (HRS top menu, 2025/26) ────────────────────────────────
# Each id is interpolated into the natjecanje{LID}.js URL by fetch_natjecanje().
# Order matters: run() takes a prefix of this list when a limit is given.
HRS_NATJECANJA = [
    # Seniors, men
    1632,  # Paket24 Premijer liga (M)
    1633,  # 1. HRL Sjever - M
    1634,  # 1. HRL Jug - M
    1639,  # 2. HRL Istok - M
    1641,  # 2. HRL Zapad - M ★ PGŽ
    1642,  # 2. HRL Sjever - M
    1643,  # 2. HRL Jug - M
    1675,  # 3. HRL Istok - M
    1676,  # 3. HRL Sjever - M
    1677,  # 3. HRL Središte - M
    1678,  # 3. HRL Zapad - M ★ PGŽ
    1384,  # Međužupanijska liga (inter-county league)
    # Seniors, women
    1629,  # 1. HRL Žene
    1637,  # 2. HRL Sjever - Ž
    1638,  # 2. HRL Zapad - Ž ★ PGŽ
    1644,  # 2. HRL Jug - Ž
    1671,  # 3. HRL Sjever - Ž
    1672,  # 3. HRL Zapad - Ž ★ PGŽ
    1673,  # 3. HRL Središte - Ž
    1674,  # 3. HRL Istok - Ž
    # Youth, men
    1389,  # 1. HRL U18 - M
    1705,  # 1. HRL U17 - M
    1763,  # 2. HRL U17 - M
    1706,  # 1. HRL U15 - M
    1716,  # 2. HRL U15 - M
    1707,  # 1. HRL U13 - M
    1717,  # 2. HRL U13 - M
    1746,  # 1. HRL U12 - M
    1709,  # 1. HRL U11 - M
    # Additional (links taken from the sidebar — used only if they return a
    # natjecanjeobjekt payload)
    1620, 1622, 1625, 1626, 1645, 1646,
    1761, 1762, 1773, 1753,
    1774, 1776, 1777, 1783, 1784, 1785, 1786, 1787, 1788,
    1796, 1797, 1818, 1834,
    1765, 1766,
    # Cups
    1092, 1302, 1303, 1441,
]
# Browser-like User-Agent sent with every request made by http_text().
UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")


def http_text(url, timeout=20, retries=2):
    """Plain HTTP GET → text, with a small retry on transient errors.

    sportinfocentar2 files are mostly UTF-8 but occasionally contain stray cp1250
    bytes (e.g. typographic quotes from Word), so a strict UTF-8 decode can fail.
    Strategy: strict UTF-8 first; on failure fall back to UTF-8 with replacement
    characters (keeps the bulk of the file Unicode-correct rather than
    re-decoding everything as latin-1).

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds, passed to ``urlopen``.
        retries: Number of additional attempts after the first one.

    Returns:
        The response body decoded to ``str``.

    Raises:
        RuntimeError: When every attempt failed; the last underlying error is
            included in the message and chained as ``__cause__``.
    """
    last = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
        except urllib.error.HTTPError as e:
            last = e
            # A 4xx answer (other than 408/429) is definitive — retrying with
            # backoff would only slow the harvest down for ids that do not exist.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                break
        except OSError as e:
            # URLError, TimeoutError, socket.timeout and connection resets
            # raised while reading the body are all OSError subclasses.
            last = e
        else:
            try:
                return raw.decode("utf-8")
            except UnicodeDecodeError:
                return raw.decode("utf-8", errors="replace")
        if attempt < retries:
            # Linear backoff: 1.5 s, 3.0 s, ...
            time.sleep(1.5 * (attempt + 1))
    raise RuntimeError(f"GET {url} failed: {last}") from last
# Bare identifier used as an object key: `{foo:` or `, foo:`.
_UNQUOTED_KEY_RE = re.compile(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
# Trailing comma before a closing brace/bracket.
_TRAIL_COMMA_RE = re.compile(r',(\s*[}\]])')
# Number written with leading zeros (e.g. `018`), which JSON rejects.
_LEADING_ZERO_RE = re.compile(r'([\s:,\[])0+(\d)')
# A complete double-quoted string literal, honouring backslash escapes.
_STRING_LITERAL_RE = re.compile(r'("(?:\\.|[^"\\])*")')


def _normalize_lazy_json_code(fragment):
    """Rewrite the non-JSON syntax in a fragment that lies outside string literals."""
    fragment = _UNQUOTED_KEY_RE.sub(r'\1"\2":', fragment)
    fragment = _LEADING_ZERO_RE.sub(r'\1\2', fragment)
    return _TRAIL_COMMA_RE.sub(r'\1', fragment)


def parse_var_json(body, var_prefix):
    """Strip the ``var <name> = `` wrapper and parse the lazy-JSON payload.

    The dialect handled here has unquoted keys, numbers with leading zeros and
    trailing commas. Those rewrites are applied only to the text *between*
    double-quoted string literals, so values such as ``"2025-09-05 18:05"`` or
    ``"2. HRL, Zapad: M"`` are passed through untouched.

    Args:
        body: Raw response text.
        var_prefix: Variable name of the wrapper (matched case-insensitively).

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: When the payload is still not valid JSON after
            normalization.
    """
    body = body.strip()
    # Both forms occur: `var foo = ...` (coman/) and bare `foo = ...` (ziceri/).
    m = re.match(rf"^\s*(?:var\s+)?{re.escape(var_prefix)}\s*=\s*", body, re.I)
    if m:
        body = body[m.end():]
    body = body.rstrip().rstrip(";").rstrip()
    # re.split with one capturing group alternates: even indexes are code
    # fragments, odd indexes are the string literals themselves.
    pieces = _STRING_LITERAL_RE.split(body)
    pieces[::2] = [_normalize_lazy_json_code(p) for p in pieces[::2]]
    return json.loads("".join(pieces))
def derive_sezona(d):
    """Derive the Croatian sport season from a calendar date.

    Dates in July–December belong to ``YYYY/YYYY+1``; dates in January–June
    belong to ``YYYY-1/YYYY``.

    Args:
        d: A ``date``/``datetime``, or a string starting with ``YYYY-MM-DD``.

    Returns:
        The season as ``"YYYY/YYYY"``, or ``None`` when ``d`` is empty, not
        parseable, or of an unsupported type.
    """
    if not d:
        return None
    if isinstance(d, str):
        try:
            d = datetime.strptime(d[:10], "%Y-%m-%d").date()
        except ValueError:
            return None
    elif not isinstance(d, date):
        # e.g. a numeric field from the feed — there is no year/month to read.
        return None
    start_year = d.year if d.month >= 7 else d.year - 1
    return f"{start_year}/{start_year + 1}"
def derive_kategorija(naziv):
    """Translate a competition name into its handball age category.

    The lower-cased name is searched for ``u<age>`` / ``u-<age>`` markers in a
    fixed order; the first hit decides. Names without a marker are ``"seniori"``.
    """
    lowered = (naziv or "").lower()
    age_groups = (
        ("11", "mini U11"),
        ("12", "mini U12"),
        ("13", "dječaci U13"),
        ("15", "mlađi kadeti U15"),
        ("17", "kadeti U17"),
        ("18", "juniori U18"),
    )
    for age, label in age_groups:
        if f"u{age}" in lowered or f"u-{age}" in lowered:
            return label
    return "seniori"
# ─── Klub-name normalization for fuzzy match ──────────────────────────────
# Croatian diacritics → ASCII (upper-case forms are folded to lower-case).
_DIA = str.maketrans("čćžšđČĆŽŠĐ", "cczsdcczsd")
# Leading organisational words / abbreviations. The abbreviations carry a word
# boundary so that a name merely *starting* with those letters (e.g.
# "Mrkopalj") is not truncated.
_PREFIX_RE = re.compile(
    r"^(?:"
    r"hrvatski\s+|muski\s+|zenski\s+|"
    r"rukometni\s+(?:klub|savez)\s+|"
    r"(?:hrk|mrk|zrk|rk)\b"
    r")\s*",
    re.I,
)
# Trailing " - <words>" part (typically a location).
_TRAIL_LOC_RE = re.compile(r"\s*-\s*[a-z][a-z\s]*$", re.I)
# Trailing second-team marker.
_SUFFIX_2_RE = re.compile(r"\s+(?:ii|2)\s*$", re.I)
# "<n>. u ..." tail.
_NUMERIC_LIGA_RE = re.compile(r"\d+\.\s+u\s+.*$", re.I)
# Any parenthesised remark.
_PAREN_RE = re.compile(r"\([^)]*\)")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_klub_name(name):
    """Aggressively normalize a Croatian handball club name to a comparable token.

    Steps, in order: drop parenthesised remarks and the "<n>. u ..." tail, fold
    diacritics, lower-case, strip every leading organisational prefix, strip a
    trailing II/2 marker and a trailing " - location" part, then collapse all
    remaining non-alphanumerics to single spaces.

    Args:
        name: Club name as found in the database or in the HRS feed.

    Returns:
        The normalized, space-separated token string; ``""`` for empty input.
    """
    if not name:
        return ""
    s = str(name).strip()
    s = _PAREN_RE.sub(" ", s)
    s = _NUMERIC_LIGA_RE.sub("", s)
    s = s.translate(_DIA)
    s = s.lower()
    # Prefixes can stack ("Ženski rukometni klub ..."), so strip until stable.
    while True:
        new = _PREFIX_RE.sub("", s)
        if new == s:
            break
        s = new
    s = _SUFFIX_2_RE.sub("", s)
    s = _TRAIL_LOC_RE.sub("", s)
    s = _NON_ALNUM_RE.sub(" ", s).strip()
    return s
def is_team_2nd(name):
    """Tell whether a team name ends in a whitespace-separated ``II`` or ``2``."""
    if not name:
        return False
    cleaned = name.strip().lower()
    return re.search(r"\s(?:ii|2)\s*$", cleaned) is not None
# Rows that consist of a club abbreviation only, optionally followed by one of
# a few known placeholder bodies. The name is lower-cased but NOT
# diacritic-folded before matching, so both spellings of the abbreviation
# (zrk/žrk) and of PŠR (psr/pšr) are listed explicitly.
_JUNK_KLUB_RE = re.compile(
    r"\s*(?:rk|[zž]rk|mrk|hrk)\s*"
    r"(?:rk|p[sš]r(?: selce)?|liburnija|mornar|omisalj)?\s*"
)


def is_pgz_klub_candidate(naziv):
    """Filter out savezi / udruge / zborovi / clearly non-club rows.

    Args:
        naziv: Club name from the database.

    Returns:
        ``False`` for federations, associations, referee/coach bodies and for
        junk rows such as 'RK RK' or 'RK PŠR' (no real name body); ``True``
        otherwise.
    """
    n = (naziv or "").lower()
    if any(marker in n for marker in ("savez", "udruga", "zbor", "trener")):
        return False
    if _JUNK_KLUB_RE.fullmatch(n):
        return False
    return True
def is_zenska_klub(naziv):
    """Tell whether a club name denotes a women's club.

    Recognised markers: the prefixes "ženski"/"zenski", "ž ", "ž.", and the
    abbreviation "ŽRK"/"ZRK" followed by a space, or the word "žene"/"zene"
    after a space anywhere in the name. Matching is case-insensitive.
    """
    n = (naziv or "").strip().lower()
    # "žrk " is listed next to "zrk " because lower-casing keeps the diacritic.
    prefixes = ("ženski", "zenski", "ž ", "ž.", "žrk ", "zrk ")
    return n.startswith(prefixes) or " žene" in n or " zene" in n
# ─── Harvester ─────────────────────────────────────────────────────────────
class HRSHarvester(SportHarvester):
    """Harvest HRS handball box scores for PGŽ priority clubs.

    Flow of :meth:`run`: build normalized-name → club maps from the database,
    fetch each league JSON, match league teams to clubs, fetch per-match player
    rows for matches involving a matched team, then upsert rosters, players and
    aggregated per-(player, club, competition, season) statistics.

    ``self.conn``, ``self.log``, ``self.stats``, ``upsert_clan`` and
    ``upsert_stats`` come from ``SportHarvester`` (not visible here).
    """

    SPORT = "rukomet"
    SOURCE = "hrs"

    def __init__(self):
        super().__init__()
        # normalized club name -> (klub_id, original naziv), men / women
        self.team_to_klub_m = {}
        self.team_to_klub_z = {}
        # "<team name> [M|Ž]" labels of feed teams with no club match
        self.unmatched_teams = set()

    # Override base — base filters financiran||u_godisnjaku (only 3 handball rows).
    # Brief mandates ALL 71 PGŽ priority handball clubs.
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs of this sport, ordered by id."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, naziv, sport
                FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def build_klub_maps(self):
        """Fill the men's / women's normalized-name maps from the target clubs."""
        rows = self.get_target_klubovi(999)
        for r in rows:
            naziv = r["naziv"]
            if not is_pgz_klub_candidate(naziv):
                continue
            norm = normalize_klub_name(naziv)
            if not norm:
                continue
            target = self.team_to_klub_z if is_zenska_klub(naziv) else self.team_to_klub_m
            cur = target.get(norm)
            # On a name collision keep the lowest club id.
            if cur is None or r["id"] < cur[0]:
                target[norm] = (r["id"], naziv)
        self.log(f"klub maps: men={len(self.team_to_klub_m)} women={len(self.team_to_klub_z)}")

    def match_team(self, hrs_team_name, is_zenska_liga):
        """Match a feed team name to a club: direct → token-subset → ``None``.

        Tokens come from normalize_klub_name.

        Args:
            hrs_team_name: Team name from the league JSON.
            is_zenska_liga: Selects the women's map when true, else the men's.

        Returns:
            ``(klub_id, naziv)`` of the matched club, or ``None``.
        """
        if not hrs_team_name:
            return None
        candidates = [hrs_team_name]
        if is_team_2nd(hrs_team_name):
            # is_team_2nd() tests the lower-cased name, so the strip must be
            # case-insensitive too ("Zamet II" as well as "zamet ii").
            candidates.append(
                re.sub(r"\s+(?:ii|2)\s*$", "", hrs_team_name, flags=re.I).strip())
        m = self.team_to_klub_z if is_zenska_liga else self.team_to_klub_m
        for c in candidates:
            n = normalize_klub_name(c)
            if not n:
                continue
            if n in m:
                return m[n]
            n_tokens = set(n.split())
            if not n_tokens:
                continue
            best = None
            for k_norm, (kid, kname) in m.items():
                k_tokens = set(k_norm.split())
                if not k_tokens:
                    continue
                # token-subset match in either direction
                if not (n_tokens <= k_tokens or k_tokens <= n_tokens):
                    continue
                shared = n_tokens & k_tokens
                # Require at least one shared token of length ≥ 4 to avoid noise like {"rk"}
                if not any(len(t) >= 4 for t in shared):
                    continue
                # Prefer lowest klub_id (canonical row, not yearbook duplicate)
                if best is None or kid < best[0]:
                    best = (kid, kname)
            if best:
                return best
        return None

    # ─── HRS endpoints ─────────────────────────────────────────────────────
    def fetch_natjecanje(self, lid):
        """Fetch and parse the league JSON for ``lid``; ``None`` on any failure."""
        url = f"https://www.sportinfocentar2.com/coman/natjecanje{lid}.js"
        try:
            body = http_text(url, timeout=20)
            return parse_var_json(body, "natjecanjeobjekt")
        except Exception as e:
            self.log(f" ⚠ fetch_natjecanje({lid}): {e}")
            return None

    def fetch_match_stats(self, mid):
        """Fetch and parse the per-match player rows; ``None`` when unavailable."""
        url = f"https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={mid}"
        try:
            body = http_text(url, timeout=15)
            stripped = body.strip()
            # A "not authorized" text or a body starting with `//` is treated as
            # "no data" rather than as a payload.
            if "not authorized" in stripped.lower() or stripped.startswith("//"):
                return None
            return parse_var_json(body, "tab128")
        except Exception as e:
            self.log(f" ⚠ fetch_match({mid}): {e}")
            return None

    # ─── Aggregation & upserts ─────────────────────────────────────────────
    @staticmethod
    def _aggregate_player_stats(rows):
        """Sum one player's per-match rows into season totals.

        Each row counts as one appearance; missing or falsy fields count as 0.
        """
        out = defaultdict(int)
        for r in rows:
            out["nastupi"] += 1
            out["golovi"] += int(r.get("sutd") or 0)
            out["asistencije"] += int(r.get("asistencija") or 0)
            out["zuti"] += int(r.get("zutih") or 0)
            out["crveni"] += int(r.get("crvenih") or 0)
        return dict(out)

    def upsert_klub_roster(self, klub_id, hrs_team_id, ekipa, sezona, raw):
        """Insert or refresh one klub_roster row; failures are logged, not raised."""
        # NOTE(review): no rollback after a failed statement — whether later
        # statements on self.conn still work depends on how the base class
        # configures the connection (autocommit?). Confirm.
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO pgz_sport.klub_roster
                    (klub_id, sport, source, source_id, source_url, ekipa, sezona, raw_data)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                    ON CONFLICT (klub_id, source, source_id, ekipa, sezona) DO UPDATE
                    SET raw_data = EXCLUDED.raw_data, scraped_at = now()
                """, (klub_id, self.SPORT, self.SOURCE, str(hrs_team_id),
                      f"https://hrs.hr/natjecanje/?ekipa={hrs_team_id}",
                      ekipa, sezona, json.dumps(raw)))
        except Exception as e:
            self.log(f" ⚠ upsert_klub_roster: {e}")

    @staticmethod
    def _index_league(nat):
        """Collect team names and playable matches from one league payload.

        Returns:
            ``(team_idx, matches)`` where ``team_idx`` maps feed team id → name
            and ``matches`` is a list of ``(mid, k1, e1, k2, e2, date)`` tuples.
        """
        team_idx = {}
        matches = []
        for liga in (nat.get("lige") or []):
            for u in (liga.get("utakmice") or []):
                mid = u.get("broj")
                k1, k2 = u.get("k1"), u.get("k2")
                e1, e2 = u.get("e1") or "", u.get("e2") or ""
                d = u.get("d") or u.get("pc")
                if not mid or not k1 or not k2:
                    continue
                if e1:
                    team_idx[k1] = e1
                if e2:
                    team_idx[k2] = e2
                matches.append((mid, k1, e1, k2, e2, d))
        return team_idx, matches

    def _notify_telegram(self, upserted):
        """Best-effort completion message via the Telegram Bot API.

        Credentials are read from the ``HRS_TELEGRAM_BOT_TOKEN`` and
        ``HRS_TELEGRAM_CHAT_ID`` environment variables instead of being embedded
        in the source (and exposed on a curl command line). The token that used
        to be hard-coded here should be considered leaked and be rotated.
        Any failure is logged and never propagates.
        """
        token = os.environ.get("HRS_TELEGRAM_BOT_TOKEN")
        chat_id = os.environ.get("HRS_TELEGRAM_CHAT_ID")
        if not token or not chat_id:
            self.log(" · Telegram notification skipped "
                     "(HRS_TELEGRAM_BOT_TOKEN / HRS_TELEGRAM_CHAT_ID not set)")
            return
        try:
            from urllib.parse import urlencode
            text = (f"🤾 HRS rukomet harvest done. Players: {self.stats['players']}, "
                    f"stats rows: {upserted}, unmatched HRS teams: {len(self.unmatched_teams)}")
            payload = urlencode({"chat_id": chat_id, "text": text}).encode("utf-8")
            req = urllib.request.Request(
                f"https://api.telegram.org/bot{token}/sendMessage", data=payload)
            with urllib.request.urlopen(req, timeout=8):
                pass
        except Exception as e:
            self.log(f" ⚠ telegram notify: {e}")

    # ─── Main run ──────────────────────────────────────────────────────────
    def run(self, limit=999):
        """Harvest up to ``limit`` leagues from HRS_NATJECANJA and persist results.

        Args:
            limit: Number of league ids to take from the front of
                HRS_NATJECANJA; a falsy value means all of them.
        """
        self.build_klub_maps()
        nat_ids = HRS_NATJECANJA[: int(limit)] if limit else HRS_NATJECANJA
        self.log(f"🤾 Starting HRS harvest. Natjecanja: {len(nat_ids)}")
        # (igrac, klub_id, natjecanje, sezona) -> list of per-match rows
        agg = defaultdict(list)
        # same key -> metadata captured from the first row seen
        clan_meta = {}
        for lid in nat_ids:
            nat = self.fetch_natjecanje(lid)
            # Remote payload: skip anything that is not a non-empty object.
            if not nat or not isinstance(nat, dict):
                continue
            naziv = nat.get("naziv") or f"natjecanje {lid}"
            spol_int = nat.get("spol", 0)
            is_zenska = ("- ž" in naziv.lower()) or ("žene" in naziv.lower()) or (spol_int == 1)
            kategorija = derive_kategorija(naziv)
            self.log(f"━━ Liga {lid}: {naziv} ({'Ž' if is_zenska else 'M'}, {kategorija})")
            team_idx, matches = self._index_league(nat)
            pgz_team_ids = {}
            for tid, tname in team_idx.items():
                m = self.match_team(tname, is_zenska)
                if m:
                    pgz_team_ids[tid] = m
                else:
                    self.unmatched_teams.add(f"{tname} [{ 'Ž' if is_zenska else 'M' }]")
            if not pgz_team_ids:
                self.log(f" · no PGŽ teams in this league")
                continue
            self.log(" ✓ PGŽ teams: " + ", ".join(
                f"{tid}:{team_idx[tid]} → klub#{kid}"
                for tid, (kid, _) in pgz_team_ids.items()))
            roster_seen = {}
            for (mid, k1, e1, k2, e2, mdate) in matches:
                # Only matches involving at least one matched club are fetched.
                if k1 not in pgz_team_ids and k2 not in pgz_team_ids:
                    continue
                sezona = derive_sezona(mdate) or "2025/2026"
                rows = self.fetch_match_stats(mid)
                if not rows:
                    continue
                for r in rows:
                    if not isinstance(r, dict):
                        continue
                    # rbekipa selects the side: 1 → first team, 2 → second team.
                    rb = r.get("rbekipa")
                    if rb == 1:
                        hrs_team_id, ekipa_name = k1, e1
                    elif rb == 2:
                        hrs_team_id, ekipa_name = k2, e2
                    else:
                        continue
                    if hrs_team_id not in pgz_team_ids:
                        continue
                    klub_id, klub_naziv = pgz_team_ids[hrs_team_id]
                    igrac = r.get("igrac")
                    if not igrac:
                        continue
                    ime = (r.get("ime") or "").strip()
                    prezime = (r.get("prezime") or "").strip()
                    rkey = (klub_id, hrs_team_id, sezona)
                    if rkey not in roster_seen:
                        roster_seen[rkey] = (ekipa_name,
                                             {"hrs_team_id": hrs_team_id, "natjecanje": naziv})
                    pkey = (igrac, klub_id, naziv, sezona)
                    agg[pkey].append(r)
                    if pkey not in clan_meta:
                        clan_meta[pkey] = {
                            "ime": ime, "prezime": prezime,
                            "hrs_team_id": hrs_team_id, "ekipa": ekipa_name,
                            "kategorija": kategorija,
                            "spol": "Ž" if is_zenska else "M",
                            "natjecanje": naziv, "lid": lid,
                        }
                    # NOTE(review): nesting inferred — counted once per accepted
                    # player row; confirm against the original layout.
                    self.stats["stats"] += 1
                # Small pause between match requests.
                time.sleep(0.05)
            for (klub_id, hrs_team_id, sezona), (ekipa_name, raw) in roster_seen.items():
                self.upsert_klub_roster(klub_id, hrs_team_id, ekipa_name, sezona, raw)
        self.log(f"━━ Aggregated keys: {len(agg)}, unique players: {len({k[0] for k in agg})}")
        upserted = 0
        for (igrac, klub_id, naziv, sezona), match_rows in agg.items():
            meta = clan_meta[(igrac, klub_id, naziv, sezona)]
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub_id,
                    source_id=igrac,
                    ime=meta["ime"], prezime=meta["prezime"],
                    source_url=f"https://hrs.hr/natjecanje/?igrac={igrac}",
                    kategorija=meta["kategorija"],
                    sezona=sezona,
                    extra={"hrs_team_id": meta["hrs_team_id"],
                           "ekipa": meta["ekipa"], "spol": meta["spol"]},
                )
                self.stats["players"] += 1
                stats_dict = self._aggregate_player_stats(match_rows)
                self.upsert_stats(
                    clan_id=clan_id, sezona=sezona,
                    klub_id=klub_id, klub_naziv=meta["ekipa"],
                    natjecanje=naziv, kategorija=meta["kategorija"],
                    stats_dict=stats_dict,
                    raw={"matches": len(match_rows), "lid": meta["lid"]},
                )
                upserted += 1
            except Exception as e:
                self.stats["errors"] += 1
                self.log(f" ❌ upsert clan {igrac}: {e}")
        self.log(f"✅ Done. {upserted} player_stats rows. "
                 f"Stats: {self.stats}. Unmatched HRS teams: {len(self.unmatched_teams)}")
        for t in sorted(self.unmatched_teams)[:30]:
            self.log(f" unmatched: {t}")
        self._notify_telegram(upserted)
# Script entry point: optional first CLI argument caps the number of leagues.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else 999
    HRSHarvester().run(limit=limit)