Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers

- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
2026-05-05 13:08:11 +02:00
parent 9fb512932a
commit 1d02c0897d
970 changed files with 268354 additions and 434 deletions
+262 -42
View File
@@ -1,54 +1,274 @@
#!/usr/bin/env python3
"""HVS waterpolo harvester."""
import sys, re
# Name: hvs_waterpolo.py
# Version: 2.0
# Author: Damir Radulić <dradulic@outlook.com> / damir@rinet.one
# Date: 2026-05-05
# Description: HVS (hvs.hr) water polo harvester.
# Blunt assessment: the HVS website does NOT publish per-club rosters/stats
# through any indexable channel — /klub-{slug}/ returns 404, /klubovi/{id}/
# returns "Pojavila se kritična greška" ("A critical error occurred"),
# /igrac-{slug}/ returns 404, and /kategorija/{id}/ shows only the season
# navigation. The only usable source is the wp-json REST API:
# /wp/v2/klubovi (20 clubs + ACF.history)
# /wp/v2/clanovi (37 federation officials with biographies that most
# often mention their club career)
# This harvester:
# 1. Reads wp-json klubovi → maps PGŽ clubs (source_url + metadata)
# 2. Reads wp-json clanovi → upserts all of them, and additionally links
# those whose biography contains the name of a PGŽ club (heuristic).
# 3. Attempts a Playwright fallback on /klub-{slug}/ only if the page
# actually has ".profile-header__name" in the DOM (gracefully skips
# when HVS returns 404/error).
import sys, re, json, time, urllib.request
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
from psycopg2.extras import RealDictCursor
# Base address of the HVS site and the two wp-json collections read by the
# harvester (clubs and federation members).
HVS_BASE = "https://hvs.hr"
WP_KLUB_API = "https://hvs.hr/wp-json/wp/v2/klubovi"
WP_CLANOV_API = "https://hvs.hr/wp-json/wp/v2/clanovi"

# Distinctive name tokens used when matching clubs and member biographies.
# Kept as a list in the original order; callers build sets from it as needed.
KEYWORDS = [
    "primorje",
    "opatija",
    "jadran",
    "losinj",
    "palada",
    "silo",
    "crikvenica",
    "orka",
    "bura",
    "posk",
    "victoria",
    "kostrena",
    "njivice",
    "rijeka",
]
def _fetch_paginated(url, log, per_page=100, max_pages=19):
    """Fetch all pages of a wp-json collection.

    Args:
        url: Collection endpoint. May already carry a query string; the
            paging parameters are then appended with ``&`` instead of ``?``.
        log: Callable taking one string; receives a message on any failure.
        per_page: Page size requested from the API (default 100).
        max_pages: Upper bound on the number of pages requested (default 19,
            the same cap the previous hard-coded ``range(1, 20)`` imposed).

    Returns:
        A list with the items of every page fetched before the first short
        page, empty page, error, or non-list payload. Never raises for
        network/JSON errors; they are logged and end the pagination.
    """
    out = []
    sep = "&" if "?" in url else "?"
    for page in range(1, max_pages + 1):
        u = f"{url}{sep}per_page={per_page}&page={page}"
        try:
            req = urllib.request.Request(u, headers={"User-Agent": "Mozilla/5.0 PGZ-Sport"})
            with urllib.request.urlopen(req, timeout=15) as r:
                data = json.loads(r.read().decode("utf-8"))
        except Exception as e:
            # Best-effort harvest: keep what was collected so far.
            log(f" wp-json {u} err: {e}")
            break
        if not isinstance(data, list):
            # A JSON object here is an API error document, not a page of
            # records; extending with it would inject its keys as "items".
            log(f" wp-json {u} err: unexpected payload type {type(data).__name__}")
            break
        if not data:
            break
        out.extend(data)
        if len(data) < per_page:
            # Short page means this was the last one.
            break
    return out
class HVSHarvester(SportHarvester):
    """HVS (hvs.hr) water polo harvester driven by the site's wp-json API.

    For every target club it (1) tries to match the club against the wp-json
    ``klubovi`` collection and stores the link/metadata, and (2) upserts the
    wp-json ``clanovi`` entries whose biography or name mentions one of the
    club's distinctive KEYWORDS tokens.

    A legacy Playwright roster scrape is kept for callers that pass a real
    ``page`` to ``scrape_klub``; ``run()`` passes ``None`` so it is skipped.

    The end-of-run Telegram notification reads its credentials from the
    environment variables named by ``TELEGRAM_TOKEN_ENV`` and
    ``TELEGRAM_CHAT_ENV``; when either is unset the notification is skipped.
    """

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # Environment variable names for the Telegram credentials. Secrets must
    # come from the environment, never from the source tree.
    TELEGRAM_TOKEN_ENV = "PGZ_TELEGRAM_BOT_TOKEN"
    TELEGRAM_CHAT_ENV = "PGZ_TELEGRAM_CHAT_ID"

    # ------------- target list (override base, return all 28) ----------------
    def get_target_klubovi(self, limit=999):
        """Return up to *limit* rows of the priority-club view for SPORT."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    # ------------- normalize / match helpers ---------------------------------
    def _core_slug(self, name):
        """Reduce a club name to a slug without generic club/sport words.

        Returns "" for an empty name. The final slug comes from
        ``self.slugify`` with leading/trailing dashes stripped.
        """
        if not name:
            return ""
        s = name.lower()
        s = re.sub(r"\[merged[^\]]*\]", " ", s)
        s = re.sub(r"\([^)]*\)", " ", s)
        s = re.sub(r"\b(vaterpolo|vaterpolski|amaterski|žvk|zvk|vk|hšk|hsk)\b", " ", s)
        s = re.sub(r"\bklub\b", " ", s)
        s = re.sub(r"\bsavez\b.*$", " ", s)
        s = re.sub(r"-(muška|ženska)\s*ekipa", " ", s)
        s = re.sub(r"-erste\s*banka?a?", " ", s)
        s = self.slugify(s)
        return s.strip("-")

    def _tokens(self, name):
        """Return the set of core-slug parts longer than two characters."""
        return {t for t in self._core_slug(name).split("-") if len(t) > 2}

    def _match_klub(self, pgz_naziv, hvs_list):
        """Pick the HVS club entry that corresponds to *pgz_naziv*.

        Args:
            pgz_naziv: Club name from the PGŽ database.
            hvs_list: Dicts with a "title" key (see ``_fetch_hvs_klubovi``).

        Returns:
            The first entry whose core slug equals the target's; otherwise
            the entry sharing the most tokens (at least two) that does not
            add a KEYWORDS token missing from the target; otherwise None.
        """
        target_core = self._core_slug(pgz_naziv)
        target_tokens = self._tokens(pgz_naziv)
        if not target_tokens:
            return None
        # Pass 1: exact core-slug equality wins outright.
        for h in hvs_list:
            if self._core_slug(h["title"]) == target_core:
                return h
        # Pass 2: best token overlap; the first candidate wins ties.
        keywords = set(KEYWORDS)
        best, best_score = None, 0
        for h in hvs_list:
            ht = self._tokens(h["title"])
            shared = target_tokens & ht
            if len(shared) < 2:
                continue
            # An extra distinctive keyword on the HVS side signals a
            # different club that merely shares generic words.
            if (ht - target_tokens) & keywords:
                continue
            if len(shared) > best_score:
                best_score = len(shared)
                best = h
        return best

    # ------------- wp-json fetch + simplify ----------------------------------
    @staticmethod
    def _clean_title(item):
        """Return the item's rendered title, stripped, with entities cleaned.

        "&#8211;" is removed, "&#8217;" becomes "'", and "&amp;" becomes "&".
        A missing/None title or rendered value yields "".
        """
        rendered = (item.get("title") or {}).get("rendered") or ""
        return (rendered.strip()
                .replace("&#8211;", "")
                .replace("&#8217;", "'")
                .replace("&amp;", "&"))

    def _fetch_hvs_klubovi(self):
        """Download the wp-json klubovi collection as simplified dicts."""
        out = []
        for k in _fetch_paginated(WP_KLUB_API, self.log):
            acf = k.get("ACF") or {}
            out.append({
                "wp_id": k.get("id"),
                "club_id": acf.get("club_id"),
                "title": self._clean_title(k),
                "link": k.get("link"),
                "slug": k.get("slug"),
                "history": acf.get("history") or "",
            })
        return out

    def _fetch_hvs_clanovi(self):
        """Download the wp-json clanovi collection as simplified dicts."""
        out = []
        for c in _fetch_paginated(WP_CLANOV_API, self.log):
            acf = c.get("ACF") or {}
            out.append({
                "wp_id": c.get("id"),
                # ACF name takes precedence; the post title is the fallback.
                "name": acf.get("name") or self._clean_title(c),
                "image": acf.get("image") or "",
                "birth_date": acf.get("birth_date") or "",
                "birth_place": acf.get("birth_place") or "",
                "position": acf.get("position") or "",
                "bio": acf.get("bio") or "",
                "slug": c.get("slug"),
                "link": c.get("link"),
            })
        return out

    # ------------- DB persistence helpers ------------------------------------
    def _persist_klub_link(self, klub_id, hvs):
        """Store the HVS link and ids of a matched club on the klubovi row.

        NOTE(review): no commit is issued here; presumably the base class or
        the connection's autocommit setting handles that — confirm.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s,
                metadata = COALESCE(metadata, '{}'::jsonb) ||
                jsonb_build_object('hvs_wp_id', %s::int,
                'hvs_club_id', %s,
                'hvs_title', %s),
                updated_at = now()
                WHERE id = %s
            """, (hvs["link"], hvs["wp_id"], hvs.get("club_id"), hvs["title"], klub_id))

    def _split_name(self, full):
        """Split a full name into (first word, remainder) after collapsing
        whitespace; returns ("", "") for empty input."""
        full = re.sub(r"\s+", " ", (full or "")).strip()
        if not full:
            return "", ""
        parts = full.split(" ", 1)
        return parts[0], (parts[1] if len(parts) > 1 else "")

    def _insert_official(self, clan_data, klub_id):
        """Upsert one wp-json member as a member of club *klub_id*.

        Returns whatever ``self.upsert_clan`` returns, or None when the
        member has no usable name.
        """
        ime, prezime = self._split_name(clan_data["name"])
        if not ime:
            return None
        extra = {
            "image": clan_data.get("image", ""),
            "birth_date": clan_data.get("birth_date", ""),
            "birth_place": clan_data.get("birth_place", ""),
            "position": clan_data.get("position", ""),
            # Biographies are truncated to 8000 characters before storage.
            "bio": (clan_data.get("bio") or "")[:8000],
            "hvs_role": "federation_official_or_staff",
        }
        kategorija = clan_data.get("position") or "stručna funkcija"
        return self.upsert_clan(
            klub_id=klub_id,
            source_id=str(clan_data["wp_id"]),
            ime=ime, prezime=prezime,
            source_url=clan_data["link"],
            kategorija=kategorija,
            sezona=None,
            extra=extra,
        )

    # ------------- klub-level orchestration ----------------------------------
    def scrape_klub(self, page, klub):
        """Harvest one club and then emit a liveness heartbeat.

        Args:
            page: Playwright page, or None to skip the legacy roster scrape.
            klub: Row with at least "id" and "naziv".

        The heartbeat runs in ``finally`` so it is emitted for every club
        attempt, including early returns and failures; exceptions from the
        harvest itself still propagate to the caller.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        try:
            self._harvest_klub(page, klub)
        finally:
            self._heartbeat()

    def _harvest_klub(self, page, klub):
        """Link the club via wp-json, attach matching members, and run the
        legacy Playwright roster scrape when a page is supplied."""
        # The two wp-json collections are fetched once per harvester instance.
        if not hasattr(self, "_hvs_klubovi"):
            self._hvs_klubovi = self._fetch_hvs_klubovi()
            self.log(f" 📡 wp-json klubovi loaded: {len(self._hvs_klubovi)}")
        if not hasattr(self, "_hvs_clanovi"):
            self._hvs_clanovi = self._fetch_hvs_clanovi()
            self.log(f" 📡 wp-json clanovi loaded: {len(self._hvs_clanovi)}")

        match = self._match_klub(klub['naziv'], self._hvs_klubovi)
        if match:
            self.log(f" ✅ wp-json match → {match['title']} ({match['link']})")
            self._persist_klub_link(klub['id'], match)
        else:
            self.log(f" 🟡 no wp-json klub match (HVS exposes only 20 klubova)")

        # Insert federation officials whose bio mentions any distinctive token
        # of this PGŽ klub. This is the only way HVS surfaces person-level data.
        klub_tokens = {t for t in self._tokens(klub['naziv']) if t in KEYWORDS}
        if not klub_tokens:
            self.log(f" 🟡 no distinctive tokens for {klub['naziv']}, skip clanovi link")
            return

        linked = 0
        for c in self._hvs_clanovi:
            blob = ((c.get("bio") or "") + " " + (c.get("name") or "")).lower()
            # Slugify, then compare whole words so a token never matches as
            # a substring of a longer word.
            words = set(self.slugify(blob).replace("-", " ").split())
            if klub_tokens & words:
                try:
                    cid = self._insert_official(c, klub['id'])
                    if cid:
                        self.stats['players'] += 1
                        linked += 1
                except Exception as e:
                    self.log(f" official upsert err: {e}")
        if linked:
            self.log(f" 🧑 {linked} clanovi linked via bio match")

        if page is not None:
            self._scrape_roster_with_playwright(page, klub)

    def _scrape_roster_with_playwright(self, page, klub):
        """Legacy roster scrape through the rendered site (best effort).

        Runs only when a caller supplies a Playwright page. Any failure is
        logged and swallowed so it cannot abort the club.
        NOTE(review): statement nesting was reconstructed from an unindented
        listing; in particular the position of ``break`` should be confirmed.
        """
        try:
            # Get all klubovi list from HVS
            page.goto("https://hvs.hr/klubovi/", wait_until="domcontentloaded", timeout=20000)
            klub_links = page.locator('a[href*="/klub/"]').all()
            naziv_lower = klub['naziv'].lower()
            name_words = [kw for kw in naziv_lower.split() if len(kw) > 3]
            for a in klub_links[:30]:
                text = a.inner_text().lower()
                href = a.get_attribute('href') or ''
                # Naive match: does any longer word of the club name occur
                # in the link text?
                if not any(kw in text for kw in name_words):
                    continue
                self.log(f" Match: {text[:50]}{href}")
                m = re.search(r'/klub/(\d+)', href)
                if not m:
                    continue
                kid = m.group(1)
                new_url = f"https://hvs.hr/klub/{kid}/"
                with self.conn.cursor() as cur:
                    # Only fills source_url when it is still empty.
                    cur.execute("UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s", (new_url, klub['id']))
                # Now visit klub page for roster
                page.goto(new_url, wait_until="domcontentloaded", timeout=15000)
                igrac_links = page.locator('a[href*="/igrac/"]').all()
                self.log(f" {len(igrac_links)} igrača found")
                for ia in igrac_links[:30]:
                    ihref = ia.get_attribute('href') or ''
                    naziv = ia.inner_text().strip()
                    mi = re.search(r'/igrac/(\d+)', ihref)
                    if mi and naziv:
                        parts = re.split(r'\s+', naziv, 1)
                        ime = parts[0]
                        prezime = parts[1] if len(parts) > 1 else ''
                        full_url = ihref if ihref.startswith('http') else f"https://hvs.hr{ihref}"
                        self.upsert_clan(
                            klub_id=klub['id'], source_id=mi.group(1),
                            ime=ime, prezime=prezime,
                            source_url=full_url
                        )
                        self.stats['players'] += 1
                # The page has navigated away from the club list, so the
                # remaining link handles are no longer usable.
                break
        except Exception as e:
            self.log(f"{e}")

    def _heartbeat(self):
        """Best-effort liveness marker: store the current epoch seconds under
        the Redis key "cc:pgz-sport:heartbeat" via ``redis-cli``."""
        try:
            import subprocess
            subprocess.run(["redis-cli", "SET", "cc:pgz-sport:heartbeat",
                            str(int(time.time()))], timeout=3, capture_output=True)
        except Exception as e:
            # A missing redis-cli or a timeout must never fail the harvest.
            self.log(f" heartbeat err: {e}")

    def _notify_telegram(self, text):
        """Best-effort Telegram message via the Bot API ``sendMessage`` call.

        Credentials are read from the environment variables named by
        TELEGRAM_TOKEN_ENV / TELEGRAM_CHAT_ENV. If either is unset the
        notification is skipped with a log line. Failures are logged only.
        """
        import os
        from urllib.parse import urlencode

        token = os.environ.get(self.TELEGRAM_TOKEN_ENV)
        chat_id = os.environ.get(self.TELEGRAM_CHAT_ENV)
        if not token or not chat_id:
            self.log(f" Telegram notification skipped: set {self.TELEGRAM_TOKEN_ENV} "
                     f"and {self.TELEGRAM_CHAT_ENV} to enable it")
            return
        payload = urlencode({"chat_id": chat_id, "text": text}).encode("utf-8")
        req = urllib.request.Request(
            f"https://api.telegram.org/bot{token}/sendMessage", data=payload)
        try:
            # POST in-process so the token never appears in a process list.
            with urllib.request.urlopen(req, timeout=8):
                pass
        except Exception as e:
            self.log(f" Telegram notification failed: {e}")

    # We override run() to skip Playwright entirely (HVS site is broken for it).
    def run(self, limit=999):
        """Harvest up to *limit* target clubs, then send a summary message.

        Per-club failures are counted in ``self.stats['errors']`` and logged;
        they never stop the loop.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova "
                 f"(wp-json only — site SPA is broken for /klub/, /igrac/, /kategorija/)")
        for klub in klubovi:
            try:
                self.scrape_klub(None, klub)  # no Playwright page
                self.stats['klubova'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"VATERPOLO harvest done: {self.stats}")
if __name__ == '__main__':
    # Optional first CLI argument caps the number of clubs to harvest.
    # The harvest is started exactly once; the default of 28 is the newer of
    # the two defaults that were present (50 and 28).
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 28
    HVSHarvester().run(limit=limit)