Files
pgz-sport/scripts/sport_harvesters/hvs_waterpolo.py
T
damir 1d02c0897d Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
2026-05-05 13:08:11 +02:00

275 lines
11 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# Name: hvs_waterpolo.py
# Version: 2.0
# Author: Damir Radulić <dradulic@outlook.com> / damir@rinet.one
# Date: 2026-05-05
# Description: HVS (hvs.hr) waterpolo harvester.
# Brutalno iskreno: HVS web NE objavljuje rosters/stats po klubu
# kroz indeksabilan kanal — /klub-{slug}/ vraća 404, /klubovi/{id}/
# vraća "Pojavila se kritična greška", /igrac-{slug}/ vraća 404, a
# /kategorija/{id}/ prikazuje samo sezonsku navigaciju. Jedino što
# je upotrebljivo je wp-json REST API:
# /wp/v2/klubovi (20 klubova + ACF.history)
# /wp/v2/clanovi (37 federation officials s biografijama u kojima
# se najčešće spominje klupska karijera)
# Ovaj harvester:
# 1. Reads wp-json klubovi → maps PGŽ clubs (source_url + metadata).
# 2. Reads wp-json clanovi → upserts the entries whose bio or name
# mentions a distinctive PGŽ club keyword (heuristic match).
# 3. Does not use Playwright: run() is overridden and calls
# scrape_klub() with page=None, because the HVS club/player pages
# return 404/error responses and offer nothing to scrape.
import sys, re, json, time, urllib.request
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
from psycopg2.extras import RealDictCursor
# wp-json collection endpoints of hvs.hr (clubs and "clanovi" entries).
WP_KLUB_API = "https://hvs.hr/wp-json/wp/v2/klubovi"
WP_CLANOV_API = "https://hvs.hr/wp-json/wp/v2/clanovi"
# Site root; not referenced in the visible part of this file.
HVS_BASE = "https://hvs.hr"
# Distinctive club-name tokens. They are used in two places:
#   * _match_klub rejects a fuzzy candidate whose title carries one of these
#     tokens that the target club name does not have;
#   * scrape_klub links a "clanovi" entry to a club only when the entry's
#     bio/name contains one of the club's tokens from this list.
KEYWORDS = [
"primorje", "opatija", "jadran", "losinj", "palada",
"silo", "crikvenica", "orka", "bura", "posk", "victoria",
"kostrena", "njivice", "rijeka",
]
def _fetch_paginated(url, log):
"""Fetch all pages of a wp-json collection."""
out = []
for page in range(1, 20):
u = f"{url}?per_page=100&page={page}"
try:
req = urllib.request.Request(u, headers={"User-Agent": "Mozilla/5.0 PGZ-Sport"})
with urllib.request.urlopen(req, timeout=15) as r:
data = json.loads(r.read().decode("utf-8"))
except Exception as e:
log(f" wp-json {u} err: {e}")
break
if not data:
break
out.extend(data)
if len(data) < 100:
break
return out
class HVSHarvester(SportHarvester):
    """Harvest HVS (hvs.hr) water polo data through the site's wp-json API.

    For every target club the harvester tries to match it against the HVS
    club list (storing the HVS link and ids on the club row) and upserts the
    HVS "clanovi" entries whose bio or name mentions one of the club's
    distinctive ``KEYWORDS`` tokens.

    ``conn``, ``log``, ``stats``, ``slugify`` and ``upsert_clan`` are not
    defined in this class; they are expected to come from ``SportHarvester``.
    """

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # Patterns blanked out of a lower-cased club name, applied in this order,
    # before the name is slugified (see _core_slug).
    _NOISE_PATTERNS = (
        re.compile(r"\[merged[^\]]*\]"),
        re.compile(r"\([^)]*\)"),
        re.compile(r"\b(vaterpolo|vaterpolski|amaterski|žvk|zvk|vk|hšk|hsk)\b"),
        re.compile(r"\bklub\b"),
        re.compile(r"\bsavez\b.*$"),
        re.compile(r"-(muška|ženska)\s*ekipa"),
        re.compile(r"-erste\s*banka?a?"),
    )

    # ------------- target list ------------------------------------------------
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs of this sport.

        Rows come from ``pgz_sport.v_pgz_priority_klubovi`` as dict-like
        records, ordered by ``financiran`` and ``u_godisnjaku`` (both
        descending) and then by ``id``.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    # ------------- normalize / match helpers ---------------------------------
    def _core_slug(self, name):
        """Reduce a club name to its distinguishing slug.

        The name is lower-cased, the ``_NOISE_PATTERNS`` fragments are
        replaced by spaces, and the remainder goes through ``self.slugify``
        (assumed to yield a hyphen-separated slug — confirm in the base
        class).  Leading/trailing hyphens are removed.  Empty or ``None``
        input gives ``""``.
        """
        if not name:
            return ""
        s = name.lower()
        for pattern in self._NOISE_PATTERNS:
            s = pattern.sub(" ", s)
        return self.slugify(s).strip("-")

    def _tokens(self, name):
        """Return the set of core-slug parts of ``name`` longer than 2 chars."""
        return {t for t in self._core_slug(name).split("-") if len(t) > 2}

    def _match_klub(self, pgz_naziv, hvs_list):
        """Find the HVS club entry that corresponds to a PGŽ club name.

        An entry whose core slug equals the target's wins immediately.
        Otherwise the entry sharing the most tokens (at least two) is chosen,
        skipping entries whose title carries a ``KEYWORDS`` token absent from
        the target name.  On a tie the earliest entry is kept.

        Returns:
            The matching dict from ``hvs_list`` or ``None``.
        """
        target_core = self._core_slug(pgz_naziv)
        target_tokens = self._tokens(pgz_naziv)
        if not target_tokens:
            return None

        # Pass 1: exact core-slug equality.
        for h in hvs_list:
            if self._core_slug(h["title"]) == target_core:
                return h

        # Pass 2: token overlap.
        keywords = set(KEYWORDS)
        best, best_score = None, 0
        for h in hvs_list:
            candidate_tokens = self._tokens(h["title"])
            shared = target_tokens & candidate_tokens
            if len(shared) < 2:
                continue
            # A foreign place/club keyword means this is a different club.
            if (candidate_tokens - target_tokens) & keywords:
                continue
            if len(shared) > best_score:
                best_score = len(shared)
                best = h
        return best

    # ------------- wp-json fetch + simplify ----------------------------------
    @staticmethod
    def _clean_wp_title(item):
        """Return the rendered title of a wp-json item with entities cleaned.

        ``&#8211;`` is dropped, ``&#8217;`` becomes ``'`` and ``&amp;``
        becomes ``&``; a missing title yields ``""``.
        """
        title = (item.get("title") or {}).get("rendered", "").strip()
        return (title.replace("&#8211;", "").replace("&#8217;", "'")
                .replace("&amp;", "&"))

    def _fetch_hvs_klubovi(self):
        """Download the HVS club collection as a list of simplified dicts."""
        out = []
        for k in _fetch_paginated(WP_KLUB_API, self.log):
            acf = k.get("ACF") or {}
            out.append({
                "wp_id": k.get("id"),
                "club_id": acf.get("club_id"),
                "title": self._clean_wp_title(k),
                "link": k.get("link"),
                "slug": k.get("slug"),
                "history": acf.get("history") or "",
            })
        return out

    def _fetch_hvs_clanovi(self):
        """Download the HVS "clanovi" collection as simplified dicts.

        ``name`` falls back to the cleaned post title when the ACF field is
        empty; the other ACF fields default to ``""``.
        """
        out = []
        for c in _fetch_paginated(WP_CLANOV_API, self.log):
            acf = c.get("ACF") or {}
            out.append({
                "wp_id": c.get("id"),
                "name": acf.get("name") or self._clean_wp_title(c),
                "image": acf.get("image") or "",
                "birth_date": acf.get("birth_date") or "",
                "birth_place": acf.get("birth_place") or "",
                "position": acf.get("position") or "",
                "bio": acf.get("bio") or "",
                "slug": c.get("slug"),
                "link": c.get("link"),
            })
        return out

    # ------------- DB persistence helpers ------------------------------------
    def _persist_klub_link(self, klub_id, hvs):
        """Store the matched HVS link and ids on the ``klubovi`` row.

        ``source_url`` is overwritten; ``hvs_wp_id``, ``hvs_club_id`` and
        ``hvs_title`` are merged into the row's ``metadata`` JSON.  No commit
        is issued here.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s,
                    metadata = COALESCE(metadata, '{}'::jsonb) ||
                        jsonb_build_object('hvs_wp_id', %s::int,
                                           'hvs_club_id', %s,
                                           'hvs_title', %s),
                    updated_at = now()
                WHERE id = %s
            """, (hvs["link"], hvs["wp_id"], hvs.get("club_id"), hvs["title"], klub_id))

    def _split_name(self, full):
        """Split a full name into ``(first word, rest)``.

        Whitespace runs are collapsed first; empty or ``None`` input returns
        ``("", "")`` and a single word returns ``(word, "")``.
        """
        full = re.sub(r"\s+", " ", (full or "")).strip()
        if not full:
            return "", ""
        ime, _, prezime = full.partition(" ")
        return ime, prezime

    def _insert_official(self, clan_data, klub_id):
        """Upsert one HVS "clanovi" entry as a member of club ``klub_id``.

        Returns:
            Whatever ``self.upsert_clan`` returns, or ``None`` when the entry
            has no usable name.
        """
        ime, prezime = self._split_name(clan_data["name"])
        if not ime:
            return None
        extra = {
            "image": clan_data.get("image", ""),
            "birth_date": clan_data.get("birth_date", ""),
            "birth_place": clan_data.get("birth_place", ""),
            "position": clan_data.get("position", ""),
            # Bio is capped at 8000 characters before being stored.
            "bio": (clan_data.get("bio") or "")[:8000],
            "hvs_role": "federation_official_or_staff",
        }
        kategorija = clan_data.get("position") or "stručna funkcija"
        return self.upsert_clan(
            klub_id=klub_id,
            source_id=str(clan_data["wp_id"]),
            ime=ime, prezime=prezime,
            source_url=clan_data["link"],
            kategorija=kategorija,
            sezona=None,
            extra=extra,
        )

    def _link_officials_by_bio(self, klub, klub_tokens):
        """Upsert every cached "clanovi" entry that mentions a club token.

        An entry matches when any of ``klub_tokens`` appears as a whole word
        in its slugified bio + name.  Upsert failures are logged and skipped.

        Returns:
            Number of entries for which the upsert returned a truthy id.
        """
        wanted = set(klub_tokens)
        linked = 0
        for c in self._hvs_clanovi:
            blob = ((c.get("bio") or "") + " " + (c.get("name") or "")).lower()
            words = set(self.slugify(blob).replace("-", " ").split())
            if wanted.isdisjoint(words):
                continue
            try:
                cid = self._insert_official(c, klub['id'])
                if cid:
                    self.stats['players'] += 1
                    linked += 1
            except Exception as e:
                self.log(f" official upsert err: {e}")
        return linked

    def _touch_heartbeat(self):
        """Best-effort: write the current epoch to the Redis heartbeat key."""
        import subprocess
        try:
            subprocess.run(["redis-cli", "SET", "cc:pgz-sport:heartbeat",
                            str(int(time.time()))], timeout=3, capture_output=True)
        except (OSError, subprocess.SubprocessError) as e:
            # Missing redis-cli or a timeout must never abort the harvest.
            self.log(f" heartbeat err: {e}")

    def _send_done_notification(self):
        """Best-effort: post the final stats to Telegram via ``curl``.

        Credentials are read from the ``TELEGRAM_BOT_TOKEN`` and
        ``TELEGRAM_CHAT_ID`` environment variables; when either is missing
        the notification is skipped.

        NOTE(review): an earlier revision embedded the bot token and chat id
        in this file.  That token is in version-control history and should
        be revoked/rotated.
        """
        import os
        import subprocess
        token = os.environ.get("TELEGRAM_BOT_TOKEN")
        chat_id = os.environ.get("TELEGRAM_CHAT_ID")
        if not token or not chat_id:
            self.log(" 🟡 Telegram notification skipped "
                     "(TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID not set)")
            return
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat_id}",
                            "--data-urlencode",
                            f"text=VATERPOLO harvest done: {self.stats}"],
                           timeout=8, capture_output=True)
        except (OSError, subprocess.SubprocessError) as e:
            self.log(f" telegram notify err: {e}")

    # ------------- klub-level orchestration ----------------------------------
    def scrape_klub(self, page, klub):
        """Process one club: link it to HVS and attach matching officials.

        Args:
            page: Unused; ``run()`` passes ``None`` (no Playwright page).
            klub: Mapping with at least ``id`` and ``naziv``.

        The wp-json collections are downloaded on first use and cached on the
        instance.  The heartbeat is written at the end of every call, also
        when the club has no distinctive tokens.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        if not hasattr(self, "_hvs_klubovi"):
            self._hvs_klubovi = self._fetch_hvs_klubovi()
            self.log(f" 📡 wp-json klubovi loaded: {len(self._hvs_klubovi)}")
        if not hasattr(self, "_hvs_clanovi"):
            self._hvs_clanovi = self._fetch_hvs_clanovi()
            self.log(f" 📡 wp-json clanovi loaded: {len(self._hvs_clanovi)}")

        match = self._match_klub(klub['naziv'], self._hvs_klubovi)
        if match:
            self.log(f" ✅ wp-json match → {match['title']} ({match['link']})")
            self._persist_klub_link(klub['id'], match)
        else:
            self.log(f" 🟡 no wp-json klub match (HVS exposes only 20 klubova)")

        # Insert federation officials whose bio mentions any distinctive token
        # of this PGŽ klub. This is the only way HVS surfaces person-level data.
        klub_tokens = [t for t in self._tokens(klub['naziv']) if t in KEYWORDS]
        if klub_tokens:
            linked = self._link_officials_by_bio(klub, klub_tokens)
            if linked:
                self.log(f" 🧑 {linked} clanovi linked via bio match")
        else:
            self.log(f" 🟡 no distinctive tokens for {klub['naziv']}, skip clanovi link")

        self._touch_heartbeat()

    # We override run() to skip Playwright entirely (HVS site is broken for it).
    def run(self, limit=999):
        """Harvest up to ``limit`` target clubs using wp-json only.

        A failure in one club is counted in ``stats['errors']`` and logged;
        the remaining clubs are still processed.  A completion notification
        is attempted at the end.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova "
                 f"(wp-json only — site SPA is broken for /klub/, /igrac/, /kategorija/)")
        for klub in klubovi:
            try:
                self.scrape_klub(None, klub)  # no Playwright page
                self.stats['klubova'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
        self.log(f"✅ Done. Stats: {self.stats}")
        self._send_done_notification()
if __name__ == '__main__':
    # Optional first CLI argument caps the number of clubs; default is 28.
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else 28
    HVSHarvester().run(limit=limit)