1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
275 lines
11 KiB
Python
Executable File
275 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
# Name: hvs_waterpolo.py
|
||
# Version: 2.0
|
||
# Author: Damir Radulić <dradulic@outlook.com> / damir@rinet.one
|
||
# Date: 2026-05-05
|
||
# Description: HVS (hvs.hr) waterpolo harvester.
|
||
# Brutalno iskreno: HVS web NE objavljuje rosters/stats po klubu
|
||
# kroz indeksabilan kanal — /klub-{slug}/ vraća 404, /klubovi/{id}/
|
||
# vraća "Pojavila se kritična greška", /igrac-{slug}/ vraća 404, a
|
||
# /kategorija/{id}/ prikazuje samo sezonsku navigaciju. Jedino što
|
||
# je upotrebljivo je wp-json REST API:
|
||
# /wp/v2/klubovi (20 klubova + ACF.history)
|
||
# /wp/v2/clanovi (37 federation officials s biografijama u kojima
|
||
# se najčešće spominje klupska karijera)
|
||
# What this harvester does:
#   1. Reads wp-json klubovi and matches them to PGŽ clubs; a match is
#      stored on the club row (source_url + metadata).
#   2. Reads wp-json clanovi and upserts those whose biography or name
#      mentions a distinctive keyword of a PGŽ club (heuristic), linking
#      them to that club.
#   3. Does not use Playwright: run() is overridden and passes no page,
#      because the /klub-{slug}/ pages return 404/error.
|
||
import html
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
|
||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||
from __base import SportHarvester
|
||
from psycopg2.extras import RealDictCursor
|
||
|
||
# Root of the federation site; the wp-json collection endpoints hang off it.
HVS_BASE = "https://hvs.hr"
WP_KLUB_API = f"{HVS_BASE}/wp-json/wp/v2/klubovi"
WP_CLANOV_API = f"{HVS_BASE}/wp-json/wp/v2/clanovi"

# Slug tokens treated as "distinctive" for a club: they are used to reject
# fuzzy club matches that carry a different keyword, and to link officials
# whose biography or name mentions one of them.
KEYWORDS = [
    "primorje",
    "opatija",
    "jadran",
    "losinj",
    "palada",
    "silo",
    "crikvenica",
    "orka",
    "bura",
    "posk",
    "victoria",
    "kostrena",
    "njivice",
    "rijeka",
]
|
||
|
||
|
||
def _fetch_paginated(url, log):
|
||
"""Fetch all pages of a wp-json collection."""
|
||
out = []
|
||
for page in range(1, 20):
|
||
u = f"{url}?per_page=100&page={page}"
|
||
try:
|
||
req = urllib.request.Request(u, headers={"User-Agent": "Mozilla/5.0 PGZ-Sport"})
|
||
with urllib.request.urlopen(req, timeout=15) as r:
|
||
data = json.loads(r.read().decode("utf-8"))
|
||
except Exception as e:
|
||
log(f" wp-json {u} err: {e}")
|
||
break
|
||
if not data:
|
||
break
|
||
out.extend(data)
|
||
if len(data) < 100:
|
||
break
|
||
return out
|
||
|
||
|
||
class HVSHarvester(SportHarvester):
    """Water-polo harvester backed by the hvs.hr wp-json REST API.

    For every target PGŽ club it (1) tries to match the club against the
    wp-json ``klubovi`` collection and stores the link on the club row, and
    (2) upserts ``clanovi`` entries whose biography or name mentions one of
    the club's distinctive keywords.  Playwright is not used: ``run`` is
    overridden and passes ``None`` as the page.

    Uses ``conn``, ``log``, ``stats``, ``slugify`` and ``upsert_clan`` from
    the base class.
    """

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # Environment variables for the end-of-run Telegram notification.  The
    # bot token used to be hard-coded here; it must live outside the source
    # tree (and the previously committed token should be revoked).
    TELEGRAM_TOKEN_ENV = "PGZ_TELEGRAM_BOT_TOKEN"
    TELEGRAM_CHAT_ENV = "PGZ_TELEGRAM_CHAT_ID"
    TELEGRAM_DEFAULT_CHAT = "7969491558"

    # wp-json collections, fetched once on first use and reused per club.
    _hvs_klubovi = None
    _hvs_clanovi = None

    # Noise removed from a club name before slugifying, applied in order;
    # every hit is replaced by a single space.
    _NOISE_PATTERNS = tuple(re.compile(p) for p in (
        r"\[merged[^\]]*\]",
        r"\([^)]*\)",
        r"\b(vaterpolo|vaterpolski|amaterski|žvk|zvk|vk|hšk|hsk)\b",
        r"\bklub\b",
        r"\bsavez\b.*$",
        r"-(muška|ženska)\s*ekipa",
        r"-erste\s*banka?a?",
    ))

    # ------------- target list (overrides the base-class selection) ---------
    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs of this sport as dict rows.

        Rows come from ``pgz_sport.v_pgz_priority_klubovi`` ordered by
        ``financiran DESC, u_godisnjaku DESC, id``.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    # ------------- normalize / match helpers ---------------------------------
    def _core_slug(self, name):
        """Return the slug of ``name`` with generic club wording removed.

        Returns ``""`` for an empty/``None`` name.
        """
        if not name:
            return ""
        cleaned = name.lower()
        for pattern in self._NOISE_PATTERNS:
            cleaned = pattern.sub(" ", cleaned)
        return self.slugify(cleaned).strip("-")

    def _tokens(self, name):
        """Return the set of core-slug parts of ``name`` longer than 2 chars."""
        return {t for t in self._core_slug(name).split("-") if len(t) > 2}

    def _match_klub(self, pgz_naziv, hvs_list):
        """Find the HVS club entry corresponding to a PGŽ club name.

        Args:
            pgz_naziv: Club name as stored on the PGŽ side.
            hvs_list: Simplified HVS club dicts (each with a ``"title"``).

        Returns:
            The matching dict from ``hvs_list`` or ``None``.  An identical
            core slug wins outright; otherwise the first candidate with the
            highest number of shared tokens (at least two) is chosen.
        """
        target_core = self._core_slug(pgz_naziv)
        target_tokens = self._tokens(pgz_naziv)
        if not target_tokens:
            return None

        # Pass 1: exact match on the normalised slug.
        for candidate in hvs_list:
            if self._core_slug(candidate["title"]) == target_core:
                return candidate

        # Pass 2: token overlap.
        keyword_set = set(KEYWORDS)
        best, best_score = None, 0
        for candidate in hvs_list:
            candidate_tokens = self._tokens(candidate["title"])
            shared = target_tokens & candidate_tokens
            if len(shared) < 2:
                continue
            # A keyword present only on the candidate side indicates a
            # different club that merely shares generic words.
            if (candidate_tokens - target_tokens) & keyword_set:
                continue
            if len(shared) > best_score:
                best, best_score = candidate, len(shared)
        return best

    # ------------- wp-json fetch + simplify ----------------------------------
    @staticmethod
    def _rendered_title(item):
        """Return ``item["title"]["rendered"]`` as plain, trimmed text.

        HTML entities are decoded and the typographic apostrophe is
        normalised to ``'``.  Missing values yield ``""``.
        """
        rendered = (item.get("title") or {}).get("rendered") or ""
        return html.unescape(rendered).replace("’", "'").strip()

    def _fetch_hvs_klubovi(self):
        """Download the wp-json ``klubovi`` collection as simplified dicts."""
        simplified = []
        for k in _fetch_paginated(WP_KLUB_API, self.log):
            acf = k.get("ACF") or {}
            simplified.append({
                "wp_id": k.get("id"),
                "club_id": acf.get("club_id"),
                "title": self._rendered_title(k),
                "link": k.get("link"),
                "slug": k.get("slug"),
                "history": acf.get("history") or "",
            })
        return simplified

    def _fetch_hvs_clanovi(self):
        """Download the wp-json ``clanovi`` collection as simplified dicts."""
        simplified = []
        for c in _fetch_paginated(WP_CLANOV_API, self.log):
            acf = c.get("ACF") or {}
            simplified.append({
                "wp_id": c.get("id"),
                # Prefer the structured name; fall back to the post title.
                "name": acf.get("name") or self._rendered_title(c),
                "image": acf.get("image") or "",
                "birth_date": acf.get("birth_date") or "",
                "birth_place": acf.get("birth_place") or "",
                "position": acf.get("position") or "",
                "bio": acf.get("bio") or "",
                "slug": c.get("slug"),
                "link": c.get("link"),
            })
        return simplified

    # ------------- DB persistence helpers ------------------------------------
    def _persist_klub_link(self, klub_id, hvs):
        """Store the matched HVS entry on the club row.

        Sets ``source_url`` and merges ``hvs_wp_id``, ``hvs_club_id`` and
        ``hvs_title`` into the row's ``metadata`` JSON.
        """
        # NOTE(review): no commit is issued here — presumably the base class
        # or an autocommit connection takes care of it; confirm.
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s,
                    metadata = COALESCE(metadata, '{}'::jsonb) ||
                               jsonb_build_object('hvs_wp_id', %s::int,
                                                  'hvs_club_id', %s,
                                                  'hvs_title', %s),
                    updated_at = now()
                WHERE id = %s
            """, (hvs["link"], hvs["wp_id"], hvs.get("club_id"), hvs["title"], klub_id))

    def _split_name(self, full):
        """Split a full name into ``(first word, remainder)``.

        Whitespace runs are collapsed first; an empty/``None`` input gives
        ``("", "")`` and a single word gives ``(word, "")``.
        """
        full = re.sub(r"\s+", " ", (full or "")).strip()
        if not full:
            return "", ""
        ime, _, prezime = full.partition(" ")
        return ime, prezime

    def _insert_official(self, clan_data, klub_id):
        """Upsert one simplified ``clanovi`` entry as a member of ``klub_id``.

        Returns whatever ``upsert_clan`` returns, or ``None`` when the entry
        has no usable name.
        """
        ime, prezime = self._split_name(clan_data["name"])
        if not ime:
            return None
        extra = {
            "image": clan_data.get("image", ""),
            "birth_date": clan_data.get("birth_date", ""),
            "birth_place": clan_data.get("birth_place", ""),
            "position": clan_data.get("position", ""),
            "bio": (clan_data.get("bio") or "")[:8000],
            "hvs_role": "federation_official_or_staff",
        }
        kategorija = clan_data.get("position") or "stručna funkcija"
        return self.upsert_clan(
            klub_id=klub_id,
            source_id=str(clan_data["wp_id"]),
            ime=ime, prezime=prezime,
            source_url=clan_data["link"],
            kategorija=kategorija,
            sezona=None,
            extra=extra,
        )

    # ------------- klub-level orchestration ----------------------------------
    def _ensure_wp_json_loaded(self):
        """Fetch both wp-json collections on first use and cache them."""
        if self._hvs_klubovi is None:
            self._hvs_klubovi = self._fetch_hvs_klubovi()
            self.log(f" 📡 wp-json klubovi loaded: {len(self._hvs_klubovi)}")
        if self._hvs_clanovi is None:
            self._hvs_clanovi = self._fetch_hvs_clanovi()
            self.log(f" 📡 wp-json clanovi loaded: {len(self._hvs_clanovi)}")

    def _link_officials(self, klub):
        """Upsert officials whose bio or name mentions a keyword of ``klub``.

        Returns the number of officials linked.
        """
        klub_tokens = {t for t in self._tokens(klub['naziv']) if t in KEYWORDS}
        if not klub_tokens:
            self.log(f" 🟡 no distinctive tokens for {klub['naziv']}, skip clanovi link")
            return 0

        linked = 0
        for c in self._hvs_clanovi:
            blob = ((c.get("bio") or "") + " " + (c.get("name") or "")).lower()
            # Compare on whole slug words so a keyword only matches a
            # complete word, not a fragment of a longer one.
            words = set(self.slugify(blob).replace("-", " ").split())
            if klub_tokens.isdisjoint(words):
                continue
            try:
                cid = self._insert_official(c, klub['id'])
            except Exception as e:
                # NOTE(review): if upsert_clan fails inside an open
                # transaction, later statements on the same connection may
                # fail until a rollback — confirm how the base class
                # handles this.
                self.log(f" official upsert err: {e}")
                continue
            if cid:
                self.stats['players'] += 1
                linked += 1

        if linked:
            self.log(f" 🧑 {linked} clanovi linked via bio match")
        return linked

    def _heartbeat(self):
        """Write the current Unix time to the Redis heartbeat key (best effort)."""
        try:
            subprocess.run(["redis-cli", "SET", "cc:pgz-sport:heartbeat",
                            str(int(time.time()))], timeout=3, capture_output=True)
        except (OSError, subprocess.SubprocessError):
            # Missing redis-cli or a timeout must never stop the harvest.
            pass

    def _notify_telegram(self, text):
        """Send ``text`` to the configured Telegram chat (best effort).

        The bot token is read from the environment; without it the
        notification is skipped with a log line.
        """
        token = os.environ.get(self.TELEGRAM_TOKEN_ENV)
        if not token:
            self.log(f" 🟡 {self.TELEGRAM_TOKEN_ENV} not set, skipping Telegram notification")
            return
        chat_id = os.environ.get(self.TELEGRAM_CHAT_ENV, self.TELEGRAM_DEFAULT_CHAT)
        payload = urllib.parse.urlencode({"chat_id": chat_id, "text": text}).encode("utf-8")
        request = urllib.request.Request(
            f"https://api.telegram.org/bot{token}/sendMessage", data=payload)
        try:
            with urllib.request.urlopen(request, timeout=8):
                pass
        except Exception as e:
            # Log the exception type only: the request URL embeds the token
            # and must not end up in log files.
            self.log(f" 🟡 Telegram notification failed: {type(e).__name__}")

    def scrape_klub(self, page, klub):
        """Process one club: link it to HVS and attach matching officials.

        Args:
            page: Unused; ``run`` passes ``None``.
            klub: Club row with at least ``id`` and ``naziv``.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        self._ensure_wp_json_loaded()

        match = self._match_klub(klub['naziv'], self._hvs_klubovi)
        if match:
            self.log(f" ✅ wp-json match → {match['title']} ({match['link']})")
            self._persist_klub_link(klub['id'], match)
        else:
            self.log(" 🟡 no wp-json klub match (HVS exposes only 20 klubova)")

        # Officials are linked via keywords found in their bio or name.
        self._link_officials(klub)

        # Heartbeat after every club, including those without distinctive
        # tokens (previously skipped by an early return).
        self._heartbeat()

    # run() is overridden to skip Playwright entirely (HVS site is broken for it).
    def run(self, limit=999):
        """Harvest up to ``limit`` clubs, then send a Telegram summary.

        A failure in one club is counted in ``stats['errors']`` and logged;
        the remaining clubs are still processed.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova "
                 f"(wp-json only — site SPA is broken for /klub/, /igrac/, /kategorija/)")
        for klub in klubovi:
            try:
                self.scrape_klub(None, klub)  # no Playwright page
                self.stats['klubova'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"VATERPOLO harvest done: {self.stats}")
|
||
|
||
|
||
# Script entry point: the optional first CLI argument caps the number of
# clubs processed (28 when omitted).
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    HVSHarvester().run(limit=int(cli_args[0]) if cli_args else 28)
|