#!/usr/bin/env python3
# Name: hvs_waterpolo.py
# Version: 2.0
# Author: Damir Radulić / damir@rinet.one
# Date: 2026-05-05
# Description: HVS (hvs.hr) waterpolo harvester.
#
# Brutally honest: the HVS website does NOT publish per-club rosters/stats
# through any indexable channel -- /klub-{slug}/ returns 404,
# /klubovi/{id}/ returns "Pojavila se kritična greška" (a critical-error
# page), /igrac-{slug}/ returns 404, and /kategorija/{id}/ only shows the
# season navigation. The only usable channel is the wp-json REST API:
#   /wp/v2/klubovi  (20 clubs + ACF.history)
#   /wp/v2/clanovi  (37 federation officials with biographies that most
#                    often mention their club career)
#
# This harvester:
#   1. Reads wp-json klubovi and maps them onto PGŽ clubs
#      (source_url + metadata).
#   2. Reads wp-json clanovi and upserts the officials whose biography or
#      name mentions a distinctive keyword of the PGŽ club (heuristic).
#   3. Does not use Playwright at all: run() is overridden and passes
#      page=None to scrape_klub().

import html
import json
import os
import re
import subprocess
import sys
import time
import urllib.request

# The project-local base module lives outside the default import path.
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester  # noqa: E402
from psycopg2.extras import RealDictCursor  # noqa: E402

WP_KLUB_API = "https://hvs.hr/wp-json/wp/v2/klubovi"
WP_CLANOV_API = "https://hvs.hr/wp-json/wp/v2/clanovi"
HVS_BASE = "https://hvs.hr"

# Distinctive place/club tokens used both to reject fuzzy club matches that
# point at a different club and to link officials to a club via their bio.
KEYWORDS = [
    "primorje", "opatija", "jadran", "losinj", "palada", "silo",
    "crikvenica", "orka", "bura", "posk", "victoria", "kostrena",
    "njivice", "rijeka",
]
# Set view of KEYWORDS, built once instead of inside the matching loops.
_KEYWORD_SET = frozenset(KEYWORDS)

# wp-json pagination: page size requested and the highest page number tried.
_PER_PAGE = 100
_MAX_PAGES = 19

# SECURITY NOTE(review): the Telegram credentials below were hard-coded in
# this file and must be treated as leaked -- rotate the bot token and supply
# the new values through the environment. The literals remain only as a
# fallback so the existing notification keeps working until that is done.
TELEGRAM_BOT_TOKEN = os.environ.get(
    "HVS_TELEGRAM_BOT_TOKEN",
    "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y",
)
TELEGRAM_CHAT_ID = os.environ.get("HVS_TELEGRAM_CHAT_ID", "7969491558")


def _fetch_paginated(url, log):
    """Fetch all pages of a wp-json collection.

    Args:
        url: Collection endpoint without a query string.
        log: Callable taking one message string; used to report failures.

    Returns:
        A list with the decoded JSON items of every page that could be
        read. Fetching stops at the first error, the first empty page, the
        first short page, or after ``_MAX_PAGES`` pages.
    """
    out = []
    for page in range(1, _MAX_PAGES + 1):
        u = f"{url}?per_page={_PER_PAGE}&page={page}"
        try:
            req = urllib.request.Request(
                u, headers={"User-Agent": "Mozilla/5.0 PGZ-Sport"})
            with urllib.request.urlopen(req, timeout=15) as r:
                data = json.loads(r.read().decode("utf-8"))
        except Exception as e:
            # Best-effort: keep whatever was collected so far.
            log(f" wp-json {u} err: {e}")
            break
        if not isinstance(data, list):
            # A JSON object here would otherwise be "extended" into the
            # result as its bare keys and break the callers' .get() calls.
            log(f" wp-json {u} err: unexpected {type(data).__name__} payload")
            break
        if not data:
            break
        out.extend(data)
        if len(data) < _PER_PAGE:
            # Short page means this was the last one.
            break
    return out


class HVSHarvester(SportHarvester):
    """Harvest waterpolo club links and officials from the hvs.hr wp-json API."""

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # ------------- target list (override base, return all 28) ----------------
    def get_target_klubovi(self, limit=999):
        """Return the priority clubs for ``SPORT`` as dict-like rows.

        Rows come from ``pgz_sport.v_pgz_priority_klubovi`` ordered by
        ``financiran DESC, u_godisnjaku DESC, id`` and capped at ``limit``.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    # ------------- normalize / match helpers ---------------------------------
    def _core_slug(self, name):
        """Reduce a club name to a slug of its distinctive words.

        Strips ``[merged ...]`` markers, parenthesised text, generic club
        words (vaterpolo, vk, klub, ...), everything from "savez" onwards,
        team suffixes and the sponsor suffix, then passes the remainder
        through ``self.slugify`` (defined outside this file).

        Returns:
            The slug without leading/trailing dashes, or "" for empty input.
        """
        if not name:
            return ""
        s = name.lower()
        s = re.sub(r"\[merged[^\]]*\]", " ", s)
        s = re.sub(r"\([^)]*\)", " ", s)
        s = re.sub(
            r"\b(vaterpolo|vaterpolski|amaterski|žvk|zvk|vk|hšk|hsk)\b",
            " ", s)
        s = re.sub(r"\bklub\b", " ", s)
        s = re.sub(r"\bsavez\b.*$", " ", s)
        s = re.sub(r"-(muška|ženska)\s*ekipa", " ", s)
        s = re.sub(r"-erste\s*banka?a?", " ", s)
        s = self.slugify(s)
        return s.strip("-")

    def _tokens(self, name):
        """Return the set of core-slug parts of ``name`` longer than 2 chars."""
        return {t for t in self._core_slug(name).split("-") if len(t) > 2}

    def _match_klub(self, pgz_naziv, hvs_list):
        """Find the HVS club entry that corresponds to a PGŽ club name.

        An exact core-slug match wins. Otherwise the entry sharing the most
        tokens (at least two) is chosen, skipping entries that carry an
        extra KEYWORDS token the target lacks, since that indicates a
        different club.

        Args:
            pgz_naziv: Club name as stored on the PGŽ side.
            hvs_list: Dicts with at least a "title" key.

        Returns:
            The matching dict from ``hvs_list`` or None.
        """
        target_core = self._core_slug(pgz_naziv)
        target_tokens = self._tokens(pgz_naziv)
        if not target_tokens:
            return None

        for h in hvs_list:
            if self._core_slug(h["title"]) == target_core:
                return h

        best, best_score = None, 0
        for h in hvs_list:
            ht = self._tokens(h["title"])
            shared = target_tokens & ht
            if len(shared) < 2:
                continue
            if (ht - target_tokens) & _KEYWORD_SET:
                continue
            if len(shared) > best_score:
                best_score = len(shared)
                best = h
        return best

    # ------------- wp-json fetch + simplify ----------------------------------
    @staticmethod
    def _clean_title(rendered):
        """Decode HTML entities in a wp-json ``title.rendered`` value.

        The previous chain of ``str.replace`` calls mapped characters onto
        themselves ("–" to "–", "&" to "&"), so any entity present in the
        rendered title was left untouched. ``html.unescape`` decodes named
        and numeric entities alike; the typographic apostrophe is still
        folded to a plain one, as before.
        """
        title = html.unescape(rendered or "").strip()
        return title.replace("\u2019", "'")

    def _fetch_hvs_klubovi(self):
        """Download wp-json klubovi and return them as simplified dicts.

        Each dict has the keys wp_id, club_id, title, link, slug, history.
        """
        out = []
        for k in _fetch_paginated(WP_KLUB_API, self.log):
            title = self._clean_title((k.get("title") or {}).get("rendered"))
            acf = k.get("ACF") or {}
            out.append({
                "wp_id": k.get("id"),
                "club_id": acf.get("club_id"),
                "title": title,
                "link": k.get("link"),
                "slug": k.get("slug"),
                "history": acf.get("history") or "",
            })
        return out

    def _fetch_hvs_clanovi(self):
        """Download wp-json clanovi and return them as simplified dicts.

        Each dict has the keys wp_id, name, image, birth_date, birth_place,
        position, bio, slug, link. ``name`` falls back to the post title
        when the ACF name is empty.
        """
        out = []
        for c in _fetch_paginated(WP_CLANOV_API, self.log):
            title = self._clean_title((c.get("title") or {}).get("rendered"))
            acf = c.get("ACF") or {}
            out.append({
                "wp_id": c.get("id"),
                "name": acf.get("name") or title,
                "image": acf.get("image") or "",
                "birth_date": acf.get("birth_date") or "",
                "birth_place": acf.get("birth_place") or "",
                "position": acf.get("position") or "",
                "bio": acf.get("bio") or "",
                "slug": c.get("slug"),
                "link": c.get("link"),
            })
        return out

    # ------------- DB persistence helpers ------------------------------------
    def _persist_klub_link(self, klub_id, hvs):
        """Store the HVS link and ids on the ``pgz_sport.klubovi`` row.

        Sets ``source_url`` and merges hvs_wp_id / hvs_club_id / hvs_title
        into the existing ``metadata`` JSON. No commit is issued here.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s,
                    metadata = COALESCE(metadata, '{}'::jsonb)
                        || jsonb_build_object('hvs_wp_id', %s::int,
                                              'hvs_club_id', %s,
                                              'hvs_title', %s),
                    updated_at = now()
                WHERE id = %s
            """, (hvs["link"], hvs["wp_id"], hvs.get("club_id"),
                  hvs["title"], klub_id))

    def _split_name(self, full):
        """Split a full name into (first word, rest) after collapsing spaces.

        Returns ("", "") for empty or None input; the second element is ""
        when the name is a single word.
        """
        full = re.sub(r"\s+", " ", (full or "")).strip()
        if not full:
            return "", ""
        parts = full.split(" ", 1)
        return parts[0], (parts[1] if len(parts) > 1 else "")

    def _insert_official(self, clan_data, klub_id):
        """Upsert one HVS official as a member of ``klub_id``.

        Args:
            clan_data: A dict produced by ``_fetch_hvs_clanovi``.
            klub_id: Id of the PGŽ club the official is linked to.

        Returns:
            None when the name is empty, otherwise whatever
            ``self.upsert_clan`` (defined outside this file) returns.
        """
        ime, prezime = self._split_name(clan_data["name"])
        if not ime:
            return None
        extra = {
            "image": clan_data.get("image", ""),
            "birth_date": clan_data.get("birth_date", ""),
            "birth_place": clan_data.get("birth_place", ""),
            "position": clan_data.get("position", ""),
            "bio": (clan_data.get("bio") or "")[:8000],
            "hvs_role": "federation_official_or_staff",
        }
        kategorija = clan_data.get("position") or "stručna funkcija"
        return self.upsert_clan(
            klub_id=klub_id,
            source_id=str(clan_data["wp_id"]),
            ime=ime,
            prezime=prezime,
            source_url=clan_data["link"],
            kategorija=kategorija,
            sezona=None,
            extra=extra,
        )

    # ------------- klub-level orchestration ----------------------------------
    def _ensure_hvs_data(self):
        """Load the wp-json klubovi/clanovi lists once per harvester instance."""
        if not hasattr(self, "_hvs_klubovi"):
            self._hvs_klubovi = self._fetch_hvs_klubovi()
            self.log(f" 📡 wp-json klubovi loaded: {len(self._hvs_klubovi)}")
        if not hasattr(self, "_hvs_clanovi"):
            self._hvs_clanovi = self._fetch_hvs_clanovi()
            self.log(f" 📡 wp-json clanovi loaded: {len(self._hvs_clanovi)}")

    def _redis_heartbeat(self):
        """Write the current Unix time to the heartbeat key (best-effort)."""
        try:
            subprocess.run(
                ["redis-cli", "SET", "cc:pgz-sport:heartbeat",
                 str(int(time.time()))],
                timeout=3, capture_output=True)
        except Exception as e:
            # Never let monitoring break the harvest; just make it visible.
            self.log(f" heartbeat err: {e}")

    def scrape_klub(self, page, klub):
        """Link one PGŽ club to HVS data and attach matching officials.

        Args:
            page: Unused; ``run`` passes None because no browser is used.
            klub: Row with at least the keys "id" and "naziv".
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        self._ensure_hvs_data()

        match = self._match_klub(klub['naziv'], self._hvs_klubovi)
        if match:
            self.log(f" ✅ wp-json match → {match['title']} ({match['link']})")
            self._persist_klub_link(klub['id'], match)
        else:
            self.log(" 🟡 no wp-json klub match (HVS exposes only 20 klubova)")

        # Insert federation officials whose bio mentions any distinctive token
        # of this PGŽ klub. This is the only way HVS surfaces person-level data.
        klub_tokens = self._tokens(klub['naziv']) & _KEYWORD_SET
        if not klub_tokens:
            self.log(f" 🟡 no distinctive tokens for {klub['naziv']}, "
                     f"skip clanovi link")
            return

        linked = 0
        for c in self._hvs_clanovi:
            blob = ((c.get("bio") or "") + " " + (c.get("name") or "")).lower()
            # Whole-word comparison: slugify, then split into a set once
            # instead of re-splitting the text for every token.
            words = set(self.slugify(blob).replace("-", " ").split())
            if words.isdisjoint(klub_tokens):
                continue
            try:
                cid = self._insert_official(c, klub['id'])
                if cid:
                    self.stats['players'] += 1
                    linked += 1
            except Exception as e:
                self.log(f" official upsert err: {e}")
        if linked:
            self.log(f" 🧑 {linked} clanovi linked via bio match")

        # Heartbeat (not reached when the method returned early above).
        self._redis_heartbeat()

    def _telegram_notify(self, text):
        """Send ``text`` to the configured Telegram chat via curl (best-effort)."""
        try:
            subprocess.run(
                ["curl", "-s", "-X", "POST",
                 f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
                 "-d", f"chat_id={TELEGRAM_CHAT_ID}",
                 "--data-urlencode", f"text={text}"],
                timeout=8, capture_output=True)
        except Exception as e:
            self.log(f" telegram notify err: {e}")

    # We override run() to skip Playwright entirely (HVS site is broken for it).
    def run(self, limit=999):
        """Harvest every target club, then report the stats via Telegram.

        A failure in one club is counted in ``stats['errors']`` and logged;
        it does not stop the remaining clubs.

        Args:
            limit: Maximum number of clubs to process.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova "
                 f"(wp-json only — site SPA is broken for /klub/, /igrac/, /kategorija/)")
        for klub in klubovi:
            try:
                self.scrape_klub(None, klub)  # no Playwright page
                self.stats['klubova'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
        self.log(f"✅ Done. Stats: {self.stats}")
        self._telegram_notify(f"VATERPOLO harvest done: {self.stats}")


if __name__ == '__main__':
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 28
    HVSHarvester().run(limit=limit)