#!/usr/bin/env python3
"""HVS waterpolo harvester.

Looks up a club on the hvs.hr club list by name, stores the resolved club
page URL, and harvests the player links found on that club page.

Usage: pass an optional integer limit as the first CLI argument
(defaults to 50); it is forwarded to ``SportHarvester.run(limit=...)``.
"""
import re
import sys
from urllib.parse import urljoin

# The shared harvester base class lives outside the regular import path.
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester  # noqa: E402 - needs the sys.path entry above

BASE_URL = "https://hvs.hr"

# Compiled once instead of on every link inspected.
_KLUB_ID_RE = re.compile(r'/klub/(\d+)')
_IGRAC_ID_RE = re.compile(r'/igrac/(\d+)')

# Words that appear in many club names; matching on them would make the
# first listed club "match" almost any name.
# NOTE(review): extend this set if other generic words show up in club names.
_GENERIC_NAME_WORDS = frozenset({
    'klub', 'vaterpolo', 'vaterpolski', 'vaterpolska',
})

# Only the first N links of each listing are inspected.
_MAX_KLUB_LINKS = 30
_MAX_IGRAC_LINKS = 30


class HVSHarvester(SportHarvester):
    """Harvester for waterpolo clubs and players published on hvs.hr.

    Relies on ``log``, ``conn``, ``stats`` and ``upsert_clan`` being
    provided by ``SportHarvester`` (defined outside this file).
    """

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    def scrape_klub(self, page, klub):
        """Resolve one club on hvs.hr and harvest its roster.

        Args:
            page: Browser page object exposing ``goto`` and ``locator``
                (presumably a Playwright page - confirm against the base
                class).
            klub: Mapping with at least the keys ``'id'`` and ``'naziv'``.

        Returns:
            None. Any exception is logged and swallowed so that a failure
            on one club does not abort the whole run.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        try:
            klub_url = self._find_klub_url(page, klub['naziv'])
            if klub_url is None:
                return

            # Keep an already stored, non-empty source_url; only fill blanks.
            with self.conn.cursor() as cur:
                cur.execute(
                    "UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s",
                    (klub_url, klub['id']),
                )

            self._scrape_roster(page, klub['id'], klub_url)
        except Exception as e:
            self.log(f" ❌ {e}")

    def _find_klub_url(self, page, naziv):
        """Return the hvs.hr club page URL matching ``naziv``, or None."""
        page.goto(f"{BASE_URL}/klubovi/", wait_until="domcontentloaded", timeout=20000)
        klub_links = page.locator('a[href*="/klub/"]').all()

        # Naive match: a link qualifies when its text contains any
        # distinctive word (longer than 3 chars, not generic) of the name.
        keywords = [
            kw for kw in naziv.lower().split()
            if len(kw) > 3 and kw not in _GENERIC_NAME_WORDS
        ]

        for link in klub_links[:_MAX_KLUB_LINKS]:
            text = link.inner_text().lower()
            href = link.get_attribute('href') or ''
            if not any(kw in text for kw in keywords):
                continue
            self.log(f" Match: {text[:50]} → {href}")
            found = _KLUB_ID_RE.search(href)
            if found:
                # Stop at the first match that carries a numeric club id.
                return f"{BASE_URL}/klub/{found.group(1)}/"
            # Matching text but no usable id in the href: keep looking.
        return None

    def _scrape_roster(self, page, klub_id, klub_url):
        """Visit the club page and upsert every player link found on it."""
        page.goto(klub_url, wait_until="domcontentloaded", timeout=15000)
        igrac_links = page.locator('a[href*="/igrac/"]').all()
        self.log(f" {len(igrac_links)} igrača found")

        for link in igrac_links[:_MAX_IGRAC_LINKS]:
            href = link.get_attribute('href') or ''
            naziv = link.inner_text().strip()
            found = _IGRAC_ID_RE.search(href)
            # Skip links without a numeric player id or without visible text.
            if not (found and naziv):
                continue

            # First whitespace-separated token is the first name; the rest
            # (if any) is treated as the surname.
            ime, *rest = naziv.split(None, 1)
            prezime = rest[0] if rest else ''

            self.upsert_clan(
                klub_id=klub_id,
                source_id=found.group(1),
                ime=ime,
                prezime=prezime,
                # urljoin leaves absolute URLs untouched and resolves
                # root-relative, protocol-relative and path-relative hrefs
                # against the club page.
                source_url=urljoin(klub_url, href),
            )
            self.stats['players'] += 1


if __name__ == '__main__':
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 50
    HVSHarvester().run(limit=limit)