#!/usr/bin/env python3
# hos_volleyball.py
# v1.0.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# HOS volleyball harvester: hos-cvf.hr competitions + standings,
# hos-web.dataproject.com match stats.
# Targets all 77 PGŽ volleyball clubs.
import sys, re, json, time
import html as ihtml
from datetime import datetime

import requests
from psycopg2.extras import RealDictCursor

sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester

UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

# First-cell labels that mark a header row rather than a standings row.
_HEADER_LABELS = ('', '#', 'Pos', 'Poz', 'R')
_HTML_FLAGS = re.DOTALL | re.IGNORECASE


def _http_get(url, retries=1):
    """Fetch *url* and return the response body as text.

    Up to ``retries + 1`` attempts are made, with a two-second pause between
    consecutive attempts. An attempt succeeds only when the status code is 200
    and the body is non-empty.

    Raises:
        RuntimeError: when every attempt failed; the message carries the last
            HTTP status or the text of the last request exception.
    """
    last = None
    attempts = retries + 1
    for attempt in range(attempts):
        try:
            r = requests.get(url, headers=HDR, timeout=25)
        except requests.RequestException as e:
            last = str(e)
        else:
            if r.status_code == 200 and r.text:
                return r.text
            last = f"HTTP {r.status_code}"
        # Pause only when another attempt follows; sleeping before the final
        # raise would just delay the error.
        if attempt < attempts - 1:
            time.sleep(2)
    raise RuntimeError(f"GET {url} failed: {last}")


def _strip_tags(s):
    """Remove markup tags from *s*, decode HTML entities and trim whitespace.

    ``None`` (or any falsy value) is treated as the empty string.
    """
    return ihtml.unescape(re.sub(r'<[^>]+>', '', s or '')).strip()


def _parse_standings_row(row_html):
    """Parse one ``<tr>`` body into a standings dict, or return ``None``.

    A row is accepted when it has at least five cells, its first cell starts
    with the position number (``"3"`` or ``"3."``), a non-numeric club name is
    present, and at least three integer cells follow the club name.
    """
    cells = [
        _strip_tags(c)
        for c in re.findall(r'<t[dh][^>]*>(.*?)</t[dh]>', row_html, _HTML_FLAGS)
    ]
    if len(cells) < 5 or cells[0] in _HEADER_LABELS:
        return None
    m = re.match(r'(\d+)\.?', cells[0])
    if not m:
        return None

    # Wide tables (7+ columns) may carry an extra cell (e.g. a logo) between
    # the position and the club name; use the third cell when it is not a number.
    klub_idx = 2 if (len(cells) >= 7 and not re.fullmatch(r'\d+', cells[2])) else 1
    klub = cells[klub_idx]
    if not klub or re.fullmatch(r'\d+', klub):
        return None

    # Only the integer cells *after* the club name are statistics. Skipping
    # "the first numeric cell" instead breaks when the position is written as
    # "1." (not purely numeric): games-played would be dropped and every
    # statistic shifted by one column.
    nums = [int(c) for c in cells[klub_idx + 1:] if re.fullmatch(r'-?\d+', c)]
    if len(nums) < 3:
        return None
    return {
        'poz': int(m.group(1)),
        'klub': klub,
        'uk': nums[0],
        'pob': nums[1],
        'por': nums[2],
        'bod': nums[-1],  # points are taken from the last numeric column
    }


def _parse_standings(html):
    """Return list of {poz, klub, uk, pob, por, bod} from first plausible table.

    Tables are scanned in document order; the first one yielding at least two
    valid standings rows wins. Returns ``[]`` when no table qualifies.
    """
    # NOTE(review): the tag names in these patterns (table / tr / td|th) are
    # the presumed intent — confirm against the hos-cvf.hr page markup.
    for tbl in re.findall(r'<table[^>]*>(.+?)</table>', html or '', _HTML_FLAGS):
        out = []
        for row in re.findall(r'<tr[^>]*>(.+?)</tr>', tbl, _HTML_FLAGS):
            parsed = _parse_standings_row(row)
            if parsed is not None:
                out.append(parsed)
        if len(out) >= 2:
            return out
    return []
def _parse_title(html):
    """Return the competition title found in *html*, or ``None``.

    The main heading is preferred when its text is longer than four
    characters; otherwise the document ``<title>`` is used.
    """
    # NOTE(review): the <h1>-then-<title> lookup order is the presumed intent —
    # confirm against the hos-cvf.hr page markup.
    m = re.search(r'<h1[^>]*>(.*?)</h1>', html, re.DOTALL | re.IGNORECASE)
    if m:
        t = _strip_tags(m.group(1))
        if t and len(t) > 4:
            return t
    m = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
    if m:
        return _strip_tags(m.group(1))
    return None


# (substring, label) pairs checked in order against the lowercased title;
# more specific keys must come before their prefixes ('superliga 2' first).
_RAZINA_KEYS = [
    ('superliga 2', 'Superliga 2'),
    ('superliga', 'Superliga'),
    ('1. liga', '1.liga'),
    ('1.liga', '1.liga'),
    ('2. liga', '2.liga'),
    ('2.liga', '2.liga'),
    ('3. liga', '3.liga'),
    ('3.liga', '3.liga'),
    ('kup', 'Kup'),
    ('kadeti', 'Kadeti'),
    ('kadetkinje', 'Kadetkinje'),
    ('juniori', 'Juniori'),
    ('juniorke', 'Juniorke'),
    ('mini', 'Mini'),
    ('beach', 'Beach'),
    ('pijesku', 'Beach'),
]

# One jersey number, then two or more capitalised name tokens, then the rest
# of the stat line.
_PLAYER_LINE_RE = re.compile(
    r'^(\d{1,3})\s+([A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+(?:\s+[A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+)+)\b(.*)$'
)


def _detect_razina_spol(title):
    """Derive ``(razina, spol)`` from a competition title.

    ``razina`` is the label of the first matching key in ``_RAZINA_KEYS`` (or
    ``None``); ``spol`` is ``'M'``, ``'Ž'`` or ``None``. Matching is done on
    the lowercased title.
    """
    t = (title or '').lower()
    razina = next((lab for key, lab in _RAZINA_KEYS if key in t), None)

    spol = None
    if re.search(r'\(\s*[mM]\s*\)|\bmu[šs]ki\b|\bmuska\b|\bjuniori\b|\bkadeti\b', t):
        spol = 'M'
    # The title is already lowercased, so the "(Ž)" marker appears as "(ž)";
    # an uppercase-only class would never match it.
    elif re.search(r'\(\s*[žz]\s*\)|\bžen|\bjuniorke\b|\bkadetkinje\b', t):
        spol = 'Ž'
    return razina, spol


class HOSHarvester(SportHarvester):
    """Harvester for volleyball competitions, standings and match statistics.

    A federation-wide preflight (``_harvest_federation``) walks the competition
    pages on hos-cvf.hr, stores competitions and standings, and remembers which
    clubs and DataProject match ids belong to which competition. The per-club
    pass (``scrape_klub``) then links clubs to those competitions and scrapes a
    bounded number of match-statistics pages.
    """

    SPORT = 'odbojka'
    SOURCE = 'hos'
    BASE_CVF = 'https://hos-cvf.hr'
    BASE_DP = 'https://hos-web.dataproject.com'
    SEZONA = '2025/26'
    MAX_NATJ = 80               # cap on competition pages visited per run
    MAX_MATCHES_PER_KLUB = 5    # cap on match pages yielding players, per club
    MAX_MATCHES_TOTAL = 120     # cap on match pages opened per run

    def __init__(self):
        super().__init__()
        # klub_id -> list of {natj_id, natj_naziv, url, klub_naziv, pozicija}
        self._natj_by_klub = {}
        # klub_id -> list of {mid, natj_id, natj_naziv}
        self._matches_for_klub = {}
        # match ids already opened, so no page is scraped twice
        self._dp_match_seen = set()
        self._matches_scraped_total = 0
        self.stats.setdefault('natjecanja', 0)
        self.stats.setdefault('tablice', 0)
        self.stats.setdefault('matches', 0)

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* volleyball clubs from the priority view as dict rows."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = 'odbojka'
                ORDER BY (financiran OR u_godisnjaku) DESC, id
                LIMIT %s
            """, (limit,))
            return cur.fetchall()

    def _discover_natjecanje_ids(self):
        """Return sorted competition ids linked from the hos-cvf.hr front page.

        At most ``MAX_NATJ`` ids are returned; a failed download is logged and
        yields an empty list.
        """
        try:
            html = _http_get(self.BASE_CVF + '/')
        except Exception as e:
            self.log(f"⚠ failed to load hos-cvf.hr: {e}")
            return []
        ids = sorted({int(m) for m in re.findall(r'natjecanje\.php\?id=(\d+)', html)})
        self.log(f" found {len(ids)} natjecanje ids on hos-cvf.hr")
        return ids[:self.MAX_NATJ]

    def _upsert_natjecanje(self, nid, naziv, razina, spol, source_url):
        """Insert or update one competition row and return its database id.

        Rows are keyed on ``(source, external_id)``; on conflict an existing
        ``razina`` / ``spol`` is kept when the new value is NULL.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.natjecanja
                    (sport, naziv, razina, sezona, spol, source, external_id,
                     external_url, source_id, source_url, status, updated_at)
                VALUES ('odbojka', %s, %s, %s, %s, 'hos_cvf', %s, %s, %s, %s,
                        'aktivno', now())
                ON CONFLICT (source, external_id) DO UPDATE SET
                    naziv = EXCLUDED.naziv,
                    razina = COALESCE(EXCLUDED.razina, pgz_sport.natjecanja.razina),
                    spol = COALESCE(EXCLUDED.spol, pgz_sport.natjecanja.spol),
                    sezona = EXCLUDED.sezona,
                    source_url = EXCLUDED.source_url,
                    external_url = EXCLUDED.external_url,
                    updated_at = now()
                RETURNING id
            """, (naziv, razina, self.SEZONA, spol, str(nid), source_url,
                  str(nid), source_url))
            return cur.fetchone()[0]

    def _find_klub_id(self, klub_naziv):
        """Resolve a club name from a standings table to a ``klubovi.id``.

        First tries a case-insensitive exact or substring match in SQL
        (active and PGŽ clubs preferred). Failing that, compares slug tokens
        longer than three characters against every active volleyball club and
        accepts the best one when all but at most one token match.
        Returns ``None`` when nothing qualifies.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, region FROM pgz_sport.klubovi
                WHERE sport = 'odbojka'
                  AND (LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s))
                ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
                         CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                         id
                LIMIT 1
            """, (klub_naziv, f"%{klub_naziv}%"))
            r = cur.fetchone()
            if r:
                return r[0]

        target = self.slugify(klub_naziv)
        toks = [t for t in target.split('-') if len(t) > 3]
        if not toks:
            return None

        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, naziv FROM pgz_sport.klubovi
                WHERE sport='odbojka' AND aktivan
            """)
            best = None
            best_score = 0
            for kid, knaz in cur.fetchall():
                kslug = self.slugify(knaz)
                score = sum(1 for t in toks if t in kslug)
                if score > best_score:
                    best_score = score
                    best = kid
        if best_score >= max(1, len(toks) - 1):
            return best
        return None

    def _replace_tablice(self, natj_id, source_url, rows, spol):
        """Replace the stored standings of one competition with *rows*.

        Existing 'hos_cvf' rows for the competition are deleted, then each
        parsed row is upserted. Rows whose club resolves to a known id are
        also recorded in ``self._natj_by_klub`` for the per-club pass.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                "DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'",
                (natj_id,),
            )
            for r in rows:
                klub_id = self._find_klub_id(r['klub'])
                # Draws and the goal columns are written as constant 0.
                cur.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano,
                         pobjede, nerijeseno, porazi, gol_z, gol_p, gol_razlika,
                         bodovi, source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
                            'hos_cvf', %s, now(), %s::jsonb)
                    ON CONFLICT (natjecanje_id, klub_naziv) DO UPDATE SET
                        pozicija = EXCLUDED.pozicija,
                        odigrano = EXCLUDED.odigrano,
                        pobjede = EXCLUDED.pobjede,
                        porazi = EXCLUDED.porazi,
                        bodovi = EXCLUDED.bodovi,
                        klub_id = COALESCE(EXCLUDED.klub_id, pgz_sport.natjecanja_tablice.klub_id),
                        source_url = EXCLUDED.source_url,
                        updated_at = now()
                """, (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'],
                      r['por'], r['bod'], source_url, json.dumps({'spol': spol})))
                if klub_id:
                    self._natj_by_klub.setdefault(klub_id, []).append({
                        'natj_id': natj_id,
                        'natj_naziv': None,  # filled in by _harvest_natjecanje
                        'url': source_url,
                        'klub_naziv': r['klub'],
                        'pozicija': r['poz'],
                    })

    def _harvest_natjecanje(self, nid):
        """Download one competition page and store its title, standings and match ids.

        A failed download is logged, counted in ``stats['errors']`` and skipped.
        """
        url = f"{self.BASE_CVF}/natjecanje.php?id={nid}"
        try:
            html = _http_get(url)
        except Exception as e:
            self.log(f" ⚠ natj {nid}: {e}")
            self.stats['errors'] += 1
            return

        title = _parse_title(html) or f"HOS natjecanje #{nid}"
        razina, spol = _detect_razina_spol(title)
        natj_id = self._upsert_natjecanje(nid, title, razina, spol, url)

        rows = _parse_standings(html)
        if rows:
            self._replace_tablice(natj_id, url, rows, spol)
            self.stats['tablice'] += len(rows)
            # _replace_tablice does not know the title; back-fill it here.
            for entries in self._natj_by_klub.values():
                for e in entries:
                    if e['natj_id'] == natj_id and e.get('natj_naziv') is None:
                        e['natj_naziv'] = title

        mids = sorted({
            int(m)
            for m in re.findall(r'MatchStatistics\.aspx\?mID=(\d+)', html, re.IGNORECASE)
        })
        if mids:
            # NOTE(review): every match id on the page is queued for every club
            # in this competition, whether or not the club played in it —
            # confirm this is intended.
            klub_ids_here = [
                kid for kid, entries in self._natj_by_klub.items()
                if any(e['natj_id'] == natj_id for e in entries)
            ]
            for kid in klub_ids_here:
                bucket = self._matches_for_klub.setdefault(kid, [])
                for mid in mids:
                    bucket.append({'mid': mid, 'natj_id': natj_id, 'natj_naziv': title})
        self.stats['natjecanja'] += 1

    def _harvest_federation(self):
        """Run the preflight: discover competition ids and harvest each one."""
        self.log("📋 preflight: hos-cvf.hr natjecanja discovery")
        for nid in self._discover_natjecanje_ids():
            self._harvest_natjecanje(nid)
        self.log(f" preflight done: natjecanja={self.stats['natjecanja']}, "
                 f"tablice={self.stats['tablice']}, klubova_with_match={len(self._natj_by_klub)}")

    @staticmethod
    def _split_player_name(fullname):
        """Split a player name into ``(ime, prezime)``.

        When the first token is all upper-case it is treated as the surname
        (and title-cased); otherwise the first token is the given name.
        """
        parts = fullname.split()
        if parts[0].isupper() and len(parts) >= 2:
            return ' '.join(parts[1:]), parts[0].title()
        return parts[0], (' '.join(parts[1:]) if len(parts) > 1 else '')

    def _scrape_dp_match(self, page, mid, klub_id, klub_naziv, natj_naziv):
        """Scrape one DataProject match-statistics page; return players stored.

        Returns 0 without opening the page when the match was already seen or
        the global ``MAX_MATCHES_TOTAL`` budget is used up. Every parsed player
        line is attributed to *klub_id*. Page-level failures are logged and
        counted in ``stats['errors']``; a failed player upsert is only logged.
        """
        if mid in self._dp_match_seen:
            return 0
        if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
            return 0

        url = f"{self.BASE_DP}/MatchStatistics.aspx?mID={mid}"
        added = 0
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=30000)
            try:
                page.wait_for_load_state('networkidle', timeout=10000)
            except Exception:
                # Best effort: carry on with whatever has rendered so far.
                pass
            self._dp_match_seen.add(mid)
            self._matches_scraped_total += 1
            self.stats['matches'] += 1

            # Try the most specific table selector first; the first selector
            # that returns any row text wins.
            rows = []
            for sel in ['table.statTbl tr', 'table.report tr', 'table tr']:
                try:
                    txts = page.locator(sel).all_inner_texts()
                except Exception:
                    txts = []
                if txts:
                    rows = txts
                    break

            for txt in rows:
                line = re.sub(r'\s+', ' ', txt.replace('\t', ' ')).strip()
                if not line:
                    continue
                m = _PLAYER_LINE_RE.match(line)
                if not m:
                    continue
                jersey = m.group(1)
                fullname = m.group(2).strip()
                tail = m.group(3).strip()
                nums = [int(x) for x in re.findall(r'-?\d+', tail)]
                if not nums:
                    continue
                # NOTE(review): column positions 0 / 5 / 7 are taken as points /
                # aces / blocks — confirm against the DataProject table layout.
                pts = nums[0]
                aces = nums[5] if len(nums) > 5 else None
                blocks = nums[7] if len(nums) > 7 else None

                ime, prezime = self._split_player_name(fullname)
                slug_key = self.slugify(fullname)
                source_id = f"dp:{mid}:{jersey}:{slug_key}"
                try:
                    clan_id = self.upsert_clan(
                        klub_id=klub_id,
                        source_id=source_id,
                        ime=ime,
                        prezime=prezime,
                        source_url=url,
                        kategorija='senior',
                        sezona=self.SEZONA,
                        extra={'dp_match_id': mid, 'jersey': jersey},
                    )
                    self.upsert_stats(
                        clan_id=clan_id,
                        sezona=self.SEZONA,
                        klub_id=klub_id,
                        klub_naziv=klub_naziv,
                        natjecanje=natj_naziv,
                        kategorija='senior',
                        stats_dict={
                            'nastupi': 1,
                            'bodovi': pts,
                            'servis_asovi': aces,
                            'blokade': blocks,
                        },
                        raw={'mid': mid, 'jersey': jersey, 'name': fullname,
                             'tail_nums': nums},
                    )
                    self.stats['players'] += 1
                    self.stats['stats'] += 1
                    added += 1
                except Exception as e:
                    self.log(f" ⚠ upsert player '{fullname}': {e}")
        except Exception as e:
            self.log(f" ⚠ dp match {mid}: {e}")
            self.stats['errors'] += 1
        return added

    def scrape_klub(self, page, klub):
        """Link one club to its harvested competitions and scrape its matches.

        *klub* is a row with at least ``id`` and ``naziv``. Competitions come
        from the preflight map, with a slug-token fallback when the club id
        was not matched directly. At most ``MAX_MATCHES_PER_KLUB`` matches
        that yield players are scraped for the club.
        """
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        entries = list(self._natj_by_klub.get(klub['id'], []))

        if not entries:
            # Fallback: take the first standings entry whose club name shares
            # all but at most one of this club's slug tokens.
            kslug = self.slugify(klub['naziv'])
            ktoks = [t for t in kslug.split('-') if len(t) > 3]
            if ktoks:
                needed = max(1, len(ktoks) - 1)
                for ents in list(self._natj_by_klub.values()):
                    for e in ents:
                        eslug = self.slugify(e['klub_naziv'])
                        if sum(1 for t in ktoks if t in eslug) >= needed:
                            entries.append(e)
                            break
                    if entries:
                        break

        if entries:
            first = entries[0]
            with self.conn.cursor() as cur:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET source_url = COALESCE(NULLIF(source_url, ''), %s),
                        source = COALESCE(source, 'hos_cvf'),
                        last_scraped_at = now()
                    WHERE id = %s
                """, (first['url'], klub['id']))
            naz_list = ', '.join(sorted({(e.get('natj_naziv') or '?') for e in entries}))[:120]
            self.log(f" ↳ {len(entries)} natjecanja: {naz_list}")
        else:
            self.log(" · no HOS natjecanje hit")

        match_bucket = self._matches_for_klub.get(klub['id'], [])
        if not match_bucket and entries:
            # Borrow the match list of any club recorded in the same competition.
            for kid, ents in self._natj_by_klub.items():
                if any(e['natj_id'] == entries[0]['natj_id'] for e in ents):
                    match_bucket = self._matches_for_klub.get(kid, [])
                    if match_bucket:
                        break

        scraped_for_klub = 0
        for m in match_bucket:
            if scraped_for_klub >= self.MAX_MATCHES_PER_KLUB:
                break
            if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
                break
            n = self._scrape_dp_match(page, m['mid'], klub['id'], klub['naziv'],
                                      m['natj_naziv'] or 'HOS')
            # Only matches that produced at least one player count toward the
            # per-club limit.
            if n > 0:
                scraped_for_klub += 1
        if scraped_for_klub:
            self.log(f" ↳ scraped {scraped_for_klub} match(es) from dataproject")

    def run(self, limit=999):
        """Run the federation preflight, then the base-class per-club loop."""
        self._harvest_federation()
        super().run(limit)


if __name__ == '__main__':
    HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 999)