1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
417 lines
17 KiB
Python
Executable File
417 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
# hos_volleyball.py
# v1.0.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# HOS volleyball harvester: competitions + standings from hos-cvf.hr, match stats from hos-web.dataproject.com.
# Targets all 77 PGŽ volleyball clubs.
|
|
|
|
import sys, re, json, time
|
|
import html as ihtml
|
|
from datetime import datetime
|
|
import requests
|
|
from psycopg2.extras import RealDictCursor
|
|
|
|
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
|
from __base import SportHarvester
|
|
|
|
|
|
# User-Agent string that identifies this harvester to the scraped sites.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Header set sent with every plain-HTTP request made by this module.
HDR = {"User-Agent": UA}
|
|
|
|
|
|
def _http_get(url, retries=1):
    """Fetch *url* with the module headers and return the body as text.

    Up to ``retries + 1`` attempts are made, with a two-second pause between
    consecutive attempts. An attempt counts as successful only when the
    server answers HTTP 200 with a non-empty body.

    Args:
        url: Absolute URL to fetch.
        retries: Number of additional attempts after the first one.

    Returns:
        The decoded response body.

    Raises:
        RuntimeError: When every attempt failed; the message carries the last
            HTTP status or the text of the last exception.
    """
    last = None
    for attempt in range(retries + 1):
        if attempt:
            # Pause only *between* attempts; sleeping after the final failure
            # would just delay the error by two seconds.
            time.sleep(2)
        try:
            r = requests.get(url, headers=HDR, timeout=25)
            if r.status_code == 200 and r.text:
                return r.text
            last = f"HTTP {r.status_code}"
        except Exception as e:
            # Deliberately broad: any failure of this attempt is recorded and
            # the next attempt (if any) is tried.
            last = str(e)
    raise RuntimeError(f"GET {url} failed: {last}")
|
|
|
|
|
|
def _strip_tags(s):
|
|
return ihtml.unescape(re.sub(r'<[^>]+>', '', s or '')).strip()
|
|
|
|
|
|
def _parse_standings(html):
    """Return list of {poz, klub, uk, pob, por, bod} from first plausible table.

    Every ``<table>`` in *html* is scanned in document order. A row is kept
    when it has at least five cells, its first cell starts with digits (the
    position, optionally followed by a dot) and a non-numeric club name is
    found. The club name is the third cell when the row has seven or more
    cells and that cell is not a bare integer, otherwise the second cell.

    The statistics are the bare-integer cells that follow the club-name cell:
    the first three are ``uk``, ``pob`` and ``por`` and the last one is
    ``bod``. Cells such as set ratios ("30:5") are not bare integers and are
    ignored. At least three such cells are required.

    The first table that yields two or more rows wins; an empty list is
    returned when no table qualifies.
    """
    flags = re.DOTALL | re.IGNORECASE
    for tbl in re.findall(r'<table[^>]*>(.+?)</table>', html, flags):
        out = []
        for row in re.findall(r'<tr[^>]*>(.+?)</tr>', tbl, flags):
            cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, flags)
            clean = [_strip_tags(c) for c in cells]
            if len(clean) < 5:
                continue
            # Header rows ('#', 'Pos', 'Poz', ...) and empty first cells do
            # not start with a digit and are rejected here.
            m = re.match(r'(\d+)\.?', clean[0])
            if not m:
                continue
            klub_idx = 2 if (len(clean) >= 7 and not re.match(r'^\d+$', clean[2])) else 1
            klub = clean[klub_idx]
            if not klub or re.match(r'^\d+$', klub):
                continue
            # Take the numbers *after* the club name. Skipping "the first
            # numeric cell" instead would drop the played-count whenever the
            # position is written as "1." (not a bare integer) and shift
            # every statistic by one column.
            numeric = [c for c in clean[klub_idx + 1:] if re.match(r'^-?\d+$', c)]
            if len(numeric) < 3:
                continue
            try:
                out.append({
                    'poz': int(m.group(1)),
                    'klub': klub,
                    'uk': int(numeric[0]),
                    'pob': int(numeric[1]),
                    'por': int(numeric[2]),
                    'bod': int(numeric[-1]),
                })
            except ValueError:
                # Only reachable for absurdly long digit strings; skip the row.
                continue
        if len(out) >= 2:
            return out
    return []
|
|
|
|
|
|
def _parse_title(html):
    """Return a human-readable title for the page, or ``None``.

    The text of the first ``<h1>`` is used when it is longer than four
    characters after tag stripping. Otherwise the text of the first
    ``<title>`` element is returned as-is (it may be an empty string).
    ``None`` is returned only when no usable ``<h1>`` and no ``<title>``
    element exist.
    """
    flags = re.DOTALL | re.IGNORECASE

    heading = re.search(r'<h1[^>]*>(.*?)</h1>', html, flags)
    if heading is not None:
        heading_text = _strip_tags(heading.group(1))
        if len(heading_text) > 4:
            return heading_text

    page_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if page_title is None:
        return None
    return _strip_tags(page_title.group(1))
|
|
|
|
|
|
def _detect_razina_spol(title):
|
|
t = (title or '').lower()
|
|
razina = None
|
|
for key, lab in [
|
|
('superliga 2', 'Superliga 2'),
|
|
('superliga', 'Superliga'),
|
|
('1. liga', '1.liga'), ('1.liga', '1.liga'),
|
|
('2. liga', '2.liga'), ('2.liga', '2.liga'),
|
|
('3. liga', '3.liga'), ('3.liga', '3.liga'),
|
|
('kup', 'Kup'),
|
|
('kadeti', 'Kadeti'), ('kadetkinje', 'Kadetkinje'),
|
|
('juniori', 'Juniori'), ('juniorke', 'Juniorke'),
|
|
('mini', 'Mini'),
|
|
('beach', 'Beach'), ('pijesku', 'Beach'),
|
|
]:
|
|
if key in t:
|
|
razina = lab
|
|
break
|
|
spol = None
|
|
if re.search(r'\(\s*[mM]\s*\)|\bmu[šs]ki\b|\bmuska\b|\bjuniori\b|\bkadeti\b', t):
|
|
spol = 'M'
|
|
elif re.search(r'\(\s*[ŽzZ]\s*\)|\bžen|\bjuniorke\b|\bkadetkinje\b', t):
|
|
spol = 'Ž'
|
|
return razina, spol
|
|
|
|
|
|
class HOSHarvester(SportHarvester):
    """HOS volleyball harvester built on the project's ``SportHarvester``.

    :meth:`run` works in two phases:

    1. A federation "preflight" (:meth:`_harvest_federation`) discovers
       competition ids on hos-cvf.hr, stores each competition and its
       standings table, and remembers which clubs appear where and which
       DataProject match ids each competition page links to.
    2. The base-class ``run`` is then invoked. It is presumably the one that
       calls :meth:`get_target_klubovi` and :meth:`scrape_klub` per club --
       the base class is not visible here, confirm against ``__base``.

    The methods rely on ``self.conn``, ``self.log``, ``self.stats``,
    ``self.slugify``, ``self.upsert_clan`` and ``self.upsert_stats`` being
    provided by the base class.
    """

    SPORT = 'odbojka'
    SOURCE = 'hos'

    BASE_CVF = 'https://hos-cvf.hr'
    BASE_DP = 'https://hos-web.dataproject.com'
    SEZONA = '2025/26'
    # Upper bound on competitions harvested during the preflight.
    MAX_NATJ = 80
    # Upper bound on matches per club that yielded at least one player row.
    MAX_MATCHES_PER_KLUB = 5
    # Upper bound on DataProject match pages opened during one run.
    MAX_MATCHES_TOTAL = 120

    # "<jersey> <Name> <Name> [...] <rest>" as found in a statistics table row.
    # Compiled once because it is applied to every row of every match page.
    _DP_PLAYER_ROW = re.compile(
        r'^(\d{1,3})\s+([A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+(?:\s+[A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+)+)\b(.*)$'
    )

    def __init__(self):
        """Initialise the base harvester and this run's in-memory indexes."""
        super().__init__()
        # klub id -> standings entries, one per competition the club is in.
        self._natj_by_klub = {}
        # klub id -> candidate matches ({'mid', 'natj_id', 'natj_naziv'}).
        self._matches_for_klub = {}
        # DataProject match ids already opened in this run.
        self._dp_match_seen = set()
        self._matches_scraped_total = 0
        # Scraped club name -> resolved klub id (or None), see _find_klub_id.
        self._klub_id_cache = {}
        for counter in ('natjecanja', 'tablice', 'matches'):
            self.stats.setdefault(counter, 0)

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* volleyball rows from the priority-club view.

        Rows come from ``pgz_sport.v_pgz_priority_klubovi`` as dict-like
        records, with financed / yearbook-listed clubs first, then by id.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = 'odbojka'
                ORDER BY (financiran OR u_godisnjaku) DESC, id
                LIMIT %s
            """, (limit,))
            return cur.fetchall()

    def _discover_natjecanje_ids(self):
        """Return the sorted competition ids linked from the hos-cvf.hr home page.

        At most ``MAX_NATJ`` ids are returned. A failed download is logged
        and results in an empty list rather than an exception.
        """
        try:
            html = _http_get(self.BASE_CVF + '/')
        except Exception as e:
            self.log(f"⚠ failed to load hos-cvf.hr: {e}")
            return []
        ids = sorted({int(m) for m in re.findall(r'natjecanje\.php\?id=(\d+)', html)})
        self.log(f" found {len(ids)} natjecanje ids on hos-cvf.hr")
        return ids[:self.MAX_NATJ]

    def _upsert_natjecanje(self, nid, naziv, razina, spol, source_url):
        """Insert or update one competition row and return its database id.

        The row is keyed on ``(source, external_id)`` with source
        ``'hos_cvf'``. On conflict the name, season and URLs are overwritten,
        while ``razina`` and ``spol`` are only replaced by non-NULL values.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.natjecanja
                    (sport, naziv, razina, sezona, spol, source, external_id, external_url,
                     source_id, source_url, status, updated_at)
                VALUES ('odbojka', %s, %s, %s, %s, 'hos_cvf', %s, %s, %s, %s, 'aktivno', now())
                ON CONFLICT (source, external_id) DO UPDATE
                SET naziv = EXCLUDED.naziv,
                    razina = COALESCE(EXCLUDED.razina, pgz_sport.natjecanja.razina),
                    spol = COALESCE(EXCLUDED.spol, pgz_sport.natjecanja.spol),
                    sezona = EXCLUDED.sezona,
                    source_url = EXCLUDED.source_url,
                    external_url = EXCLUDED.external_url,
                    updated_at = now()
                RETURNING id
            """, (naziv, razina, self.SEZONA, spol, str(nid), source_url, str(nid), source_url))
            return cur.fetchone()[0]

    def _name_tokens(self, naziv):
        """Return the slug tokens of *naziv* that are longer than three characters."""
        return [tok for tok in self.slugify(naziv).split('-') if len(tok) > 3]

    @staticmethod
    def _tokens_match(tokens, slug):
        """Return True when all but at most one of *tokens* occur in *slug*.

        At least one token must occur.
        NOTE(review): generic tokens (e.g. the word for "club") count like
        any other token, so two different clubs sharing such words may pass
        this test -- confirm whether a stop-word list is wanted.
        """
        hits = sum(1 for tok in tokens if tok in slug)
        return hits >= max(1, len(tokens) - 1)

    def _find_klub_id(self, klub_naziv):
        """Resolve a scraped club name to a ``pgz_sport.klubovi`` id, or ``None``.

        Results (including misses) are memoised per harvester instance,
        because the same club name recurs across many standings tables.
        """
        if not klub_naziv:
            # An empty name would turn the LIKE pattern into '%%' and match
            # an arbitrary club.
            return None
        if klub_naziv not in self._klub_id_cache:
            self._klub_id_cache[klub_naziv] = self._lookup_klub_id(klub_naziv)
        return self._klub_id_cache[klub_naziv]

    def _lookup_klub_id(self, klub_naziv):
        """Uncached lookup behind :meth:`_find_klub_id`.

        First a case-insensitive exact-or-substring match is tried, preferring
        an exact name, then active clubs, then region 'PGŽ', then the lowest
        id. If nothing matches, active clubs are scored by how many of the
        name's slug tokens appear in their slug; the best one is accepted
        when it misses at most one token.
        """
        # The name is scraped text: neutralise LIKE metacharacters so that a
        # literal '%' or '_' in a club name cannot act as a wildcard.
        like_safe = (klub_naziv.replace('\\', '\\\\')
                     .replace('%', '\\%')
                     .replace('_', '\\_'))
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, region FROM pgz_sport.klubovi
                WHERE sport = 'odbojka'
                  AND (LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s))
                ORDER BY CASE WHEN LOWER(naziv) = LOWER(%s) THEN 0 ELSE 1 END,
                         CASE WHEN aktivan THEN 0 ELSE 1 END,
                         CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                         id
                LIMIT 1
            """, (klub_naziv, f"%{like_safe}%", klub_naziv))
            r = cur.fetchone()
            if r:
                return r[0]

        toks = self._name_tokens(klub_naziv)
        if not toks:
            return None
        with self.conn.cursor() as cur:
            # ORDER BY makes the winner among equally scored clubs stable.
            cur.execute("""
                SELECT id, naziv FROM pgz_sport.klubovi
                WHERE sport='odbojka' AND aktivan
                ORDER BY id
            """)
            candidates = cur.fetchall()
        best = None
        best_score = 0
        for kid, knaz in candidates:
            kslug = self.slugify(knaz)
            score = sum(1 for tok in toks if tok in kslug)
            if score > best_score:
                best_score = score
                best = kid
        if best_score >= max(1, len(toks) - 1):
            return best
        return None

    def _replace_tablice(self, natj_id, source_url, rows, spol, natj_naziv=None):
        """Replace the stored standings of one competition with *rows*.

        Existing ``'hos_cvf'`` rows of the competition are deleted and each
        parsed row is inserted (or updated on a name conflict). Draws and the
        goal columns are always written as 0. Every row whose club could be
        resolved is also recorded in ``self._natj_by_klub``.

        Args:
            natj_id: Database id of the competition.
            source_url: Page the standings were read from.
            rows: Dicts with keys ``poz, klub, uk, pob, por, bod``.
            spol: Gender marker stored in ``extra_data``.
            natj_naziv: Competition title stored with each recorded entry.
        """
        with self.conn.cursor() as cur:
            cur.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
            for r in rows:
                klub_id = self._find_klub_id(r['klub'])
                cur.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                         nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi,
                         source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
                            'hos_cvf', %s, now(), %s::jsonb)
                    ON CONFLICT (natjecanje_id, klub_naziv) DO UPDATE SET
                        pozicija = EXCLUDED.pozicija,
                        odigrano = EXCLUDED.odigrano,
                        pobjede = EXCLUDED.pobjede,
                        porazi = EXCLUDED.porazi,
                        bodovi = EXCLUDED.bodovi,
                        klub_id = COALESCE(EXCLUDED.klub_id, pgz_sport.natjecanja_tablice.klub_id),
                        source_url = EXCLUDED.source_url,
                        updated_at = now()
                """, (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['por'],
                      r['bod'], source_url, json.dumps({'spol': spol})))
                if klub_id:
                    self._natj_by_klub.setdefault(klub_id, []).append({
                        'natj_id': natj_id,
                        'natj_naziv': natj_naziv,
                        'url': source_url,
                        'klub_naziv': r['klub'],
                        'pozicija': r['poz'],
                    })

    def _harvest_natjecanje(self, nid):
        """Download one competition page and store everything found on it.

        Stores the competition, its standings (if a table was recognised) and
        queues every linked DataProject match id for each resolved club of
        the competition. A failed download is logged, counted in
        ``stats['errors']`` and otherwise ignored.
        """
        url = f"{self.BASE_CVF}/natjecanje.php?id={nid}"
        try:
            html = _http_get(url)
        except Exception as e:
            self.log(f" ⚠ natj {nid}: {e}")
            self.stats['errors'] += 1
            return
        title = _parse_title(html) or f"HOS natjecanje #{nid}"
        razina, spol = _detect_razina_spol(title)
        natj_id = self._upsert_natjecanje(nid, title, razina, spol, url)

        rows = _parse_standings(html)
        if rows:
            # The title is passed along so entries are complete when created,
            # instead of being patched afterwards by scanning every entry.
            self._replace_tablice(natj_id, url, rows, spol, natj_naziv=title)
            self.stats['tablice'] += len(rows)

        mids = sorted({int(m) for m in re.findall(r'MatchStatistics\.aspx\?mID=(\d+)', html, re.IGNORECASE)})
        if mids:
            # NOTE(review): every club of the competition receives *all*
            # match ids of the page, not only the matches it played in.
            for kid, entries in self._natj_by_klub.items():
                if any(e['natj_id'] == natj_id for e in entries):
                    bucket = self._matches_for_klub.setdefault(kid, [])
                    bucket.extend(
                        {'mid': mid, 'natj_id': natj_id, 'natj_naziv': title}
                        for mid in mids
                    )
        self.stats['natjecanja'] += 1

    def _harvest_federation(self):
        """Run the preflight: discover and harvest all competitions."""
        self.log("📋 preflight: hos-cvf.hr natjecanja discovery")
        for nid in self._discover_natjecanje_ids():
            self._harvest_natjecanje(nid)
        self.log(f" preflight done: natjecanja={self.stats['natjecanja']}, "
                 f"tablice={self.stats['tablice']}, klubova_with_match={len(self._natj_by_klub)}")

    @staticmethod
    def _dp_row_texts(page):
        """Return the row texts of the first selector that yields any rows.

        Selectors are tried from most to least specific; a selector that
        raises is treated as yielding nothing.
        """
        for sel in ['table.statTbl tr', 'table.report tr', 'table tr']:
            try:
                txts = page.locator(sel).all_inner_texts()
            except Exception:
                txts = []
            if txts:
                return txts
        return []

    @classmethod
    def _parse_dp_player_row(cls, txt):
        """Parse one table-row text into player fields, or return ``None``.

        A player row starts with a 1-3 digit jersey number followed by two or
        more capitalised name words; at least one integer must follow the
        name. Returns a dict with ``jersey``, ``fullname``, ``ime``,
        ``prezime`` and ``nums`` (all integers found after the name).
        """
        # \s+ already covers tabs and newlines.
        line = re.sub(r'\s+', ' ', txt).strip()
        if not line:
            return None
        m = cls._DP_PLAYER_ROW.match(line)
        if not m:
            return None
        fullname = m.group(2).strip()
        nums = [int(x) for x in re.findall(r'-?\d+', m.group(3))]
        if not nums:
            return None
        # The pattern guarantees at least two name words.
        parts = fullname.split()
        if parts[0].isupper():
            # "SURNAME Given" layout: first word is the upper-cased surname.
            prezime = parts[0].title()
            ime = ' '.join(parts[1:])
        else:
            ime = parts[0]
            prezime = ' '.join(parts[1:])
        return {
            'jersey': m.group(1),
            'fullname': fullname,
            'ime': ime,
            'prezime': prezime,
            'nums': nums,
        }

    def _scrape_dp_match(self, page, mid, klub_id, klub_naziv, natj_naziv):
        """Open one DataProject match page and store its player rows.

        Each match id is processed at most once per run and the run-wide
        ``MAX_MATCHES_TOTAL`` cap is honoured. Every recognised player row is
        written through ``upsert_clan`` / ``upsert_stats`` for *klub_id*.

        NOTE(review): all player rows on the page are attributed to
        *klub_id*; nothing here separates the two teams -- confirm intent.
        NOTE(review): points, aces and blocks are taken from fixed positions
        (0, 5, 7) of the integers following the name -- confirm against the
        actual column layout.

        Args:
            page: Browser page object exposing ``goto``,
                ``wait_for_load_state`` and ``locator``.
            mid: DataProject match id.
            klub_id: Club the players are stored under.
            klub_naziv: Club name stored with the statistics.
            natj_naziv: Competition name stored with the statistics.

        Returns:
            Number of player rows stored (0 when skipped or on failure).
        """
        if mid in self._dp_match_seen:
            return 0
        if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
            return 0
        url = f"{self.BASE_DP}/MatchStatistics.aspx?mID={mid}"
        added = 0
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=30000)
            try:
                page.wait_for_load_state('networkidle', timeout=10000)
            except Exception:
                # Best effort: if the page never goes idle, parse whatever
                # has been rendered so far.
                pass
            self._dp_match_seen.add(mid)
            self._matches_scraped_total += 1
            self.stats['matches'] += 1

            for txt in self._dp_row_texts(page):
                player = self._parse_dp_player_row(txt)
                if player is None:
                    continue
                nums = player['nums']
                jersey = player['jersey']
                fullname = player['fullname']
                pts = nums[0]
                aces = nums[5] if len(nums) > 5 else None
                blocks = nums[7] if len(nums) > 7 else None
                slug_key = self.slugify(fullname)
                source_id = f"dp:{mid}:{jersey}:{slug_key}"
                try:
                    clan_id = self.upsert_clan(
                        klub_id=klub_id, source_id=source_id,
                        ime=player['ime'], prezime=player['prezime'],
                        source_url=url, kategorija='senior', sezona=self.SEZONA,
                        extra={'dp_match_id': mid, 'jersey': jersey},
                    )
                    self.upsert_stats(
                        clan_id=clan_id, sezona=self.SEZONA,
                        klub_id=klub_id, klub_naziv=klub_naziv,
                        natjecanje=natj_naziv, kategorija='senior',
                        stats_dict={
                            'nastupi': 1,
                            'bodovi': pts,
                            'servis_asovi': aces,
                            'blokade': blocks,
                        },
                        raw={'mid': mid, 'jersey': jersey, 'name': fullname, 'tail_nums': nums},
                    )
                    self.stats['players'] += 1
                    self.stats['stats'] += 1
                    added += 1
                except Exception as e:
                    # One bad player row must not abort the rest of the match.
                    self.log(f" ⚠ upsert player '{fullname}': {e}")
        except Exception as e:
            self.log(f" ⚠ dp match {mid}: {e}")
            self.stats['errors'] += 1
        return added

    def scrape_klub(self, page, klub):
        """Process one target club using the data gathered in the preflight.

        Looks up the club's competition entries (by id, else by fuzzy name
        match against the recorded standings names), stamps the club row
        with a source URL and scrape time when entries exist, and then
        scrapes queued DataProject matches within the per-club and run-wide
        limits.

        Args:
            page: Browser page object handed to :meth:`_scrape_dp_match`.
            klub: Mapping with at least ``'id'`` and ``'naziv'``.
        """
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        entries = list(self._natj_by_klub.get(klub['id'], []))
        if not entries:
            # No direct hit: borrow the first recorded entry whose scraped
            # club name resembles this club's name.
            ktoks = self._name_tokens(klub['naziv'])
            if ktoks:
                for ents in self._natj_by_klub.values():
                    similar = next(
                        (e for e in ents
                         if self._tokens_match(ktoks, self.slugify(e['klub_naziv']))),
                        None,
                    )
                    if similar is not None:
                        entries.append(similar)
                        break

        if entries:
            first = entries[0]
            with self.conn.cursor() as cur:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET source_url = COALESCE(NULLIF(source_url, ''), %s),
                        source = COALESCE(source, 'hos_cvf'),
                        last_scraped_at = now()
                    WHERE id = %s
                """, (first['url'], klub['id']))
            naz_list = ', '.join(sorted({(e.get('natj_naziv') or '?') for e in entries}))[:120]
            self.log(f" ↳ {len(entries)} natjecanja: {naz_list}")
        else:
            self.log(" · no HOS natjecanje hit")

        match_bucket = self._matches_for_klub.get(klub['id'], [])
        if not match_bucket and entries:
            # Fall back to the matches queued for any other club that plays
            # in the same competition as this club's first entry.
            wanted_natj = entries[0]['natj_id']
            for kid, ents in self._natj_by_klub.items():
                if any(e['natj_id'] == wanted_natj for e in ents):
                    match_bucket = self._matches_for_klub.get(kid, [])
                    if match_bucket:
                        break

        scraped_for_klub = 0
        for m in match_bucket:
            if scraped_for_klub >= self.MAX_MATCHES_PER_KLUB:
                break
            if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
                break
            n = self._scrape_dp_match(page, m['mid'], klub['id'], klub['naziv'], m['natj_naziv'] or 'HOS')
            if n > 0:
                scraped_for_klub += 1
        if scraped_for_klub:
            self.log(f" ↳ scraped {scraped_for_klub} match(es) from dataproject")

    def run(self, limit=999):
        """Run the federation preflight, then the base-class harvest loop."""
        self._harvest_federation()
        super().run(limit)
|
|
|
|
|
|
if __name__ == '__main__':
    # Optional first command-line argument: maximum number of clubs to process.
    cli_args = sys.argv[1:]
    club_limit = int(cli_args[0]) if cli_args else 999
    HOSHarvester().run(limit=club_limit)
|