Files
damir 1d02c0897d Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
2026-05-05 13:08:11 +02:00

417 lines
17 KiB
Python
Executable File

#!/usr/bin/env python3
# hos_volleyball.py
# v1.0.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# HOS odbojka harvester: hos-cvf.hr natjecanja + standings, hos-web.dataproject.com match stats.
# Targets all 77 PGŽ odbojka klubova.
import sys, re, json, time
import html as ihtml
from datetime import datetime
import requests
from psycopg2.extras import RealDictCursor
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
# User-Agent string identifying this harvester to the remote sites.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Header dict passed to requests.get() by _http_get; carries only the User-Agent.
HDR = {"User-Agent": UA}
def _http_get(url, retries=1):
    """Fetch *url* over HTTP and return the response body as text.

    The request is attempted ``retries + 1`` times, pausing 2 seconds between
    consecutive attempts (but not after the last one). An attempt counts as
    successful only when the server answers HTTP 200 with a non-empty body.

    Args:
        url: Absolute URL to fetch.
        retries: Number of additional attempts after the first one.

    Returns:
        The response body (``Response.text``) of the first successful attempt.

    Raises:
        RuntimeError: If no attempt succeeded. The message contains the last
            observed HTTP status or transport error text.
    """
    last = None
    for attempt in range(retries + 1):
        if attempt:
            # Back off only *between* attempts; sleeping after the final
            # failure would just delay the RuntimeError below.
            time.sleep(2)
        try:
            r = requests.get(url, headers=HDR, timeout=25)
        except requests.RequestException as e:
            # Transport-level failure (DNS, timeout, connection reset, ...).
            last = str(e)
            continue
        if r.status_code == 200 and r.text:
            return r.text
        last = f"HTTP {r.status_code}"
    raise RuntimeError(f"GET {url} failed: {last}")
def _strip_tags(s):
    """Return *s* with HTML tags removed, entities decoded and edges trimmed.

    A falsy *s* (``None``, ``''``) is treated as the empty string.
    """
    raw = s if s else ''
    without_tags = re.sub(r'<[^>]+>', '', raw)
    decoded = ihtml.unescape(without_tags)
    return decoded.strip()
def _parse_standings(html):
    """Extract league standings from the first plausible ``<table>`` in *html*.

    A row is accepted when it has at least five cells, its first cell starts
    with a position number (``"3"`` or ``"3."``), a non-numeric club name can
    be found, and at least three integer cells follow the position cell.
    A table is "plausible" when it yields two or more such rows.

    Args:
        html: Raw HTML of a competition page.

    Returns:
        A list of dicts with keys ``poz`` (position), ``klub`` (club name),
        ``uk`` (played), ``pob`` (wins), ``por`` (losses) and ``bod``
        (points), taken from the first plausible table; ``[]`` if none.
    """
    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL | re.IGNORECASE)
    for tbl in tables:
        rows = re.findall(r'<tr[^>]*>(.+?)</tr>', tbl, re.DOTALL | re.IGNORECASE)
        out = []
        for row in rows:
            cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
            clean = [_strip_tags(c) for c in cells]
            if len(clean) < 5:
                continue
            # Header cells ('#', 'Poz', ...) and blanks do not start with a
            # digit, so this single check also filters header rows.
            m = re.match(r'(\d+)\.?', clean[0])
            if not m:
                continue
            poz = int(m.group(1))
            # Wide tables (>= 7 cells) may carry a logo/code cell before the
            # name; prefer the third cell there unless it is purely numeric.
            if len(clean) >= 7 and not re.fullmatch(r'\d+', clean[2]):
                klub = clean[2]
            else:
                klub = clean[1]
            if not klub or re.fullmatch(r'\d+', klub):
                continue
            # Collect the integer cells AFTER the position cell. Looking only
            # at clean[1:] keeps the columns aligned whether the position is
            # written "1" or "1." (the latter is not a bare integer).
            stats = [int(c) for c in clean[1:] if re.fullmatch(r'-?\d+', c)]
            if len(stats) < 3:
                continue
            out.append({
                'poz': poz,
                'klub': klub,
                'uk': stats[0],
                'pob': stats[1],
                'por': stats[2],
                'bod': stats[-1],  # points are taken from the last numeric column
            })
        if len(out) >= 2:
            return out
    return []
def _parse_title(html):
    """Return a human-readable page title extracted from *html*.

    The first ``<h1>`` is preferred when its text is longer than four
    characters; otherwise the ``<title>`` element is used. Returns ``None``
    when neither source is usable.
    """
    flags = re.DOTALL | re.IGNORECASE
    heading_match = re.search(r'<h1[^>]*>(.*?)</h1>', html, flags)
    if heading_match is not None:
        heading = _strip_tags(heading_match.group(1))
        # Very short headings are treated as uninformative.
        if len(heading) > 4:
            return heading
    title_match = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if title_match is None:
        return None
    return _strip_tags(title_match.group(1))
def _detect_razina_spol(title):
    """Guess competition level and gender from a competition title.

    Args:
        title: Competition title; ``None`` is treated as an empty string.

    Returns:
        A ``(razina, spol)`` tuple. ``razina`` is a level label such as
        ``'Superliga'`` or ``'1.liga'`` (``None`` if no keyword matched);
        ``spol`` is ``'M'``, ``'Ž'`` or ``None``.
    """
    t = (title or '').lower()
    # Order matters: the first substring hit wins, so the more specific
    # 'superliga 2' must precede 'superliga'.
    levels = (
        ('superliga 2', 'Superliga 2'),
        ('superliga', 'Superliga'),
        ('1. liga', '1.liga'), ('1.liga', '1.liga'),
        ('2. liga', '2.liga'), ('2.liga', '2.liga'),
        ('3. liga', '3.liga'), ('3.liga', '3.liga'),
        ('kup', 'Kup'),
        ('kadeti', 'Kadeti'), ('kadetkinje', 'Kadetkinje'),
        ('juniori', 'Juniori'), ('juniorke', 'Juniorke'),
        ('mini', 'Mini'),
        ('beach', 'Beach'), ('pijesku', 'Beach'),
    )
    razina = next((lab for key, lab in levels if key in t), None)
    # `t` is lower-cased, so the patterns must be lower-case as well; the
    # women's marker therefore needs 'ž' (not 'Ž') to recognise "(Ž)".
    if re.search(r'\(\s*m\s*\)|\bmu[šs]ki\b|\bmuska\b|\bjuniori\b|\bkadeti\b', t):
        spol = 'M'
    elif re.search(r'\(\s*[žz]\s*\)|\bžen|\bjuniorke\b|\bkadetkinje\b', t):
        spol = 'Ž'
    else:
        spol = None
    return razina, spol
class HOSHarvester(SportHarvester):
    """Harvest HOS (Croatian volleyball) competitions, standings and match stats.

    ``run()`` works in two phases:

    1. ``_harvest_federation()`` fetches competition pages from hos-cvf.hr
       over plain HTTP, upserts ``pgz_sport.natjecanja`` and
       ``pgz_sport.natjecanja_tablice`` and remembers, per matched club,
       which competitions and dataproject match ids were seen.
    2. The base-class ``run()`` is invoked afterwards. It presumably iterates
       ``get_target_klubovi()`` and calls ``scrape_klub()`` with a browser
       page -- that driver lives in ``SportHarvester`` and is not visible
       here (TODO confirm).

    NOTE(review): ``self.conn``, ``self.stats``, ``self.log``,
    ``self.slugify``, ``self.upsert_clan`` and ``self.upsert_stats`` are used
    but not defined in this class; presumably inherited from
    ``SportHarvester``. No commit is issued here, so transaction handling is
    assumed to happen in the base class or via autocommit -- verify.
    """

    SPORT = 'odbojka'
    SOURCE = 'hos'
    BASE_CVF = 'https://hos-cvf.hr'
    BASE_DP = 'https://hos-web.dataproject.com'
    SEZONA = '2025/26'
    # Upper bound on competition pages fetched from hos-cvf.hr per run.
    MAX_NATJ = 80
    # Upper bound on dataproject matches that yield players, per club.
    MAX_MATCHES_PER_KLUB = 5
    # Upper bound on dataproject match pages opened per run, across all clubs.
    MAX_MATCHES_TOTAL = 120

    # One player line of a match-statistics table after whitespace has been
    # collapsed: jersey number, a name of two or more capitalised words, then
    # the remaining text holding the numeric columns.
    _DP_ROW_RE = re.compile(
        r'^(\d{1,3})\s+([A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+(?:\s+[A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+)+)\b(.*)$'
    )

    def __init__(self):
        """Initialise the base harvester and this run's in-memory indexes."""
        super().__init__()
        # klub_id -> list of {'natj_id', 'natj_naziv', 'url', 'klub_naziv', 'pozicija'}
        self._natj_by_klub = {}
        # klub_id -> list of {'mid', 'natj_id', 'natj_naziv'}
        self._matches_for_klub = {}
        # dataproject match ids already opened during this run
        self._dp_match_seen = set()
        self._matches_scraped_total = 0
        # Counters specific to this harvester; other keys used below
        # ('errors', 'players', 'stats') are presumably set up by the base class.
        for counter in ('natjecanja', 'tablice', 'matches'):
            self.stats.setdefault(counter, 0)

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* volleyball clubs from the priority view.

        Rows come from ``pgz_sport.v_pgz_priority_klubovi`` as dict-like
        records (``RealDictCursor``), clubs flagged ``financiran`` or
        ``u_godisnjaku`` first, then by ``id``.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = 'odbojka'
                ORDER BY (financiran OR u_godisnjaku) DESC, id
                LIMIT %s
            """, (limit,))
            return cur.fetchall()

    def _discover_natjecanje_ids(self):
        """Return competition ids linked from the hos-cvf.hr front page.

        Ids are de-duplicated, sorted ascending and truncated to
        ``MAX_NATJ``. Returns ``[]`` (after logging) when the page cannot be
        loaded.
        """
        try:
            html = _http_get(self.BASE_CVF + '/')
        except Exception as e:
            self.log(f"⚠ failed to load hos-cvf.hr: {e}")
            return []
        ids = sorted({int(m) for m in re.findall(r'natjecanje\.php\?id=(\d+)', html)})
        self.log(f" found {len(ids)} natjecanje ids on hos-cvf.hr")
        return ids[:self.MAX_NATJ]

    def _upsert_natjecanje(self, nid, naziv, razina, spol, source_url):
        """Insert or update one competition row and return its database id.

        The row is keyed on ``(source, external_id)`` with source
        ``'hos_cvf'``. On conflict, ``razina`` and ``spol`` are only
        overwritten when the new value is not NULL.

        Args:
            nid: Competition id on hos-cvf.hr (stored as text).
            naziv: Competition title.
            razina: Detected level label or ``None``.
            spol: ``'M'``, ``'Ž'`` or ``None``.
            source_url: URL of the competition page.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.natjecanja
                    (sport, naziv, razina, sezona, spol, source, external_id, external_url,
                     source_id, source_url, status, updated_at)
                VALUES ('odbojka', %s, %s, %s, %s, 'hos_cvf', %s, %s, %s, %s, 'aktivno', now())
                ON CONFLICT (source, external_id) DO UPDATE
                SET naziv = EXCLUDED.naziv,
                    razina = COALESCE(EXCLUDED.razina, pgz_sport.natjecanja.razina),
                    spol = COALESCE(EXCLUDED.spol, pgz_sport.natjecanja.spol),
                    sezona = EXCLUDED.sezona,
                    source_url = EXCLUDED.source_url,
                    external_url = EXCLUDED.external_url,
                    updated_at = now()
                RETURNING id
            """, (naziv, razina, self.SEZONA, spol, str(nid), source_url, str(nid), source_url))
            return cur.fetchone()[0]

    def _find_klub_id(self, klub_naziv):
        """Resolve a scraped club name to a ``pgz_sport.klubovi`` id.

        First tries a case-insensitive exact or substring match in SQL
        (active clubs first, then region 'PGŽ', then lowest id). If that
        finds nothing, falls back to comparing slug tokens longer than three
        characters against every active volleyball club and accepts the best
        candidate when at most one token is missing.

        Args:
            klub_naziv: Club name as scraped from a standings table.

        Returns:
            The club id, or ``None`` when no acceptable match exists.
        """
        if not (klub_naziv or '').strip():
            # A blank name would turn the LIKE pattern into '%%', which
            # matches every club and returns an arbitrary one.
            return None
        # The name is scraped text: neutralise LIKE wildcards so '%' and '_'
        # are matched literally (backslash is PostgreSQL's default escape).
        like_safe = (klub_naziv.replace('\\', '\\\\')
                     .replace('%', r'\%')
                     .replace('_', r'\_'))
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, region FROM pgz_sport.klubovi
                WHERE sport = 'odbojka'
                  AND (LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s))
                ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
                         CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                         id
                LIMIT 1
            """, (klub_naziv, f"%{like_safe}%"))
            r = cur.fetchone()
            if r:
                return r[0]
        target = self.slugify(klub_naziv)
        # Short tokens are ignored in the fuzzy comparison.
        toks = [t for t in target.split('-') if len(t) > 3]
        if not toks:
            return None
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, naziv FROM pgz_sport.klubovi
                WHERE sport='odbojka' AND aktivan
            """)
            best = None
            best_score = 0
            for kid, knaz in cur.fetchall():
                kslug = self.slugify(knaz)
                score = sum(1 for t in toks if t in kslug)
                if score > best_score:
                    best_score = score
                    best = kid
            # Tolerate one missing token, but always require at least one hit.
            if best_score >= max(1, len(toks) - 1):
                return best
        return None

    def _replace_tablice(self, natj_id, source_url, rows, spol, natj_naziv=None):
        """Replace the stored standings of one competition with *rows*.

        Existing ``'hos_cvf'`` rows of the competition are deleted and each
        parsed row is upserted on ``(natjecanje_id, klub_naziv)``. Every row
        whose club could be resolved is also recorded in
        ``self._natj_by_klub`` for the later per-club phase.

        Args:
            natj_id: Database id of the competition.
            source_url: URL the standings were scraped from.
            rows: Output of ``_parse_standings``.
            spol: Gender marker stored in ``extra_data``.
            natj_naziv: Competition title stored with each index entry.
        """
        with self.conn.cursor() as cur:
            cur.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
            for r in rows:
                klub_id = self._find_klub_id(r['klub'])
                # Draws and goal columns do not apply here; they are written as 0.
                cur.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                         nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi,
                         source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
                            'hos_cvf', %s, now(), %s::jsonb)
                    ON CONFLICT (natjecanje_id, klub_naziv) DO UPDATE SET
                        pozicija = EXCLUDED.pozicija,
                        odigrano = EXCLUDED.odigrano,
                        pobjede = EXCLUDED.pobjede,
                        porazi = EXCLUDED.porazi,
                        bodovi = EXCLUDED.bodovi,
                        klub_id = COALESCE(EXCLUDED.klub_id, pgz_sport.natjecanja_tablice.klub_id),
                        source_url = EXCLUDED.source_url,
                        updated_at = now()
                """, (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['por'],
                      r['bod'], source_url, json.dumps({'spol': spol})))
                if klub_id:
                    self._natj_by_klub.setdefault(klub_id, []).append({
                        'natj_id': natj_id,
                        'natj_naziv': natj_naziv,
                        'url': source_url,
                        'klub_naziv': r['klub'],
                        'pozicija': r['poz'],
                    })

    def _harvest_natjecanje(self, nid):
        """Fetch one competition page and store its metadata and standings.

        Also collects the dataproject match ids linked from the page and
        attaches them to every club that was resolved in this competition.
        HTTP failures are logged, counted in ``stats['errors']`` and skipped.
        """
        url = f"{self.BASE_CVF}/natjecanje.php?id={nid}"
        try:
            html = _http_get(url)
        except Exception as e:
            self.log(f" ⚠ natj {nid}: {e}")
            self.stats['errors'] += 1
            return
        title = _parse_title(html) or f"HOS natjecanje #{nid}"
        razina, spol = _detect_razina_spol(title)
        natj_id = self._upsert_natjecanje(nid, title, razina, spol, url)
        rows = _parse_standings(html)
        if rows:
            # The title is handed over directly so index entries never need
            # a second pass to fill in their competition name.
            self._replace_tablice(natj_id, url, rows, spol, natj_naziv=title)
            self.stats['tablice'] += len(rows)
        mids = sorted({int(m) for m in re.findall(r'MatchStatistics\.aspx\?mID=(\d+)', html, re.IGNORECASE)})
        if mids:
            klub_ids_here = [kid for kid, entries in self._natj_by_klub.items()
                             if any(e['natj_id'] == natj_id for e in entries)]
            # NOTE(review): every match of the competition is attached to
            # every club in it, whether or not that club played the match.
            for kid in klub_ids_here:
                bucket = self._matches_for_klub.setdefault(kid, [])
                for mid in mids:
                    bucket.append({'mid': mid, 'natj_id': natj_id, 'natj_naziv': title})
        self.stats['natjecanja'] += 1

    def _harvest_federation(self):
        """Run the hos-cvf.hr phase: discover competitions and harvest each."""
        self.log("📋 preflight: hos-cvf.hr natjecanja discovery")
        ids = self._discover_natjecanje_ids()
        for nid in ids:
            self._harvest_natjecanje(nid)
        self.log(f" preflight done: natjecanja={self.stats['natjecanja']}, "
                 f"tablice={self.stats['tablice']}, klubova_with_match={len(self._natj_by_klub)}")

    def _scrape_dp_match(self, page, mid, klub_id, klub_naziv, natj_naziv):
        """Open one dataproject match-statistics page and store its players.

        Args:
            page: Browser page object; the calls used (``goto``,
                ``wait_for_load_state``, ``locator(...).all_inner_texts()``)
                look like the Playwright sync API -- TODO confirm.
            mid: dataproject match id.
            klub_id: Club id every parsed player is attributed to.
            klub_naziv: Club name stored with the statistics.
            natj_naziv: Competition name stored with the statistics.

        Returns:
            Number of players stored. ``0`` when the match was already
            opened, the global match cap is reached, or nothing parsed.

        NOTE(review): all player rows on the page are attributed to
        *klub_id*, and ``source_id`` embeds the match id, so the same player
        gets a distinct ``source_id`` per match -- confirm this is intended.
        """
        if mid in self._dp_match_seen:
            return 0
        if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
            return 0
        url = f"{self.BASE_DP}/MatchStatistics.aspx?mID={mid}"
        added = 0
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=30000)
            try:
                page.wait_for_load_state('networkidle', timeout=10000)
            except Exception:
                # Best effort: a page that never goes idle is still parsed.
                pass
            # The match counts as opened from here on, even if nothing parses.
            self._dp_match_seen.add(mid)
            self._matches_scraped_total += 1
            self.stats['matches'] += 1
            rows = []
            # Try the most specific table selector first, the generic one last.
            for sel in ['table.statTbl tr', 'table.report tr', 'table tr']:
                try:
                    txts = page.locator(sel).all_inner_texts()
                except Exception:
                    txts = []
                if txts:
                    rows = txts
                    break
            for txt in rows:
                line = re.sub(r'\s+', ' ', txt.replace('\t', ' ')).strip()
                if not line:
                    continue
                m = self._DP_ROW_RE.match(line)
                if not m:
                    continue
                jersey = m.group(1)
                fullname = m.group(2).strip()
                tail = m.group(3).strip()
                nums = [int(x) for x in re.findall(r'-?\d+', tail)]
                if not nums:
                    continue
                # Column positions assume the dataproject layout: 1st number
                # points, 6th aces, 8th blocks -- TODO confirm against a page.
                pts = nums[0]
                aces = nums[5] if len(nums) > 5 else None
                blocks = nums[7] if len(nums) > 7 else None
                parts = fullname.split()
                # "SURNAME Given" (surname in capitals) vs. "Given Surname".
                if parts[0].isupper() and len(parts) >= 2:
                    prezime = parts[0].title()
                    ime = ' '.join(parts[1:])
                else:
                    ime = parts[0]
                    prezime = ' '.join(parts[1:]) if len(parts) > 1 else ''
                slug_key = self.slugify(fullname)
                source_id = f"dp:{mid}:{jersey}:{slug_key}"
                try:
                    clan_id = self.upsert_clan(
                        klub_id=klub_id, source_id=source_id,
                        ime=ime, prezime=prezime,
                        source_url=url, kategorija='senior', sezona=self.SEZONA,
                        extra={'dp_match_id': mid, 'jersey': jersey},
                    )
                    self.upsert_stats(
                        clan_id=clan_id, sezona=self.SEZONA,
                        klub_id=klub_id, klub_naziv=klub_naziv,
                        natjecanje=natj_naziv, kategorija='senior',
                        stats_dict={
                            'nastupi': 1,
                            'bodovi': pts,
                            'servis_asovi': aces,
                            'blokade': blocks,
                        },
                        raw={'mid': mid, 'jersey': jersey, 'name': fullname, 'tail_nums': nums},
                    )
                    self.stats['players'] += 1
                    self.stats['stats'] += 1
                    added += 1
                except Exception as e:
                    # One bad player row must not abort the rest of the match.
                    self.log(f" ⚠ upsert player '{fullname}': {e}")
        except Exception as e:
            self.log(f" ⚠ dp match {mid}: {e}")
            self.stats['errors'] += 1
        return added

    def scrape_klub(self, page, klub):
        """Process one target club using the data gathered in the HTTP phase.

        Looks up the competitions recorded for the club (falling back to a
        slug-token match against other recorded club names), stamps the club
        row with a source URL, and scrapes dataproject matches until
        ``MAX_MATCHES_PER_KLUB`` of them yielded players or the global cap
        is reached.

        Args:
            page: Browser page passed through to ``_scrape_dp_match``.
            klub: Mapping with at least ``'id'`` and ``'naziv'``.
        """
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        entries = list(self._natj_by_klub.get(klub['id'], []))
        if not entries:
            # No direct hit by id: borrow the first entry recorded under a
            # sufficiently similar club name.
            kslug = self.slugify(klub['naziv'])
            ktoks = [t for t in kslug.split('-') if len(t) > 3]
            if ktoks:
                for ents in list(self._natj_by_klub.values()):
                    for e in ents:
                        eslug = self.slugify(e['klub_naziv'])
                        if sum(1 for t in ktoks if t in eslug) >= max(1, len(ktoks) - 1):
                            entries.append(e)
                            break
                    if entries:
                        break
        if entries:
            first = entries[0]
            with self.conn.cursor() as cur:
                # Existing non-empty source_url / source values are kept.
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET source_url = COALESCE(NULLIF(source_url, ''), %s),
                        source = COALESCE(source, 'hos_cvf'),
                        last_scraped_at = now()
                    WHERE id = %s
                """, (first['url'], klub['id']))
            naz_list = ', '.join(sorted({(e.get('natj_naziv') or '?') for e in entries}))[:120]
            self.log(f"{len(entries)} natjecanja: {naz_list}")
        else:
            self.log(" · no HOS natjecanje hit")
        match_bucket = self._matches_for_klub.get(klub['id'], [])
        if not match_bucket and entries:
            # Reuse the match list of any club sharing the first competition.
            for kid, ents in self._natj_by_klub.items():
                if any(e['natj_id'] == entries[0]['natj_id'] for e in ents):
                    match_bucket = self._matches_for_klub.get(kid, [])
                    if match_bucket:
                        break
        scraped_for_klub = 0
        for m in match_bucket:
            if scraped_for_klub >= self.MAX_MATCHES_PER_KLUB:
                break
            if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
                break
            n = self._scrape_dp_match(page, m['mid'], klub['id'], klub['naziv'], m['natj_naziv'] or 'HOS')
            # Only matches that produced at least one player count per club.
            if n > 0:
                scraped_for_klub += 1
        if scraped_for_klub:
            self.log(f" ↳ scraped {scraped_for_klub} match(es) from dataproject")

    def run(self, limit=999):
        """Run the federation phase, then hand over to the base-class run."""
        self._harvest_federation()
        super().run(limit)
if __name__ == '__main__':
    # Optional first CLI argument: maximum number of clubs to process.
    harvester = HOSHarvester()
    klub_limit = int(sys.argv[1]) if len(sys.argv) > 1 else 999
    harvester.run(limit=klub_limit)