Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
@@ -1,21 +1,416 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HOS volleyball harvester."""
|
||||
import sys
|
||||
# hos_volleyball.py
|
||||
# v1.0.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
||||
# HOS odbojka harvester: hos-cvf.hr natjecanja + standings, hos-web.dataproject.com match stats.
|
||||
# Targets all 77 PGŽ odbojka klubova.
|
||||
|
||||
import sys, re, json, time
|
||||
import html as ihtml
|
||||
from datetime import datetime
|
||||
import requests
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
|
||||
# User-Agent string sent with every outbound HTTP request made by this module.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Default request headers passed to requests.get() in _http_get().
HDR = {
    "User-Agent": UA,
}
|
||||
|
||||
|
||||
def _http_get(url, retries=1):
    """Fetch *url* and return the response body as text.

    The request is attempted ``retries + 1`` times with the module-level
    ``HDR`` headers and a 25 second timeout.  A response counts as a success
    only when the status code is 200 and the body is non-empty.

    Args:
        url: Absolute URL to fetch.
        retries: Number of additional attempts after the first one fails.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        RuntimeError: When every attempt failed; the message carries the last
            HTTP status or the last transport error.
    """
    last = None
    for attempt in range(retries + 1):
        if attempt:
            # Back off only *between* attempts; sleeping after the final
            # failure would just delay the error without any benefit.
            time.sleep(2)
        try:
            r = requests.get(url, headers=HDR, timeout=25)
        except requests.RequestException as e:
            last = str(e)
            continue
        if r.status_code == 200 and r.text:
            return r.text
        last = f"HTTP {r.status_code}"
    raise RuntimeError(f"GET {url} failed: {last}")
|
||||
|
||||
|
||||
def _strip_tags(s):
|
||||
return ihtml.unescape(re.sub(r'<[^>]+>', '', s or '')).strip()
|
||||
|
||||
|
||||
def _parse_standings(html):
|
||||
"""Return list of {poz, klub, uk, pob, por, bod} from first plausible table."""
|
||||
tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL | re.IGNORECASE)
|
||||
for tbl in tables:
|
||||
rows = re.findall(r'<tr[^>]*>(.+?)</tr>', tbl, re.DOTALL | re.IGNORECASE)
|
||||
out = []
|
||||
for row in rows:
|
||||
cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
|
||||
clean = [_strip_tags(c) for c in cells]
|
||||
if not clean or not clean[0]:
|
||||
continue
|
||||
if clean[0] in ('', '#', 'Pos', 'Poz', 'R'):
|
||||
continue
|
||||
try:
|
||||
m = re.match(r'(\d+)\.?', clean[0])
|
||||
if not m:
|
||||
continue
|
||||
poz = int(m.group(1))
|
||||
if len(clean) < 5:
|
||||
continue
|
||||
klub = clean[2] if (len(clean) >= 7 and not re.match(r'^\d+$', clean[2])) else clean[1]
|
||||
if not klub or re.match(r'^\d+$', klub):
|
||||
continue
|
||||
numcells = [c for c in clean if re.match(r'^-?\d+$', c)]
|
||||
if len(numcells) < 4:
|
||||
continue
|
||||
tail = numcells[1:]
|
||||
uk = int(tail[0])
|
||||
pob = int(tail[1])
|
||||
por = int(tail[2])
|
||||
bod = int(tail[-1])
|
||||
out.append({'poz': poz, 'klub': klub, 'uk': uk, 'pob': pob, 'por': por, 'bod': bod})
|
||||
except Exception:
|
||||
continue
|
||||
if out and len(out) >= 2:
|
||||
return out
|
||||
return []
|
||||
|
||||
|
||||
def _parse_title(html):
    """Extract a human-readable page title from *html*.

    The first ``<h1>`` is preferred when its tag-stripped text is longer than
    four characters; otherwise the ``<title>`` element is used.  Returns
    ``None`` when neither yields a result.
    """
    flags = re.DOTALL | re.IGNORECASE

    heading = re.search(r'<h1[^>]*>(.*?)</h1>', html, flags)
    if heading is not None:
        heading_text = _strip_tags(heading.group(1))
        # Very short headings are ignored in favour of <title>.
        if len(heading_text) > 4:
            return heading_text

    page_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if page_title is None:
        return None
    return _strip_tags(page_title.group(1))
|
||||
|
||||
|
||||
def _detect_razina_spol(title):
|
||||
t = (title or '').lower()
|
||||
razina = None
|
||||
for key, lab in [
|
||||
('superliga 2', 'Superliga 2'),
|
||||
('superliga', 'Superliga'),
|
||||
('1. liga', '1.liga'), ('1.liga', '1.liga'),
|
||||
('2. liga', '2.liga'), ('2.liga', '2.liga'),
|
||||
('3. liga', '3.liga'), ('3.liga', '3.liga'),
|
||||
('kup', 'Kup'),
|
||||
('kadeti', 'Kadeti'), ('kadetkinje', 'Kadetkinje'),
|
||||
('juniori', 'Juniori'), ('juniorke', 'Juniorke'),
|
||||
('mini', 'Mini'),
|
||||
('beach', 'Beach'), ('pijesku', 'Beach'),
|
||||
]:
|
||||
if key in t:
|
||||
razina = lab
|
||||
break
|
||||
spol = None
|
||||
if re.search(r'\(\s*[mM]\s*\)|\bmu[šs]ki\b|\bmuska\b|\bjuniori\b|\bkadeti\b', t):
|
||||
spol = 'M'
|
||||
elif re.search(r'\(\s*[ŽzZ]\s*\)|\bžen|\bjuniorke\b|\bkadetkinje\b', t):
|
||||
spol = 'Ž'
|
||||
return razina, spol
|
||||
|
||||
|
||||
class HOSHarvester(SportHarvester):
    """Harvester for HOS volleyball competitions, standings and match stats.

    ``run()`` first performs a federation-wide preflight against hos-cvf.hr
    (competition discovery, standings import, match-id collection) and then
    hands over to the base-class run, which is expected to call
    ``scrape_klub`` for each target club.
    """

    SPORT = 'odbojka'
    SOURCE = 'hos'

    BASE_CVF = 'https://hos-cvf.hr'
    BASE_DP = 'https://hos-web.dataproject.com'
    SEZONA = '2025/26'
    MAX_NATJ = 80                # cap on competitions harvested per run
    MAX_MATCHES_PER_KLUB = 5     # cap on dataproject matches per club
    MAX_MATCHES_TOTAL = 120      # cap on dataproject matches per run

    # "<jersey> <Name> <Name>... <rest>" - one player row of a match-stat table.
    _PLAYER_ROW_RE = re.compile(
        r'^(\d{1,3})\s+([A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+(?:\s+[A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+)+)\b(.*)$'
    )

    def __init__(self):
        """Initialise per-run caches and make sure the extra counters exist."""
        super().__init__()
        # klub_id -> list of {natj_id, natj_naziv, url, klub_naziv, pozicija}
        self._natj_by_klub = {}
        # klub_id -> list of {mid, natj_id, natj_naziv}
        self._matches_for_klub = {}
        # dataproject match ids that were already opened in this run
        self._dp_match_seen = set()
        self._matches_scraped_total = 0
        self.stats.setdefault('natjecanja', 0)
        self.stats.setdefault('tablice', 0)
        self.stats.setdefault('matches', 0)

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* volleyball clubs from the PGŽ priority view.

        Rows where ``financiran OR u_godisnjaku`` is true come first, then
        the rest, each group ordered by id.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = 'odbojka'
                ORDER BY (financiran OR u_godisnjaku) DESC, id
                LIMIT %s
            """, (limit,))
            return cur.fetchall()

    def _discover_natjecanje_ids(self):
        """Return sorted competition ids linked from the hos-cvf.hr front page.

        At most ``MAX_NATJ`` ids are returned; a failed download is logged and
        yields an empty list.
        """
        try:
            html = _http_get(self.BASE_CVF + '/')
        except Exception as e:
            self.log(f"⚠ failed to load hos-cvf.hr: {e}")
            return []
        ids = sorted({int(m) for m in re.findall(r'natjecanje\.php\?id=(\d+)', html)})
        self.log(f" found {len(ids)} natjecanje ids on hos-cvf.hr")
        return ids[:self.MAX_NATJ]

    def _upsert_natjecanje(self, nid, naziv, razina, spol, source_url):
        """Insert or update one competition row and return its database id.

        The row is keyed on ``(source, external_id)``; on conflict an existing
        ``razina`` / ``spol`` is kept when the new value is NULL.
        """
        external_id = str(nid)
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.natjecanja
                    (sport, naziv, razina, sezona, spol, source, external_id, external_url,
                     source_id, source_url, status, updated_at)
                VALUES ('odbojka', %s, %s, %s, %s, 'hos_cvf', %s, %s, %s, %s, 'aktivno', now())
                ON CONFLICT (source, external_id) DO UPDATE
                SET naziv = EXCLUDED.naziv,
                    razina = COALESCE(EXCLUDED.razina, pgz_sport.natjecanja.razina),
                    spol = COALESCE(EXCLUDED.spol, pgz_sport.natjecanja.spol),
                    sezona = EXCLUDED.sezona,
                    source_url = EXCLUDED.source_url,
                    external_url = EXCLUDED.external_url,
                    updated_at = now()
                RETURNING id
            """, (naziv, razina, self.SEZONA, spol, external_id, source_url,
                  external_id, source_url))
            return cur.fetchone()[0]

    def _find_klub_id(self, klub_naziv):
        """Resolve a club name from a standings table to a ``klubovi.id``.

        First tries a case-insensitive exact or substring match in SQL,
        preferring active clubs and then clubs in region 'PGŽ'.  If that
        fails, falls back to a token comparison on slugified names: the club
        sharing the most tokens (longer than three characters) wins, provided
        it misses at most one token.

        Returns:
            The club id, or ``None`` when nothing matches well enough.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, region FROM pgz_sport.klubovi
                WHERE sport = 'odbojka'
                  AND (LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s))
                ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
                         CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                         id
                LIMIT 1
            """, (klub_naziv, f"%{klub_naziv}%"))
            r = cur.fetchone()
            if r:
                return r[0]

        target = self.slugify(klub_naziv)
        toks = [t for t in target.split('-') if len(t) > 3]
        if not toks:
            return None

        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, naziv FROM pgz_sport.klubovi
                WHERE sport='odbojka' AND aktivan
            """)
            best = None
            best_score = 0
            for kid, knaz in cur.fetchall():
                kslug = self.slugify(knaz)
                score = sum(1 for t in toks if t in kslug)
                # Strict ">" keeps the first club among equally good ones.
                if score > best_score:
                    best_score = score
                    best = kid
            if best_score >= max(1, len(toks) - 1):
                return best
        return None

    def _replace_tablice(self, natj_id, source_url, rows, spol):
        """Replace the stored 'hos_cvf' standings of one competition.

        Existing rows for *natj_id* are deleted and *rows* (as produced by
        ``_parse_standings``) inserted.  Every row whose club could be
        resolved is also remembered in ``self._natj_by_klub``.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                "DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'",
                (natj_id,),
            )
            for r in rows:
                klub_id = self._find_klub_id(r['klub'])
                # Draws and goal columns are written as constant zeros.
                cur.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                         nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi,
                         source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
                            'hos_cvf', %s, now(), %s::jsonb)
                    ON CONFLICT (natjecanje_id, klub_naziv) DO UPDATE SET
                        pozicija = EXCLUDED.pozicija,
                        odigrano = EXCLUDED.odigrano,
                        pobjede = EXCLUDED.pobjede,
                        porazi = EXCLUDED.porazi,
                        bodovi = EXCLUDED.bodovi,
                        klub_id = COALESCE(EXCLUDED.klub_id, pgz_sport.natjecanja_tablice.klub_id),
                        source_url = EXCLUDED.source_url,
                        updated_at = now()
                """, (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['por'],
                      r['bod'], source_url, json.dumps({'spol': spol})))
                if klub_id:
                    self._natj_by_klub.setdefault(klub_id, []).append({
                        'natj_id': natj_id,
                        'natj_naziv': None,  # filled in by _harvest_natjecanje
                        'url': source_url,
                        'klub_naziv': r['klub'],
                        'pozicija': r['poz'],
                    })

    def _harvest_natjecanje(self, nid):
        """Download one competition page and store everything found on it.

        Upserts the competition, replaces its standings when a table could be
        parsed, and queues every dataproject match id linked from the page
        for each club known to play in this competition.  Download errors are
        logged and counted in ``stats['errors']``.
        """
        url = f"{self.BASE_CVF}/natjecanje.php?id={nid}"
        try:
            html = _http_get(url)
        except Exception as e:
            self.log(f" ⚠ natj {nid}: {e}")
            self.stats['errors'] += 1
            return

        title = _parse_title(html) or f"HOS natjecanje #{nid}"
        razina, spol = _detect_razina_spol(title)
        natj_id = self._upsert_natjecanje(nid, title, razina, spol, url)

        rows = _parse_standings(html)
        if rows:
            self._replace_tablice(natj_id, url, rows, spol)
            self.stats['tablice'] += len(rows)
            # _replace_tablice does not know the title; back-fill it here.
            for entries in self._natj_by_klub.values():
                for e in entries:
                    if e['natj_id'] == natj_id and e.get('natj_naziv') is None:
                        e['natj_naziv'] = title

        mids = sorted({int(m) for m in re.findall(r'MatchStatistics\.aspx\?mID=(\d+)', html, re.IGNORECASE)})
        if mids:
            klub_ids_here = [kid for kid, entries in self._natj_by_klub.items()
                             if any(e['natj_id'] == natj_id for e in entries)]
            for kid in klub_ids_here:
                bucket = self._matches_for_klub.setdefault(kid, [])
                for mid in mids:
                    bucket.append({'mid': mid, 'natj_id': natj_id, 'natj_naziv': title})
        self.stats['natjecanja'] += 1

    def _harvest_federation(self):
        """Run the hos-cvf.hr preflight: discover and harvest competitions."""
        self.log("📋 preflight: hos-cvf.hr natjecanja discovery")
        ids = self._discover_natjecanje_ids()
        for nid in ids:
            self._harvest_natjecanje(nid)
        self.log(f" preflight done: natjecanja={self.stats['natjecanja']}, "
                 f"tablice={self.stats['tablice']}, klubova_with_match={len(self._natj_by_klub)}")

    def _scrape_dp_match(self, page, mid, klub_id, klub_naziv, natj_naziv):
        """Scrape player rows from one dataproject match-statistics page.

        Each match id is opened at most once per run and no more than
        ``MAX_MATCHES_TOTAL`` matches are opened in total.  Every table row
        that looks like "<jersey> <Name Surname> <numbers...>" is stored via
        ``upsert_clan`` / ``upsert_stats`` under *klub_id*.

        Args:
            page: Browser page object providing ``goto``,
                ``wait_for_load_state`` and ``locator`` (presumably a
                Playwright page - confirm against the base class).
            mid: dataproject match id.
            klub_id: Club the scraped players are attributed to.
            klub_naziv: Club name stored with the stats.
            natj_naziv: Competition name stored with the stats.

        Returns:
            Number of player rows stored; 0 when the match was skipped or
            the page could not be processed.
        """
        if mid in self._dp_match_seen:
            return 0
        if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
            return 0

        url = f"{self.BASE_DP}/MatchStatistics.aspx?mID={mid}"
        added = 0
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=30000)
            try:
                page.wait_for_load_state('networkidle', timeout=10000)
            except Exception:
                # Best effort: a page that never goes idle is parsed as-is.
                pass
            self._dp_match_seen.add(mid)
            self._matches_scraped_total += 1
            self.stats['matches'] += 1

            # Try the most specific selector first; the first one that yields
            # any row text wins.
            rows = []
            for sel in ['table.statTbl tr', 'table.report tr', 'table tr']:
                try:
                    txts = page.locator(sel).all_inner_texts()
                except Exception:
                    txts = []
                if txts:
                    rows = txts
                    break

            for txt in rows:
                line = re.sub(r'\s+', ' ', txt.replace('\t', ' ')).strip()
                if not line:
                    continue
                m = self._PLAYER_ROW_RE.match(line)
                if not m:
                    continue
                jersey = m.group(1)
                fullname = m.group(2).strip()
                tail = m.group(3).strip()
                nums = [int(x) for x in re.findall(r'-?\d+', tail)]
                if not nums:
                    continue
                # NOTE(review): the column positions (0 = points, 5 = aces,
                # 7 = blocks) are assumed from the page layout - confirm.
                pts = nums[0]
                aces = nums[5] if len(nums) > 5 else None
                blocks = nums[7] if len(nums) > 7 else None

                # The regex guarantees at least two name parts.  An all-caps
                # first part is treated as the surname ("HORVAT Ivan").
                parts = fullname.split()
                if parts[0].isupper():
                    prezime = parts[0].title()
                    ime = ' '.join(parts[1:])
                else:
                    ime = parts[0]
                    prezime = ' '.join(parts[1:])

                slug_key = self.slugify(fullname)
                source_id = f"dp:{mid}:{jersey}:{slug_key}"
                try:
                    clan_id = self.upsert_clan(
                        klub_id=klub_id, source_id=source_id,
                        ime=ime, prezime=prezime,
                        source_url=url, kategorija='senior', sezona=self.SEZONA,
                        extra={'dp_match_id': mid, 'jersey': jersey},
                    )
                    self.upsert_stats(
                        clan_id=clan_id, sezona=self.SEZONA,
                        klub_id=klub_id, klub_naziv=klub_naziv,
                        natjecanje=natj_naziv, kategorija='senior',
                        stats_dict={
                            'nastupi': 1,
                            'bodovi': pts,
                            'servis_asovi': aces,
                            'blokade': blocks,
                        },
                        raw={'mid': mid, 'jersey': jersey, 'name': fullname, 'tail_nums': nums},
                    )
                    self.stats['players'] += 1
                    self.stats['stats'] += 1
                    added += 1
                except Exception as e:
                    self.log(f" ⚠ upsert player '{fullname}': {e}")
        except Exception as e:
            self.log(f" ⚠ dp match {mid}: {e}")
            self.stats['errors'] += 1
        return added

    def scrape_klub(self, page, klub):
        """Attach preflight results to one club and scrape its queued matches.

        Looks up the competitions recorded for the club during the preflight
        (falling back to a slug-token match on the standings name), stamps the
        club row with source information, and then scrapes up to
        ``MAX_MATCHES_PER_KLUB`` dataproject matches for it.

        Args:
            page: Browser page object forwarded to ``_scrape_dp_match``.
            klub: Mapping with at least the keys ``id`` and ``naziv``.
        """
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        entries = list(self._natj_by_klub.get(klub['id'], []))

        if not entries:
            # Fallback: take the first standings entry, under any club id,
            # whose name shares all but at most one long token with this club.
            kslug = self.slugify(klub['naziv'])
            ktoks = [t for t in kslug.split('-') if len(t) > 3]
            if ktoks:
                needed = max(1, len(ktoks) - 1)
                for ents in list(self._natj_by_klub.values()):
                    for e in ents:
                        eslug = self.slugify(e['klub_naziv'])
                        if sum(1 for t in ktoks if t in eslug) >= needed:
                            entries.append(e)
                            break
                    if entries:
                        break

        if entries:
            first = entries[0]
            with self.conn.cursor() as cur:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET source_url = COALESCE(NULLIF(source_url, ''), %s),
                        source = COALESCE(source, 'hos_cvf'),
                        last_scraped_at = now()
                    WHERE id = %s
                """, (first['url'], klub['id']))
            naz_list = ', '.join(sorted({(e.get('natj_naziv') or '?') for e in entries}))[:120]
            self.log(f" ↳ {len(entries)} natjecanja: {naz_list}")
        else:
            self.log(f" · no HOS natjecanje hit")

        match_bucket = self._matches_for_klub.get(klub['id'], [])
        if not match_bucket and entries:
            # Borrow the match list of any club in the same competition.
            wanted_natj = entries[0]['natj_id']
            for kid, ents in self._natj_by_klub.items():
                if any(e['natj_id'] == wanted_natj for e in ents):
                    match_bucket = self._matches_for_klub.get(kid, [])
                    if match_bucket:
                        break

        scraped_for_klub = 0
        for m in match_bucket:
            if scraped_for_klub >= self.MAX_MATCHES_PER_KLUB:
                break
            if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
                break
            n = self._scrape_dp_match(page, m['mid'], klub['id'], klub['naziv'], m['natj_naziv'] or 'HOS')
            # Only matches that produced at least one player count.
            if n > 0:
                scraped_for_klub += 1
        if scraped_for_klub:
            self.log(f" ↳ scraped {scraped_for_klub} match(es) from dataproject")

    def run(self, limit=999):
        """Run the federation preflight, then the base-class harvest."""
        self._harvest_federation()
        super().run(limit)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The optional first CLI argument caps the number of clubs processed.
    # Only one run is started; a second, older call with a default of 50
    # was left behind here and would have repeated the whole harvest.
    HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 999)
|
||||
|
||||
Reference in New Issue
Block a user