PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

This commit is contained in:
Damir Radulić
2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
+337
View File
@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
import os, re, sys, time, json, html, traceback, datetime as dt
import urllib.request, urllib.error
from urllib.parse import urljoin
import psycopg2
# PostgreSQL connection parameters, passed as keyword arguments to
# psycopg2.connect(). Every value can be overridden with a PGZ_DB_* environment
# variable; the literals below are only the backward-compatible fallbacks.
# NOTE(review): the fallback password is a credential committed to source
# control - rotate it and supply PGZ_DB_PASSWORD from the environment instead.
DB = dict(
    host=os.environ.get("PGZ_DB_HOST", "localhost"),
    port=int(os.environ.get("PGZ_DB_PORT", "5432")),
    dbname=os.environ.get("PGZ_DB_NAME", "rinet_v3"),
    user=os.environ.get("PGZ_DB_USER", "rinet"),
    password=os.environ.get("PGZ_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
# Root URL of the scraped site; club and player URLs are built from it.
BASE = "https://hrvatski-bocarski-savez.hr"
# User-Agent header sent with every request.
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
# Pause in seconds between consecutive requests (doubled between retries).
DELAY = 1.2
# File that log() appends to, in addition to printing to stdout.
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
def log(msg):
    """Print a timestamped message and append it to the log file.

    The line is always printed to stdout (flushed immediately). Writing to
    LOG_FP is best-effort: if the file cannot be opened or written, the
    error is ignored so that logging never aborts the scrape.

    Args:
        msg: Text to log; it is prefixed with the current local ISO timestamp.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain non-ASCII symbols and Croatian
        # letters, which would fail to encode under a non-UTF-8 locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Missing directory, permissions, full disk... stdout already has it.
        pass
def db():
    """Open a PostgreSQL connection from the DB settings, in autocommit mode.

    Returns:
        The psycopg2 connection with ``autocommit`` switched on.
    """
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def fetch(url, retries=2):
    """Download *url* and return its body as text, or None on failure.

    The request is attempted up to ``retries + 1`` times, pausing
    ``DELAY * 2`` seconds between attempts. HTTP 404 and 410 return None
    immediately without retrying or logging; any other error is logged only
    when the final attempt fails.

    Args:
        url: Absolute URL to download.
        retries: Number of additional attempts after the first one.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes replaced),
        or None when the page is missing or every attempt failed.
    """
    final_attempt = retries
    request_headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
    for attempt in range(retries + 1):
        try:
            request = urllib.request.Request(url, headers=request_headers)
            with urllib.request.urlopen(request, timeout=20) as response:
                return response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as http_err:
            # Missing / gone pages are a definitive answer, not a failure.
            if http_err.code in (404, 410):
                return None
            if attempt == final_attempt:
                log(f"HTTP {http_err.code} {url}")
                return None
        except Exception as err:
            if attempt == final_attempt:
                log(f"FETCH err {err} {url}")
                return None
        # Only reached when the attempt failed and another one follows.
        time.sleep(DELAY * 2)
# === KLUB PARSER ===
def parse_klub(h, slug):
    """Parse the HTML of a club page into a plain dict.

    Args:
        h: HTML text of the club page; a falsy value yields None.
        slug: Club slug, copied unchanged into the result.

    Returns:
        None when *h* is empty or no club name can be found, otherwise a dict
        with the keys ``slug``, ``naziv`` (name), ``logo`` (absolute URL or
        None), ``info`` (dict of the recognised "<strong>Label:</strong> value"
        bullets), ``igraci`` (list of player dicts with ``slug``,
        ``iskaznica``, ``ime_prezime``, ``godina_rodenja``) and ``voditelji``
        (at most 10 team-leader names).
    """
    if not h:
        return None

    # Club name: the first <h3> that is neither the federation banner
    # ("Fédération ...") nor a sponsor heading, and is reasonably short.
    naziv = None
    for cand in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        cand = html.unescape(cand.strip())
        if cand and 'Fédération' not in cand and 'sponzor' not in cand.lower() and len(cand) < 80:
            naziv = cand
            break
    if not naziv:
        return None

    # Logo: image under /cdn/content/ whose alt text is "Klub".
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
    logo = urljoin(BASE, m.group(1)) if m else None

    # Bullets: county, league, address, sport branches, contact person,
    # phone, OIB. The first matching label substring wins.
    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        val = html.unescape(re.sub(r'<[^>]+>', '', m.group(2).strip()))
        if 'županija' in key:
            info['zupanija'] = val
        elif 'liga' in key:
            info['liga'] = val
        elif 'adresa' in key:
            info['adresa'] = val
        elif 'sportske grane' in key or 'sportska grana' in key:
            info['sportske_grane'] = val
        elif 'osoba za kontakt' in key:
            info['kontakt_osoba'] = val
        elif 'tel' in key:
            info['telefon'] = val
        elif 'oib' in key:
            info['oib'] = val

    # Players - pattern:
    # <li><a href="...igraci/SLUG/">N. E-XX-YY, <strong>Name</strong>, YYYY.</a></li>
    igraci = [
        {
            "slug": m.group(1),
            "iskaznica": m.group(2).strip(),
            "ime_prezime": html.unescape(m.group(3).strip()),
            "godina_rodenja": int(m.group(4)),
        }
        for m in re.finditer(
            r'<li><a\s+href="https?://[^/]+/igraci/([\w\-]+)/?"[^>]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*<strong>([^<]+)</strong>,\s*(\d{4})\.?',
            h,
        )
    ]

    # Team leaders (coaches) - tab #popis_voditelja_ekipe
    voditelji = []
    vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:<div\s+(?:role|class)|</section>|<!--)', h, re.S)
    if vsec:
        section = vsec.group(1)
        for v in re.finditer(r'<p[^>]*>\s*([A-ZČĆĐŠŽ][\wčćđšžČĆĐŠŽ\s\-]{2,40}[A-ZČĆĐŠŽ][a-zčćđšž]+)\s*</p>', section):
            name = re.sub(r'\s+', ' ', v.group(1).strip())
            if len(name) > 4 and len(name.split()) >= 2 and 'Trenutno' not in name and name not in voditelji:
                voditelji.append(name)
        # Fallback when the names are not wrapped in <p> tags: treat every
        # text fragment between tags as a candidate line.
        if not voditelji:
            text = re.sub(r'<[^>]+>', '\n', section)
            for line in text.split('\n'):
                # Same normalisation and de-duplication as the <p> path, so
                # a name repeated in the markup is reported only once.
                line = re.sub(r'\s+', ' ', html.unescape(line)).strip()
                parts = line.split()
                if len(line) <= 4 or len(parts) < 2 or 'Trenutno' in line:
                    continue
                if all(p[0].isupper() for p in parts[:2]) and line not in voditelji:
                    voditelji.append(line)

    return {
        "slug": slug, "naziv": naziv, "logo": logo,
        "info": info,
        "igraci": igraci,
        "voditelji": voditelji[:10],
    }
# === IGRAČ PARSER ===
def parse_igrac(h, slug):
    """Parse the HTML of a player profile page into a plain dict.

    Args:
        h: HTML text of the player page; a falsy value yields None.
        slug: Player slug; copied into the result and used to derive a name
            when no suitable heading is found.

    Returns:
        None when *h* is empty, otherwise a dict with the keys ``slug``,
        ``ime`` (first word of the name), ``prezime`` (remaining words),
        ``full_name``, ``slika_url`` (absolute URL or None), ``info``
        (``iskaznica``, ``godina_rodenja``, ``maticni_klub`` when present) and
        ``karijera`` (list of registration rows from the "Sportski put" table).
    """
    if not h:
        return None

    # Name: first <h3> with at least two words that is not the federation
    # banner or a "Sport..." heading; otherwise derive it from the slug.
    full_name = None
    for cand in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        cand = html.unescape(cand.strip())
        if cand and 'Fédération' not in cand and 'Sport' not in cand and len(cand) < 80 and len(cand.split()) >= 2:
            full_name = cand
            break
    if not full_name:
        full_name = slug.replace("-", " ").title()
    parts = full_name.split()
    ime = parts[0] if parts else ""
    prezime = " ".join(parts[1:]) if len(parts) > 1 else ""

    # Photo: image under /cdn/content/ whose alt text is "Igrač".
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Igrač"', h)
    slika = urljoin(BASE, m.group(1)) if m else None

    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        val = re.sub(r'<[^>]+>', '', m.group(2).strip()).rstrip('.')
        val = html.unescape(val)
        if 'iskaznic' in key:
            info['iskaznica'] = val
        elif 'godina rođenja' in key:
            # A value without a four-digit year is simply skipped.
            year = re.search(r'(\d{4})', val)
            if year:
                info['godina_rodenja'] = int(year.group(1))
        elif 'matični klub' in key:
            info['maticni_klub'] = val

    def cell_text(cell):
        """Return a table cell as plain text: tags removed, entities decoded."""
        return html.unescape(re.sub(r'<[^>]+>', '', cell)).strip()

    # Career ("Sportski put") - registration table
    karijera = []
    table_m = re.search(r'### Sportski put.*?</table>', h, re.S)
    if not table_m:
        table_m = re.search(r'Sportski put.*?</table>', h, re.S)
    if table_m:
        # Accept <tr ...> with attributes, the same way cells accept <td ...>.
        rows = re.findall(r'<tr\b[^>]*>(.*?)</tr>', table_m.group(0), re.S)
        for r in rows[1:]:  # skip header
            cells = re.findall(r'<td[^>]*>(.*?)</td>', r, re.S)
            if len(cells) >= 4:
                karijera.append({
                    "datum_reg": cell_text(cells[0]).rstrip('.'),
                    "klub": cell_text(cells[1]),
                    "sportska_grana": cell_text(cells[2]),
                    "sezona": cell_text(cells[3]),
                    # Fifth column (medical check) is optional.
                    "lijecnicki": cell_text(cells[4]).rstrip('.') if len(cells) > 4 else None,
                })

    return {
        "slug": slug, "ime": ime, "prezime": prezime, "full_name": full_name,
        "slika_url": slika,
        "info": info,
        "karijera": karijera,
    }
# PGŽ clubs - the real slugs, taken from bocarski-savez-pgz.
# main() fetches each entry as f"{BASE}/klubovi/{slug}/".
PGZ_HBS_CLUBS = [
# Senior clubs
"kastav", "kostrena", "krenovac", "krimeja", "krk", "lovran", "opatija",
"rijeka-2", "srdoci", "sveti-jakov", "sveti-rok-klana", "vargon", "hreljin",
"draga-moscenicka-draga", "lovranska-draga", "brod-moravice",
# Women's clubs
"zenski-bocarski-klub-cavle", "zenski-bocarski-klub-drenova-rijeka",
"zenski-bocarski-klub-hreljin", "zenski-bocarski-klub-kastav",
"zenska-bocarska-ekipa-kastav-2",
# Cadet / junior teams (youth categories)
"cavle-skola-bocanja", "juniorska-ekipa-cavle-sb-1", "juniorska-ekipa-kastav",
"juniorska-ekipa-lovran", "juniorska-ekipa-sv-rok-klana", "juniorska-ekipa-vargon",
"kadetska-ekipa-bk-cavle-sb-2", "kadetska-ekipa-bk-kastav-2",
"kadetska-ekipa-bk-lovran", "kadetska-ekipa-bk-sveti-jakov-2",
"kadetska-ekipa-bk-vargon", "kadetska-ekipa-kastav", "kadetska-ekipa-zbk-drenova",
]
def upsert_klub(conn, k):
    """Insert or update a club row in pgz_sport.klubovi and return its id.

    An existing row is looked up first by the ``hbs:<slug>`` marker stored in
    ``napomena``, then by sport + case-insensitive name. A found row gets its
    address, phone and city refreshed (only where new values exist) and its
    HBS sync marker replaced; otherwise a new active row is inserted.

    Args:
        conn: Open database connection (a psycopg2 connection in this script).
        k: Club dict as produced by parse_klub(); uses ``slug``, ``naziv``
            and the optional ``info`` dict.

    Returns:
        The database id of the updated or inserted club.
    """
    slug = k['slug']
    naziv = k['naziv']
    info = k.get('info', {})
    adresa = info.get('adresa')
    telefon = info.get('telefon')

    # City: the last one or two words of the address, minus a leading
    # all-digit token so a postal code ("51215 Kastav") is not stored.
    grad = None
    if adresa:
        m = re.search(r'(\w+(?:\s+\w+)?)$', adresa.strip())
        if m:
            grad = re.sub(r'^\d+\s+', '', m.group(1))

    # One marker format for both INSERT and UPDATE.
    marker = (f"[HBS sync {dt.date.today()}: hbs:{slug}, "
              f"OIB:{info.get('oib', '-')}, liga:{info.get('liga', '-')}]")
    # The slug must be followed by a non-slug character (or end of text);
    # a plain substring match would let "lovran" hit "hbs:lovranska-draga".
    slug_pattern = f"hbs:{re.escape(slug)}([^a-z0-9_-]|$)"

    with conn.cursor() as cur:
        # Check by hbs slug in napomena, then by sport+naziv
        cur.execute(
            "SELECT id FROM pgz_sport.klubovi WHERE napomena ~* %s ORDER BY id LIMIT 1",
            (slug_pattern,))
        row = cur.fetchone()
        if not row:
            cur.execute(
                "SELECT id FROM pgz_sport.klubovi WHERE sport='boćanje' AND lower(naziv)=lower(%s) LIMIT 1",
                (naziv,))
            row = cur.fetchone()

        if row:
            kid = row[0]
            # Previous "[HBS sync ...]" markers are removed before the new
            # one is appended, so napomena no longer grows on every run.
            # (Raw string: the backslashes belong to the PostgreSQL regex.)
            cur.execute(r"""UPDATE pgz_sport.klubovi SET
                adresa=COALESCE(%s, adresa),
                telefon=COALESCE(%s, telefon),
                grad=COALESCE(%s, grad),
                napomena=btrim(regexp_replace(COALESCE(napomena,''), ' ?\[HBS sync [^]]*\]', '', 'g') || ' ' || %s)
                WHERE id=%s""",
                (adresa, telefon, grad, marker, kid))
        else:
            cur.execute("""INSERT INTO pgz_sport.klubovi
                (naziv, sport, region, grad, adresa, telefon, aktivan, napomena)
                VALUES (%s, 'boćanje', 'PGŽ', %s, %s, %s, true, %s)
                RETURNING id""",
                (naziv, grad, adresa, telefon, marker))
            kid = cur.fetchone()[0]
    return kid
def upsert_igrac(conn, p, klub_db_id, klub_naziv):
    """Insert or update a player row in pgz_sport.clanovi and return its id.

    The player is matched by membership-card number (``iskaznica``, stored
    as ``source_id`` with ``source='hbs_savez'``). When no card number is
    available the HBS slug is used instead, so such a player is updated
    rather than inserted again on every run.

    Args:
        conn: Open database connection (a psycopg2 connection in this script).
        p: Player dict as produced by parse_igrac(); uses ``slug``, ``ime``,
            ``prezime``, optional ``slika_url`` and the ``info`` dict.
        klub_db_id: Database id of the player's club.
        klub_naziv: Club name stored in ``klub_naziv_godisnjak``.

    Returns:
        The database id of the updated or inserted player.
    """
    info = p.get('info', {})
    # An empty card number is stored as NULL rather than ''.
    iskaznica = (info.get('iskaznica') or '').strip() or None
    god = info.get('godina_rodenja')
    src_url = f"{BASE}/igraci/{p['slug']}/"

    with conn.cursor() as cur:
        cid = None
        if iskaznica:
            # Check by iskaznica (HBS unique ID)
            cur.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_id=%s",
                (iskaznica,))
        else:
            # No card number: fall back to the slug of the HBS profile page.
            cur.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND slug=%s ORDER BY id LIMIT 1",
                (p['slug'],))
        row = cur.fetchone()
        if row:
            cid = row[0]

        if cid is not None:
            cur.execute("""UPDATE pgz_sport.clanovi SET
                ime=%s, prezime=%s, sport='boćanje', uloga='igrac',
                klub_id=%s, klub_naziv_godisnjak=%s,
                slika_url=COALESCE(%s, slika_url),
                godina_rodenja=COALESCE(%s, godina_rodenja),
                slug=%s,
                source='hbs_savez', source_id=COALESCE(%s, source_id), source_url=%s, source_synced_at=now()
                WHERE id=%s""",
                (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
                 god, p['slug'], iskaznica, src_url, cid))
        else:
            cur.execute("""INSERT INTO pgz_sport.clanovi
                (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak, slika_url,
                godina_rodenja, slug, source, source_id, source_url, source_synced_at)
                VALUES (%s, %s, 'boćanje', 'igrac', %s, %s, %s, %s, %s, 'hbs_savez', %s, %s, now())
                RETURNING id""",
                (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
                 god, p['slug'], iskaznica, src_url))
            cid = cur.fetchone()[0]
    return cid
def upsert_voditelj(conn, name, klub_db_id, klub_naziv, role='trener'):
    """Insert or update a team leader (coach) in pgz_sport.clanovi.

    The person is matched by case-insensitive first name + surname within
    the sport. When several rows share the name, a row already attached to
    *klub_db_id* is preferred, then the lowest id, so the choice is stable
    between runs. A matched row gets its role overwritten and its club /
    source URL filled in only where they are still empty.

    Args:
        conn: Open database connection (a psycopg2 connection in this script).
        name: Full name; the first word is the first name, the rest the surname.
        klub_db_id: Database id of the club the person leads.
        klub_naziv: Club name stored in ``klub_naziv_godisnjak``.
        role: Value written to ``uloga``.

    Returns:
        The database id of the updated or inserted row, or None when *name*
        has fewer than two words.
    """
    parts = name.strip().split()
    if len(parts) < 2:
        return None
    ime, prezime = parts[0], " ".join(parts[1:])
    src_url = f"{BASE}/klubovi/"

    with conn.cursor() as cur:
        cur.execute("""SELECT id FROM pgz_sport.clanovi
            WHERE lower(ime)=lower(%s) AND lower(prezime)=lower(%s) AND sport='boćanje'
            ORDER BY (klub_id IS NOT DISTINCT FROM %s) DESC, id
            LIMIT 1""",
            (ime, prezime, klub_db_id))
        row = cur.fetchone()
        if row:
            cur.execute("""UPDATE pgz_sport.clanovi SET
                uloga=%s, klub_id=COALESCE(klub_id, %s),
                klub_naziv_godisnjak=COALESCE(klub_naziv_godisnjak, %s),
                source_url=COALESCE(source_url, %s)
                WHERE id=%s""",
                (role, klub_db_id, klub_naziv, src_url, row[0]))
            return row[0]
        cur.execute("""INSERT INTO pgz_sport.clanovi
            (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak,
            source, source_url, source_synced_at)
            VALUES (%s, %s, 'boćanje', %s, %s, %s, 'hbs_savez', %s, now())
            RETURNING id""",
            (ime, prezime, role, klub_db_id, klub_naziv, src_url))
        return cur.fetchone()[0]
def _sync_klub(conn, slug):
    """Scrape one club and store it with its team leaders and players.

    Returns the number of players stored, or None when the club page could
    not be fetched or parsed.
    """
    log(f"→ KLUB {slug}")
    h = fetch(f"{BASE}/klubovi/{slug}/")
    if not h:
        log(f" ✗ klub ne postoji ili 404")
        return None
    parsed = parse_klub(h, slug)
    if not parsed:
        log(f" ✗ ne mogu parse")
        return None
    kid = upsert_klub(conn, parsed)
    log(f" ✓ {parsed['naziv']} (db_id={kid}) igrača={len(parsed['igraci'])} voditelja={len(parsed['voditelji'])}")

    # Team leaders
    for v in parsed['voditelji']:
        try:
            upsert_voditelj(conn, v, kid, parsed['naziv'])
            log(f" ✓ voditelj: {v}")
        except Exception as e:
            log(f" ✗ voditelj {v}: {e}")

    # Players - fetch the profile page of each one
    stored = 0
    for ig in parsed['igraci']:
        time.sleep(DELAY)
        try:
            ph = fetch(f"{BASE}/igraci/{ig['slug']}/")
            if not ph:
                continue
            pdata = parse_igrac(ph, ig['slug'])
            if not pdata:
                continue
            # Use the name from the club list when the profile parser picked
            # up the federation banner instead of the player's name.
            if 'Fédération' in pdata.get('full_name', '') or pdata['ime'].lower() == 'fédération':
                pdata['full_name'] = ig['ime_prezime']
                parts = ig['ime_prezime'].split()
                pdata['ime'] = parts[0] if parts else ''
                pdata['prezime'] = ' '.join(parts[1:]) if len(parts) > 1 else ''
            # Card number and birth year from the club list fill any gaps.
            info = pdata.setdefault('info', {})
            if not info.get('iskaznica'):
                info['iskaznica'] = ig.get('iskaznica')
            if not info.get('godina_rodenja'):
                info['godina_rodenja'] = ig.get('godina_rodenja')
            cid = upsert_igrac(conn, pdata, kid, parsed['naziv'])
            stored += 1
            log(f" ✓ {pdata['ime']} {pdata['prezime']} (db={cid}, god={info.get('godina_rodenja')})")
        except Exception as e:
            log(f" ✗ igrač {ig['slug']}: {e}")
    return stored


def main():
    """Scrape every club in PGZ_HBS_CLUBS and store the results.

    A failure while processing one club is logged and the run continues
    with the next club. The database connection is closed even when the
    run is interrupted by an error.
    """
    conn = db()
    try:
        log(f"=== HBS scraper START - {len(PGZ_HBS_CLUBS)} kandidata ===")
        success = 0
        players_total = 0
        for slug in PGZ_HBS_CLUBS:
            try:
                stored = _sync_klub(conn, slug)
            except Exception as e:
                # db() opens the connection in autocommit mode, so a failed
                # statement leaves no open transaction to clean up.
                log(f" ✗ klub {slug}: {e}")
                stored = None
            if stored is not None:
                success += 1
                players_total += stored
            # Pause between clubs, also after a failed or missing one.
            time.sleep(DELAY)
        log(f"=== DONE: {success} klubova, {players_total} igrača ===")
    finally:
        conn.close()
if __name__ == "__main__":
main()