338 lines
15 KiB
Python
Executable File
338 lines
15 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
|
|
import os, re, sys, time, json, html, traceback, datetime as dt
|
|
import urllib.request, urllib.error
|
|
from urllib.parse import urljoin
|
|
import psycopg2
|
|
|
|
# PostgreSQL connection parameters passed to psycopg2.connect().
# The password comes from the environment; a missing DB_PASSWORD raises
# KeyError at import time.
# NOTE(review): the original call passed `port` twice (6432 and then 5432),
# which Python rejects with "SyntaxError: keyword argument repeated".
# The last value given (5432) is kept here - confirm whether 6432 was intended.
DB = dict(
    host="10.10.0.2",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ["DB_PASSWORD"],
)

# Root URL of the HBS (Hrvatski boćarski savez) site; club and player URLs are
# built from it.
BASE = "https://hrvatski-bocarski-savez.hr"

# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"

# Base pause in seconds between requests (fetch() waits twice this long
# before retrying a failed request).
DELAY = 1.2

# File that log() appends its lines to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
|
|
|
|
def log(msg):
    """Print a timestamped message and append it to the log file.

    The line is written to stdout (flushed immediately) and then appended to
    LOG_FP. Writing to the file is best-effort: if the file cannot be opened
    or written, the error is ignored so that logging never aborts a run.

    Args:
        msg: Text to log; it is prefixed with the current local ISO timestamp.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain non-ASCII text (club names, check
        # marks) that the locale's default encoding may be unable to encode.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Deliberately ignored - stdout already received the line.
        pass
|
|
|
|
def db():
    """Open a PostgreSQL connection from the DB settings with autocommit on."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
|
|
|
def fetch(url, retries=2):
    """Download `url` and return its body as text, or None on failure.

    The body is decoded as UTF-8 with undecodable bytes replaced. HTTP 404 and
    410 return None immediately without logging. Any other error is retried
    after a pause of DELAY * 2 seconds; once the last attempt fails the error
    is logged and None is returned.

    Args:
        url: Address to request.
        retries: Number of extra attempts after the first one.

    Returns:
        The decoded response body, or None.
    """
    final_attempt = retries
    for attempt in range(retries + 1):
        try:
            request = urllib.request.Request(
                url,
                headers={"User-Agent": UA, "Accept-Language": "hr,en"},
            )
            with urllib.request.urlopen(request, timeout=20) as response:
                raw = response.read()
                return raw.decode("utf-8", errors="replace")
        except urllib.error.HTTPError as http_err:
            # Missing pages are not worth retrying.
            if http_err.code in (404, 410):
                return None
            if attempt == final_attempt:
                log(f"HTTP {http_err.code} {url}")
                return None
        except Exception as err:
            if attempt == final_attempt:
                log(f"FETCH err {err} {url}")
                return None
        # Only reached after a failed, non-final attempt.
        time.sleep(DELAY * 2)
|
|
|
|
# === CLUB PARSER ===
def parse_klub(h, slug):
    """Extract club data from the HTML of an HBS club page.

    Args:
        h: HTML text of the club page; a falsy value yields None.
        slug: Club slug; copied unchanged into the result.

    Returns:
        None when `h` is empty or no usable club name is found. Otherwise a
        dict with the keys:
          - "slug": the `slug` argument
          - "naziv": club name
          - "logo": absolute logo URL, or None
          - "info": dict with any of zupanija, liga, adresa, sportske_grane,
            kontakt_osoba, telefon, oib
          - "igraci": list of dicts (slug, iskaznica, ime_prezime,
            godina_rodenja) taken from the player list
          - "voditelji": at most 10 distinct team-leader names
    """
    if not h:
        return None

    # Name: first <h3> that is neither the federation banner nor a sponsor
    # heading (per the original note, the first <h3> on the page is always
    # "Fédération Croate de Boules").
    naziv = None
    for cand in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        cand = html.unescape(cand.strip())
        if cand and 'Fédération' not in cand and 'sponzor' not in cand.lower() and len(cand) < 80:
            naziv = cand
            break
    if not naziv:
        return None

    # Logo
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
    logo = urljoin(BASE, m.group(1)) if m else None

    # "<strong>Label:</strong> value" bullets. Branch order matters because the
    # labels are matched by substring.
    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        # The value group cannot contain '<', so there are no tags to strip.
        val = html.unescape(m.group(2).strip())
        if 'županija' in key:
            info['zupanija'] = val
        elif 'liga' in key:
            info['liga'] = val
        elif 'adresa' in key:
            info['adresa'] = val
        elif 'sportske grane' in key or 'sportska grana' in key:
            info['sportske_grane'] = val
        elif 'osoba za kontakt' in key:
            info['kontakt_osoba'] = val
        elif 'tel' in key:
            info['telefon'] = val
        elif 'oib' in key:
            info['oib'] = val

    # Players - pattern:
    # <li><a href="...igraci/SLUG/">N. E-XX-YY, <strong>Name</strong>, YYYY.</a></li>
    igraci = [
        {
            "slug": m.group(1),
            "iskaznica": m.group(2).strip(),
            "ime_prezime": html.unescape(m.group(3).strip()),
            "godina_rodenja": int(m.group(4)),
        }
        for m in re.finditer(
            r'<li><a\s+href="https?://[^/]+/igraci/([\w\-]+)/?"[^>]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*<strong>([^<]+)</strong>,\s*(\d{4})\.?',
            h,
        )
    ]

    # Team leaders (coaches) - tab #popis_voditelja_ekipe
    voditelji = []
    vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:<div\s+(?:role|class)|</section>|<!--)', h, re.S)
    if vsec:
        section = vsec.group(1)
        for v in re.finditer(r'<p[^>]*>\s*([A-ZČĆĐŠŽ][\wčćđšžČĆĐŠŽ\s\-]{2,40}[A-ZČĆĐŠŽ][a-zčćđšž]+)\s*</p>', section):
            name = re.sub(r'\s+', ' ', v.group(1).strip())
            if len(name) > 4 and len(name.split()) >= 2 and 'Trenutno' not in name and name not in voditelji:
                voditelji.append(name)
        # Fallback when the names are not wrapped in <p> tags: treat every
        # text line whose first two words are capitalised as a name. Names are
        # whitespace-normalised and de-duplicated, like in the <p> path above.
        if not voditelji:
            text = re.sub(r'<[^>]+>', '\n', section)
            for line in text.split('\n'):
                parts = html.unescape(line).split()
                name = ' '.join(parts)
                if (len(name) > 4 and len(parts) >= 2 and 'Trenutno' not in name
                        and all(p[0].isupper() for p in parts[:2])
                        and name not in voditelji):
                    voditelji.append(name)

    return {
        "slug": slug,
        "naziv": naziv,
        "logo": logo,
        "info": info,
        "igraci": igraci,
        "voditelji": voditelji[:10],
    }
|
|
|
|
# === PLAYER PARSER ===
def _cell_text(cell_html):
    """Return a table cell's text with tags removed and entities decoded."""
    return html.unescape(re.sub(r'<[^>]+>', '', cell_html)).strip()


def parse_igrac(h, slug):
    """Extract player data from the HTML of an HBS player profile page.

    Args:
        h: HTML text of the profile page; a falsy value yields None.
        slug: Player slug; copied into the result and used to derive a name
            when the page has no usable heading.

    Returns:
        None when `h` is empty. Otherwise a dict with the keys:
          - "slug", "ime" (first word of the name), "prezime" (the rest),
            "full_name"
          - "slika_url": absolute photo URL, or None
          - "info": dict with any of iskaznica, godina_rodenja (int),
            maticni_klub
          - "karijera": list of dicts (datum_reg, klub, sportska_grana,
            sezona, lijecnicki) read from the "Sportski put" table
    """
    if not h:
        return None

    # Name: first <h3> with at least two words that is not the federation
    # banner or a heading containing "Sport".
    full_name = None
    for cand in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        cand = html.unescape(cand.strip())
        if cand and 'Fédération' not in cand and 'Sport' not in cand and len(cand) < 80 and len(cand.split()) >= 2:
            full_name = cand
            break
    if not full_name:
        # No heading found: derive a display name from the slug.
        full_name = slug.replace("-", " ").title()
    parts = full_name.split()
    ime = parts[0] if parts else ""
    prezime = " ".join(parts[1:]) if len(parts) > 1 else ""

    # Photo
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Igrač"', h)
    slika = urljoin(BASE, m.group(1)) if m else None

    # "<strong>Label:</strong> value" bullets.
    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        # The value group cannot contain '<', so there are no tags to strip.
        val = html.unescape(m.group(2).strip().rstrip('.'))
        if 'iskaznic' in key:
            info['iskaznica'] = val
        elif 'godina rođenja' in key:
            year = re.search(r'(\d{4})', val)
            if year:  # leave the key unset when no 4-digit year is present
                info['godina_rodenja'] = int(year.group(1))
        elif 'matični klub' in key:
            info['maticni_klub'] = val

    # Career ("Sportski put") - registration table
    karijera = []
    table_m = re.search(r'### Sportski put.*?</table>', h, re.S)
    if not table_m:
        table_m = re.search(r'Sportski put.*?</table>', h, re.S)
    if table_m:
        # Accept <tr> with attributes too, as the <td> pattern already does.
        rows = re.findall(r'<tr\b[^>]*>(.*?)</tr>', table_m.group(0), re.S)
        for r in rows[1:]:  # skip header
            cells = re.findall(r'<td[^>]*>(.*?)</td>', r, re.S)
            if len(cells) >= 4:
                karijera.append({
                    "datum_reg": _cell_text(cells[0]).rstrip('.'),
                    "klub": _cell_text(cells[1]),
                    "sportska_grana": _cell_text(cells[2]),
                    "sezona": _cell_text(cells[3]),
                    "lijecnicki": _cell_text(cells[4]).rstrip('.') if len(cells) > 4 else None,
                })

    return {
        "slug": slug,
        "ime": ime,
        "prezime": prezime,
        "full_name": full_name,
        "slika_url": slika,
        "info": info,
        "karijera": karijera,
    }
|
|
|
|
# PGŽ clubs - the actual slugs, taken from bocarski-savez-pgz.
# Each entry is the last path segment of the club's page under BASE/klubovi/.
PGZ_HBS_CLUBS = [
    # Senior clubs
    "kastav",
    "kostrena",
    "krenovac",
    "krimeja",
    "krk",
    "lovran",
    "opatija",
    "rijeka-2",
    "srdoci",
    "sveti-jakov",
    "sveti-rok-klana",
    "vargon",
    "hreljin",
    "draga-moscenicka-draga",
    "lovranska-draga",
    "brod-moravice",
    # Women's clubs
    "zenski-bocarski-klub-cavle",
    "zenski-bocarski-klub-drenova-rijeka",
    "zenski-bocarski-klub-hreljin",
    "zenski-bocarski-klub-kastav",
    "zenska-bocarska-ekipa-kastav-2",
    # Cadet / junior teams (youth categories)
    "cavle-skola-bocanja",
    "juniorska-ekipa-cavle-sb-1",
    "juniorska-ekipa-kastav",
    "juniorska-ekipa-lovran",
    "juniorska-ekipa-sv-rok-klana",
    "juniorska-ekipa-vargon",
    "kadetska-ekipa-bk-cavle-sb-2",
    "kadetska-ekipa-bk-kastav-2",
    "kadetska-ekipa-bk-lovran",
    "kadetska-ekipa-bk-sveti-jakov-2",
    "kadetska-ekipa-bk-vargon",
    "kadetska-ekipa-kastav",
    "kadetska-ekipa-zbk-drenova",
]
|
|
|
|
def _like_escape(text):
    """Escape LIKE/ILIKE metacharacters so `text` is matched literally."""
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _grad_from_adresa(adresa):
    """Guess the town from an address: its trailing one or two alphabetic words.

    Digits are excluded so that a postal code in front of the town
    ("..., 51215 Kastav") does not end up in the result. Returns None when the
    address does not end in a word.
    """
    m = re.search(r'([^\W\d_]+(?:\s+[^\W\d_]+)?)$', adresa.strip())
    return m.group(1) if m else None


def upsert_klub(conn, k):
    """Insert or update a club row in pgz_sport.klubovi and return its id.

    An existing row is looked up first by the "hbs:<slug>" tag in `napomena`,
    then by sport + case-insensitive name. A found row gets address, phone and
    town refreshed (only with non-NULL values); otherwise a new row is inserted.

    Args:
        conn: Open database connection (anything providing .cursor()).
        k: Parsed club dict as returned by parse_klub(); reads "slug", "naziv"
            and the optional "info" dict.

    Returns:
        The id of the updated or inserted row.
    """
    cur = conn.cursor()
    slug = k['slug']
    naziv = k['naziv']
    info = k.get('info', {})

    # The tag is always written as "hbs:<slug>]" (update) or "hbs:<slug>,"
    # (insert). Matching with that terminator keeps "lovran" from matching a
    # row tagged "hbs:lovranska-draga".
    esc = _like_escape(slug)
    tag_patterns = (f"%hbs:{esc}]%", f"%hbs:{esc},%")

    # Check by hbs slug in napomena, then by sport+naziv
    cur.execute(
        "SELECT id FROM pgz_sport.klubovi"
        " WHERE napomena ILIKE %s OR napomena ILIKE %s LIMIT 1",
        tag_patterns,
    )
    row = cur.fetchone()
    if not row:
        cur.execute(
            "SELECT id FROM pgz_sport.klubovi WHERE sport='boćanje' AND lower(naziv)=lower(%s) LIMIT 1",
            (naziv,),
        )
        row = cur.fetchone()

    adresa = info.get('adresa')
    grad = _grad_from_adresa(adresa) if adresa else None

    if row:
        kid = row[0]
        # The sync tag is appended only once per slug; re-appending it on every
        # run would make napomena grow without bound.
        cur.execute(
            """UPDATE pgz_sport.klubovi SET
                   adresa=COALESCE(%s, adresa),
                   telefon=COALESCE(%s, telefon),
                   grad=COALESCE(%s, grad),
                   napomena=CASE
                       WHEN napomena ILIKE %s OR napomena ILIKE %s THEN napomena
                       ELSE COALESCE(napomena,'') || ' [HBS sync ' || CURRENT_DATE || ': hbs:' || %s || ']'
                   END
               WHERE id=%s""",
            (adresa, info.get('telefon'), grad, *tag_patterns, slug, kid),
        )
    else:
        cur.execute(
            """INSERT INTO pgz_sport.klubovi
                   (naziv, sport, region, grad, adresa, telefon, aktivan, napomena)
               VALUES (%s, 'boćanje', 'PGŽ', %s, %s, %s, true, %s)
               RETURNING id""",
            (naziv, grad, adresa, info.get('telefon'),
             f"[HBS sync {dt.date.today()}: hbs:{slug}, OIB:{info.get('oib','-')}, liga:{info.get('liga','-')}]"),
        )
        kid = cur.fetchone()[0]
    return kid
|
|
|
|
def upsert_igrac(conn, p, klub_db_id, klub_naziv):
    """Insert or update a player row in pgz_sport.clanovi and return its id.

    An existing row is looked up by the HBS card number (stored as source_id);
    if that finds nothing - or the player has no card number - the HBS profile
    URL is used instead, so repeated runs do not insert the same profile twice.

    Args:
        conn: Open database connection (anything providing .cursor()).
        p: Parsed player dict as returned by parse_igrac(); reads "slug",
            "ime", "prezime", "slika_url" and info["iskaznica"] /
            info["godina_rodenja"].
        klub_db_id: Database id of the player's club.
        klub_naziv: Club name stored alongside the player.

    Returns:
        The id of the updated or inserted row.
    """
    cur = conn.cursor()
    info = p.get('info', {})
    iskaznica = (info.get('iskaznica') or '').strip()
    god = info.get('godina_rodenja')
    src_url = f"{BASE}/igraci/{p['slug']}/"

    # Check by iskaznica (HBS unique ID)
    cid = None
    if iskaznica:
        cur.execute(
            "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_id=%s",
            (iskaznica,),
        )
        row = cur.fetchone()
        if row:
            cid = row[0]

    # Fallback: same HBS profile URL means the same player.
    if cid is None:
        cur.execute(
            "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_url=%s LIMIT 1",
            (src_url,),
        )
        row = cur.fetchone()
        if row:
            cid = row[0]

    if cid is not None:
        # NULLIF keeps a stored card number from being wiped by an empty one.
        cur.execute(
            """UPDATE pgz_sport.clanovi SET
                   ime=%s, prezime=%s, sport='boćanje', uloga='igrac',
                   klub_id=%s, klub_naziv_godisnjak=%s,
                   slika_url=COALESCE(%s, slika_url),
                   godina_rodenja=COALESCE(%s, godina_rodenja),
                   slug=%s,
                   source='hbs_savez', source_id=COALESCE(NULLIF(%s, ''), source_id),
                   source_url=%s, source_synced_at=now()
               WHERE id=%s""",
            (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
             god, p['slug'], iskaznica, src_url, cid),
        )
    else:
        cur.execute(
            """INSERT INTO pgz_sport.clanovi
                   (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak, slika_url,
                    godina_rodenja, slug, source, source_id, source_url, source_synced_at)
               VALUES (%s, %s, 'boćanje', 'igrac', %s, %s, %s, %s, %s, 'hbs_savez', %s, %s, now())
               RETURNING id""",
            (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
             god, p['slug'], iskaznica, src_url),
        )
        cid = cur.fetchone()[0]
    return cid
|
|
|
|
def upsert_voditelj(conn, name, klub_db_id, klub_naziv, role='trener'):
    """Store a team leader (coach) in pgz_sport.clanovi and return the row id.

    The name is split into first word (ime) and the rest (prezime). A row with
    the same name (case-insensitive) and sport is updated: its role is
    overwritten, while club and source URL are filled only where still NULL.
    Otherwise a new row is inserted.

    Returns:
        The row id, or None when `name` has fewer than two words.
    """
    cursor = conn.cursor()

    words = name.split()
    if len(words) < 2:
        return None
    ime, *surname_words = words
    prezime = " ".join(surname_words)
    klubovi_url = f"{BASE}/klubovi/"

    cursor.execute(
        """SELECT id FROM pgz_sport.clanovi
           WHERE lower(ime)=lower(%s) AND lower(prezime)=lower(%s) AND sport='boćanje'""",
        (ime, prezime),
    )
    existing = cursor.fetchone()

    if not existing:
        cursor.execute(
            """INSERT INTO pgz_sport.clanovi
                   (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak,
                    source, source_url, source_synced_at)
               VALUES (%s, %s, 'boćanje', %s, %s, %s, 'hbs_savez', %s, now())
               RETURNING id""",
            (ime, prezime, role, klub_db_id, klub_naziv, klubovi_url),
        )
        return cursor.fetchone()[0]

    member_id = existing[0]
    cursor.execute(
        """UPDATE pgz_sport.clanovi SET
               uloga=%s, klub_id=COALESCE(klub_id, %s),
               klub_naziv_godisnjak=COALESCE(klub_naziv_godisnjak, %s),
               source_url=COALESCE(source_url, %s)
           WHERE id=%s""",
        (role, klub_db_id, klub_naziv, klubovi_url, member_id),
    )
    return member_id
|
|
|
|
def _fill_from_roster(pdata, ig):
    """Fill gaps in a parsed player profile from the club's player-list entry."""
    # Override with the list entry if the parser picked up the federation banner.
    if 'Fédération' in pdata.get('full_name', '') or pdata['ime'].lower() == 'fédération':
        pdata['full_name'] = ig['ime_prezime']
        parts = ig['ime_prezime'].split()
        pdata['ime'] = parts[0] if parts else ''
        pdata['prezime'] = ' '.join(parts[1:]) if len(parts) > 1 else ''
    info = pdata.setdefault('info', {})
    # Card number and birth year from the list (the list always carries them).
    if not info.get('iskaznica'):
        info['iskaznica'] = ig.get('iskaznica')
    if not info.get('godina_rodenja'):
        info['godina_rodenja'] = ig.get('godina_rodenja')


def _sync_klub(conn, slug):
    """Scrape one club page and store the club, its team leaders and players.

    Returns the number of players stored, or None when the club was skipped
    (page missing, unparsable, or the club row could not be written).
    """
    log(f"→ KLUB {slug}")
    h = fetch(f"{BASE}/klubovi/{slug}/")
    if not h:
        log(" ✗ klub ne postoji ili 404")
        return None

    parsed = parse_klub(h, slug)
    if not parsed:
        log(" ✗ ne mogu parse")
        return None

    # A database error on one club must not abort the remaining clubs.
    try:
        kid = upsert_klub(conn, parsed)
    except Exception as e:
        log(f" ✗ klub {slug}: {e}")
        return None
    log(f" ✓ {parsed['naziv']} (db_id={kid}) igrača={len(parsed['igraci'])} voditelja={len(parsed['voditelji'])}")

    # Team leaders
    for v in parsed['voditelji']:
        try:
            upsert_voditelj(conn, v, kid, parsed['naziv'])
            log(f" ✓ voditelj: {v}")
        except Exception as e:
            log(f" ✗ voditelj {v}: {e}")

    # Players - fetch the profile page of each one
    stored = 0
    for ig in parsed['igraci']:
        time.sleep(DELAY)
        try:
            ph = fetch(f"{BASE}/igraci/{ig['slug']}/")
            if not ph:
                continue
            pdata = parse_igrac(ph, ig['slug'])
            if not pdata:
                continue
            _fill_from_roster(pdata, ig)
            cid = upsert_igrac(conn, pdata, kid, parsed['naziv'])
            stored += 1
            god = pdata.get('info', {}).get('godina_rodenja')
            log(f" ✓ {pdata['ime']} {pdata['prezime']} (db={cid}, god={god})")
        except Exception as e:
            log(f" ✗ igrač {ig['slug']}: {e}")
    return stored


def main():
    """Scrape every club in PGZ_HBS_CLUBS and sync clubs and members to the DB.

    Waits DELAY seconds after every club (including skipped ones) and before
    every player request. The database connection is closed even when the run
    ends with an exception.
    """
    conn = db()
    try:
        log(f"=== HBS scraper START - {len(PGZ_HBS_CLUBS)} kandidata ===")

        success = 0
        players_total = 0
        for slug in PGZ_HBS_CLUBS:
            stored = _sync_klub(conn, slug)
            if stored is not None:
                success += 1
                players_total += stored
            time.sleep(DELAY)

        log(f"=== DONE: {success} klubova, {players_total} igrača ===")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|