#!/usr/bin/env python3
"""
Hrvatski boćarski savez (HBS) scraper.

Strategy:
  - For each PGŽ-region bocce club, derive slug candidates from its name
    and fetch /klubovi/{slug}/
  - Parse the player list ("Popis igrača") using the regex
    "N. E-XXXX, Ime Prezime, GGGG."
  - Upsert into clanovi with source='hbs_savez', source_id=reg_broj

Modes:
  python hbs_bocar.py probe SLUG     — fetch and parse a single club page
  python hbs_bocar.py klub KLUB_ID   — scrape one club by DB id
  python hbs_bocar.py all            — sweep all PGŽ-region bocce clubs
"""
import contextlib
import json
import logging
import os
import re
import sys
import time
from datetime import datetime, date

import psycopg2
import psycopg2.extras
import requests
from bs4 import BeautifulSoup

DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ["DB_PASSWORD"])
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
RATE_S = 1.0   # seconds to pause after every HTTP request
TIMEOUT = 25   # per-request timeout in seconds

log = logging.getLogger("hbs")
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler('/opt/pgz-sport/_logs/hbs_scraper.log'),
        logging.StreamHandler(sys.stdout),
    ],
)

# Croatian diacritics folded to ASCII before slug generation.
_DIACRITICS = str.maketrans({'č': 'c', 'ć': 'c', 'š': 's', 'ž': 'z', 'đ': 'd'})

# Leading words removed from a club name to form the first slug candidate.
# Order matters: the first matching prefix wins.
_NAME_PREFIXES = ('boćarski klub', 'bocarski klub', 'b.k.', 'bk', 'b k', 'klub', 'društvo')

# Player entry pattern: "N. E-XXXX, Ime Prezime, GGGG."
# Registration-number variants: E-2755-11, E-02010, E-1317-04, etc.
PLAYER_RE = re.compile(r'(\d+)\.\s+(E-[\dA-Z\-]+),\s+([^,]+?),\s+(\d{4})\.?', re.UNICODE)


def conn():
    """Open a new psycopg2 connection using the module-level ``DB`` settings.

    The caller owns the connection and must close it. psycopg2's
    ``with connection:`` only ends the transaction, it does not close.
    """
    return psycopg2.connect(**DB)


def fetch(url):
    """GET ``url`` and return the response body as text.

    Raises ``requests.HTTPError`` for non-2xx responses (via
    ``raise_for_status``) and other ``requests`` exceptions on transport
    failures.
    """
    log.info("GET %s", url)
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        # Pause after failed requests too: slug probing mostly yields 404s,
        # and those must be rate-limited as well.
        time.sleep(RATE_S)


def slugify(s):
    """Return a lowercase ASCII slug: diacritics folded, other runs → '-'."""
    s = s.lower().strip().translate(_DIACRITICS)
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')


def naziv_to_slug_candidates(naziv):
    """Return de-duplicated slug candidates for a club name, in trial order.

    Candidates are, in order: the name with the first matching entry of
    ``_NAME_PREFIXES`` removed, the full name, and the name with the words
    'boćarski' and 'klub' removed. Empty slugs and repeats are dropped.

    Example: 'BK Halubjan' → ['halubjan', 'bk-halubjan'].
    """
    n = naziv.lower().strip()
    candidates = []
    for prefix in _NAME_PREFIXES:
        if n.startswith(prefix):
            candidates.append(slugify(n[len(prefix):].strip()))
            break
    candidates.append(slugify(n))
    candidates.append(slugify(n.replace('boćarski', '').replace('klub', '').strip()))
    # dict.fromkeys keeps first-seen order while removing duplicates.
    return [c for c in dict.fromkeys(candidates) if c]


def parse_klub_page(html, klub_url=None):
    """Parse a club page into ``{"klub_url", "players", "meta"}``.

    ``players`` is a list of dicts with keys ``reg_broj``, ``ime``,
    ``prezime`` and ``godina_rodenja`` (int). ``meta`` may contain
    ``naziv``, ``zupanija``, ``liga``, ``oib`` and ``osnovan`` (int),
    each present only when found in the page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = {"klub_url": klub_url, "players": [], "meta": {}}
    meta = out['meta']

    # Title — club name
    h1 = soup.find('h1')
    if h1:
        meta['naziv'] = h1.get_text(' ', strip=True)

    body = (soup.find(class_='entry-content') or soup.find('main') or soup.body or soup)
    text = body.get_text(' ', strip=True)

    # Club metadata.
    # NOTE(review): [^A-Z] rejects ASCII capitals, so a value that begins with
    # one (e.g. "Primorsko-goranska") cannot match these two patterns —
    # confirm against a live page before relying on zupanija/liga.
    m_zup = re.search(r'Županija:\s*([^A-Z]+?)(?=Liga|Adresa|$)', text)
    if m_zup:
        meta['zupanija'] = m_zup.group(1).strip()
    m_lig = re.search(r'Liga:\s*([^A-Z]+?)(?=Adresa|Sportske|$)', text)
    if m_lig:
        meta['liga'] = m_lig.group(1).strip()
    m_oib = re.search(r'OIB:\s*(\d{11})', text)
    if m_oib:
        meta['oib'] = m_oib.group(1)
    m_god = re.search(r'osnivanja:\s*(\d{4})', text)
    if m_god:
        meta['osnovan'] = int(m_god.group(1))

    for m in PLAYER_RE.finditer(text):
        ime_full = m.group(3).strip()
        # The last space-separated word is the surname; a single-word name
        # is stored entirely as the given name.
        parts = ime_full.rsplit(' ', 1)
        if len(parts) == 2:
            ime, prezime = parts
        else:
            ime, prezime = ime_full, ''
        out['players'].append({
            'reg_broj': m.group(2).strip(),
            'ime': ime,
            'prezime': prezime,
            'godina_rodenja': int(m.group(4)),
        })
    return out


def _find_klub_page(candidates):
    """Try each slug in turn; return ``(parsed, slug)`` for the first page
    that lists players, or ``(None, None)`` when none does."""
    for slug in candidates:
        url = f"{BASE}/klubovi/{slug}/"
        try:
            parsed = parse_klub_page(fetch(url), url)
        except requests.HTTPError as e:
            # 404 is the expected outcome for a wrong guess; stay quiet.
            if getattr(e.response, 'status_code', None) != 404:
                log.warning(" %s: %s", slug, e)
            continue
        except Exception as e:  # best effort: one bad candidate must not stop the search
            log.warning(" %s: %s", slug, e)
            continue
        if parsed.get('players'):
            return parsed, slug
        # A page without players is not a match — keep searching.
    return None, None


def _upsert_players(cu, klub_id_db, players, url):
    """Insert or update each player keyed on (source='hbs_savez', reg_broj);
    return the number of players processed."""
    n = 0
    for pl in players:
        # source_id = reg_broj (HBS unique)
        cu.execute(
            """SELECT id FROM pgz_sport.clanovi
               WHERE source='hbs_savez' AND source_id=%s""",
            (pl['reg_broj'],))
        row = cu.fetchone()
        if row:
            cu.execute(
                """UPDATE pgz_sport.clanovi
                   SET ime=%s, prezime=%s, klub_id=%s, source_url=%s,
                       source_synced_at=now()
                   WHERE id=%s""",
                (pl['ime'], pl['prezime'], klub_id_db, url, row[0]))
        else:
            clan_slug = slugify(pl['ime'] + ' ' + pl['prezime'])
            datum_aprox = f"{pl['godina_rodenja']}-01-01"  # only year known
            cu.execute(
                """INSERT INTO pgz_sport.clanovi
                   (klub_id, ime, prezime, datum_rodenja, source, source_id,
                    source_url, source_synced_at, slug, biografija)
                   VALUES (%s,%s,%s,%s,'hbs_savez',%s,%s,now(),%s,%s)""",
                (klub_id_db, pl['ime'], pl['prezime'], datum_aprox,
                 pl['reg_broj'], url, clan_slug,
                 f"Reg. broj HBS: {pl['reg_broj']} · Godina rođenja: {pl['godina_rodenja']}"))
        n += 1
    return n


def cmd_klub(klub_id_db):
    """Scrape one club by its DB id and upsert its players.

    Returns the number of players upserted; 0 when the club id is unknown
    or no candidate page with players was found.
    """
    # closing() releases the connection before the slow HTTP phase starts.
    with contextlib.closing(conn()) as c:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
            cu.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE id=%s", (klub_id_db,))
            klub = cu.fetchone()
    if not klub:
        log.error("Klub #%s not found", klub_id_db)
        return 0

    candidates = naziv_to_slug_candidates(klub['naziv'])[:5]
    log.info("Klub: %s candidates=%s", klub['naziv'], candidates)

    parsed, used_slug = _find_klub_page(candidates)
    if not parsed:
        log.warning(" → no match for %s (tried %s)", klub['naziv'], candidates)
        return 0

    url = f"{BASE}/klubovi/{used_slug}/"
    with contextlib.closing(conn()) as c:
        # `with c` commits on success and rolls back on error.
        with c, c.cursor() as cu:
            n = _upsert_players(cu, klub_id_db, parsed['players'], url)
            # Fill in club OIB / website only where the DB has none yet.
            if parsed['meta'].get('oib'):
                cu.execute(
                    """UPDATE pgz_sport.klubovi
                       SET oib=COALESCE(NULLIF(oib,''),%s),
                           web_stranica=COALESCE(NULLIF(web_stranica,''), %s),
                           source_synced_at=now()
                       WHERE id=%s""",
                    (parsed['meta']['oib'], url, klub_id_db))
    log.info(" → %s igrača za %s (slug=%s)", n, klub['naziv'], used_slug)
    return n


def cmd_all():
    """Run ``cmd_klub`` for every active club with sport='boćanje'.

    A failure on one club is logged and does not stop the sweep.
    """
    with contextlib.closing(conn()) as c:
        with c.cursor() as cu:
            cu.execute(
                """SELECT id FROM pgz_sport.klubovi
                   WHERE sport='boćanje' AND aktivan=true ORDER BY id""")
            kids = [r[0] for r in cu.fetchall()]
    log.info("Sweeping %s boćarski klubovi (PGŽ)", len(kids))

    total = 0
    found_clubs = 0
    for kid in kids:
        try:
            n = cmd_klub(kid)
        except Exception as e:
            log.exception("klub %s: %s", kid, e)
            continue
        total += n
        if n > 0:
            found_clubs += 1
    log.info("DONE: %s igrača iz %s/%s klubova", total, found_clubs, len(kids))


def main(argv):
    """Dispatch on ``argv[1]``; return the process exit code."""
    if len(argv) < 2:
        print(__doc__)
        return 1
    cmd = argv[1]
    # probe/klub need one more argument; show usage instead of IndexError.
    if cmd in ('probe', 'klub') and len(argv) < 3:
        print(__doc__)
        return 1
    if cmd == 'probe':
        html = fetch(f"{BASE}/klubovi/{argv[2]}/")
        print(json.dumps(parse_klub_page(html), ensure_ascii=False, indent=2))
    elif cmd == 'klub':
        cmd_klub(int(argv[2]))
    elif cmd == 'all':
        cmd_all()
    else:
        print(f"unknown: {cmd}")
        return 2
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))