213 lines
8.1 KiB
Python
Executable File
213 lines
8.1 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Hrvatski boćarski savez (HBS) scraper.

Strategy:
  - For each PGŽ-region boćarski klub, try slug from naziv → fetch /klubovi/{slug}/
  - Parse "Popis igrača" section using regex: "N. E-XXXX, Ime Prezime, GGGG."
  - Upsert into clanovi with source='hbs_savez', source_id=<reg_broj>

Modes:
  python hbs_bocar.py probe <slug>        — fetch single klub
  python hbs_bocar.py klub <db_klub_id>   — scrape one klub by DB id
  python hbs_bocar.py all                 — sweep all PGŽ-region boćarski klubovi
"""
# NOTE: the docstring above must stay the first statement of the module.
# Python only binds __doc__ to a leading string literal, and the CLI prints
# __doc__ as its usage text; with the dotenv lines placed before it, __doc__
# was None and the usage output was the literal text "None".
from dotenv import load_dotenv

# auto-added by patch_scrapers_with_dotenv.sh
# Loads the shared environment file so DB_PASSWORD is available to the
# module-level DB settings defined further down.
load_dotenv('/opt/rinet-gpu/.env.master')
|
|
import os, re, sys, time, logging
|
|
from datetime import datetime, date
|
|
import psycopg2, psycopg2.extras
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# PostgreSQL connection settings; the password is read from the environment
# and a missing DB_PASSWORD raises KeyError at import time.
DB = {
    "host": 'localhost',
    "port": 5432,
    "dbname": 'rinet_v3',
    "user": 'rinet',
    "password": os.environ["DB_PASSWORD"],
}

# Root of the federation site; club pages are fetched from {BASE}/klubovi/{slug}/.
BASE = "https://hrvatski-bocarski-savez.hr"

# User-Agent header sent with every request.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"

RATE_S = 1.0   # seconds passed to time.sleep() by fetch()
TIMEOUT = 25   # timeout value passed to requests.get()

# Log to both the scraper log file and stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('/opt/pgz-sport/_logs/hbs_scraper.log'),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger("hbs")
|
|
|
|
def conn():
    """Open and return a new psycopg2 connection built from the module-level ``DB`` settings."""
    connection = psycopg2.connect(**DB)
    return connection
|
|
|
|
def fetch(url):
    """GET *url* with the bot User-Agent and return the response body as text.

    Every attempt, successful or not, is followed by a ``RATE_S`` second
    pause so that runs of failing requests (for example 404s while guessing
    club slugs) are throttled just like successful ones.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport-level failure raised
            by ``requests.get`` (including timeout after ``TIMEOUT`` seconds).
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        # Previously the sleep ran only after a successful response, so
        # error responses were not rate-limited at all.
        time.sleep(RATE_S)
|
|
|
|
def slugify(s):
    """Turn *s* into a lowercase, hyphen-separated ASCII slug.

    The text is lowercased and trimmed, the Croatian letters č/ć/š/ž/đ are
    folded to c/c/s/z/d, every remaining run of characters outside
    ``[a-z0-9]`` collapses to a single ``-``, and leading/trailing hyphens
    are removed.
    """
    folded = s.lower().strip().translate(
        str.maketrans({'č': 'c', 'ć': 'c', 'š': 's', 'ž': 'z', 'đ': 'd'})
    )
    return re.sub(r'[^a-z0-9]+', '-', folded).strip('-')
|
|
|
|
def naziv_to_slug_candidates(naziv):
    """Return de-duplicated slug guesses for a club name, in the order to try them.

    Candidates, in order:
      1. the name with one leading club prefix removed (if it has one),
      2. the full name,
      3. the name with every occurrence of 'boćarski' and 'klub' removed.

    Examples:
        'BK Halubjan'            → ['halubjan', 'bk-halubjan']
        'Boćarski klub Halubjan' → ['halubjan', 'bocarski-klub-halubjan']

    Args:
        naziv: Club name as stored in the database.

    Returns:
        List of non-empty, unique slugs (first occurrence wins).
    """
    # Strip first so stray leading whitespace cannot defeat prefix matching.
    n = naziv.lower().strip()
    candidates = []

    # Strip at most one leading prefix; longer/more specific prefixes come first.
    for prefix in ('boćarski klub', 'bocarski klub', 'b.k.', 'bk', 'b k', 'klub', 'društvo'):
        if not n.startswith(prefix):
            continue
        rest = n[len(prefix):]
        # Require a word boundary after a prefix that ends in a letter, so
        # that e.g. 'klubovi ...' or 'bk...' is not chopped mid-word.
        if prefix[-1].isalnum() and rest[:1].isalnum():
            continue
        candidates.append(slugify(rest))
        break

    candidates.append(slugify(n))
    candidates.append(slugify(n.replace('boćarski', '').replace('klub', '').strip()))

    # dict.fromkeys keeps first-seen order while dropping duplicates; empty
    # slugs (e.g. a name that consisted only of a prefix) are discarded.
    return list(dict.fromkeys(c for c in candidates if c))
|
|
|
|
# Roster entry: "N. E-XXXX, Ime Prezime, GGGG." — registration numbers appear
# in variants such as E-2755-11, E-02010, E-1317-04.
_PLAYER_RE = re.compile(r'(\d+)\.\s+(E-[\dA-Z\-]+),\s+([^,]+?),\s+(\d{4})\.?')

# Club meta fields. The value group is a bounded lazy match that stops at the
# next known label (or end of text). The earlier character class [^A-Z] could
# not match capitalised values such as "Primorsko-goranska" or "1. HBL", so
# these fields were silently dropped for such pages. The 120-character cap
# prevents a missing follow-up label from swallowing the rest of the page.
_ZUPANIJA_RE = re.compile(r'Županija:\s*(.{1,120}?)\s*(?=Liga|Adresa|$)', re.DOTALL)
_LIGA_RE = re.compile(r'Liga:\s*(.{1,120}?)\s*(?=Adresa|Sportske|$)', re.DOTALL)
_OIB_RE = re.compile(r'OIB:\s*(\d{11})')
_OSNOVAN_RE = re.compile(r'osnivanja:\s*(\d{4})')


def parse_klub_page(html, klub_url=None):
    """Parse an HBS club page into club metadata and a player list.

    Args:
        html: Raw HTML of the club page.
        klub_url: URL the page was fetched from; echoed back in the result.

    Returns:
        A dict with three keys:
          - ``klub_url``: the *klub_url* argument, unchanged.
          - ``players``: list of dicts with ``reg_broj``, ``ime``, ``prezime``
            and ``godina_rodenja`` (int), one per roster line matched.
          - ``meta``: dict that may contain ``naziv`` (text of the first
            ``<h1>``), ``zupanija``, ``liga``, ``oib`` (11 digits) and
            ``osnovan`` (int year); keys are present only when found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    meta = {}

    # Title — club name
    h1 = soup.find('h1')
    if h1:
        meta['naziv'] = h1.get_text(' ', strip=True)

    # Prefer the article content container and fall back to progressively
    # wider scopes; all matching below runs on its flattened text.
    body = soup.find(class_='entry-content') or soup.find('main') or soup.body or soup
    text = body.get_text(' ', strip=True)

    # Club meta
    m_zup = _ZUPANIJA_RE.search(text)
    if m_zup:
        meta['zupanija'] = m_zup.group(1).strip()
    m_lig = _LIGA_RE.search(text)
    if m_lig:
        meta['liga'] = m_lig.group(1).strip()
    m_oib = _OIB_RE.search(text)
    if m_oib:
        meta['oib'] = m_oib.group(1)
    m_god = _OSNOVAN_RE.search(text)
    if m_god:
        meta['osnovan'] = int(m_god.group(1))

    players = []
    for m in _PLAYER_RE.finditer(text):
        ime_full = m.group(3).strip()
        # The last word is taken as the surname, everything before it as the
        # given name(s); a single-word name is stored entirely in ``ime``.
        parts = ime_full.rsplit(' ', 1)
        if len(parts) > 1:
            ime, prezime = parts[0].strip(), parts[1]
        else:
            ime, prezime = ime_full, ''
        players.append({
            'reg_broj': m.group(2).strip(),
            'ime': ime,
            'prezime': prezime,
            'godina_rodenja': int(m.group(4)),
        })

    return {"klub_url": klub_url, "players": players, "meta": meta}
|
|
|
|
def cmd_klub(klub_id_db):
    """Scrape one club's HBS page and upsert its players into ``pgz_sport.clanovi``.

    The club name is read from ``pgz_sport.klubovi``; up to five slug guesses
    from ``naziv_to_slug_candidates`` are fetched in order and the first page
    that yields at least one player is used. Each player is matched on
    ``source='hbs_savez'`` + ``source_id=<reg_broj>`` and either updated or
    inserted. If the page exposes an OIB, the club row's empty ``oib`` /
    ``web_stranica`` fields are filled in as well.

    Args:
        klub_id_db: Primary key of the club in ``pgz_sport.klubovi``.

    Returns:
        Number of players inserted or updated; 0 when the club id does not
        exist or no candidate page listed any players.
    """
    from contextlib import closing

    # psycopg2's ``with connection`` only commits/rolls back the transaction;
    # it does not close the connection. closing() releases it, so a full
    # sweep no longer leaves two open connections behind per club.
    with closing(conn()) as c, c:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE id=%s", (klub_id_db,))
        klub = cu.fetchone()
    if not klub:
        log.error(f"Klub #{klub_id_db} not found")
        return 0

    candidates = naziv_to_slug_candidates(klub['naziv'])[:5]
    log.info(f"Klub: {klub['naziv']} candidates={candidates}")

    parsed = None
    used_slug = None
    for slug in candidates:
        url = f"{BASE}/klubovi/{slug}/"
        try:
            page = parse_klub_page(fetch(url), url)
        except requests.HTTPError as e:
            # A 404 just means this slug guess was wrong; stay quiet about it.
            if e.response is None or e.response.status_code != 404:
                log.warning(f" {slug}: {e}")
            continue
        except Exception as e:
            # Best effort: any other failure on one guess must not abort the
            # remaining guesses.
            log.warning(f" {slug}: {e}")
            continue
        if page.get('players'):
            parsed, used_slug = page, slug
            break
        # Page exists but lists no players — keep trying the other guesses.

    if not parsed:
        log.warning(f" → no match for {klub['naziv']} (tried {candidates})")
        return 0

    # Same URL for every player row and for the club row.
    klub_url = f"{BASE}/klubovi/{used_slug}/"

    # Upsert players
    n = 0
    with closing(conn()) as c, c:
        cu = c.cursor()
        for pl in parsed['players']:
            # source_id = reg_broj (HBS unique)
            cu.execute("""SELECT id FROM pgz_sport.clanovi
                          WHERE source='hbs_savez' AND source_id=%s""", (pl['reg_broj'],))
            row = cu.fetchone()
            if row:
                cu.execute("""UPDATE pgz_sport.clanovi
                              SET ime=%s, prezime=%s, klub_id=%s, source_url=%s, source_synced_at=now()
                              WHERE id=%s""",
                           (pl['ime'], pl['prezime'], klub_id_db, klub_url, row[0]))
            else:
                player_slug = slugify(pl['ime'] + ' ' + pl['prezime'])
                datum_aprox = f"{pl['godina_rodenja']}-01-01"  # only year known
                cu.execute("""INSERT INTO pgz_sport.clanovi
                              (klub_id, ime, prezime, datum_rodenja, source, source_id, source_url,
                               source_synced_at, slug, biografija)
                              VALUES (%s,%s,%s,%s,'hbs_savez',%s,%s,now(),%s,%s)""",
                           (klub_id_db, pl['ime'], pl['prezime'], datum_aprox,
                            pl['reg_broj'], klub_url, player_slug,
                            f"Reg. broj HBS: {pl['reg_broj']} · Godina rođenja: {pl['godina_rodenja']}"))
            n += 1

        # Fill in club OIB / web page if found; existing non-empty values win.
        oib = parsed['meta'].get('oib')
        if oib:
            cu.execute("""UPDATE pgz_sport.klubovi
                          SET oib=COALESCE(NULLIF(oib,''),%s),
                              web_stranica=COALESCE(NULLIF(web_stranica,''), %s),
                              source_synced_at=now()
                          WHERE id=%s""",
                       (oib, klub_url, klub_id_db))
        # Leaving the inner ``with c`` commits (or rolls back on error).

    log.info(f" → {n} igrača za {klub['naziv']} (slug={used_slug})")
    return n
|
|
|
|
def cmd_all():
    """Run ``cmd_klub`` for every active boćanje club and log a summary.

    Club ids are selected from ``pgz_sport.klubovi`` where
    ``sport='boćanje' AND aktivan=true`` (the query applies no further
    regional filter). A failure while processing one club is logged with its
    traceback and does not stop the sweep.
    """
    from contextlib import closing

    # closing() is needed because psycopg2's ``with connection`` ends the
    # transaction but leaves the connection open.
    with closing(conn()) as c, c:
        cu = c.cursor()
        cu.execute("""SELECT id FROM pgz_sport.klubovi
                      WHERE sport='boćanje' AND aktivan=true
                      ORDER BY id""")
        kids = [r[0] for r in cu.fetchall()]
    log.info(f"Sweeping {len(kids)} boćarski klubovi (PGŽ)")

    total = 0
    found_clubs = 0
    for kid in kids:
        try:
            n = cmd_klub(kid)
        except Exception as e:
            # Top-level boundary of the sweep: record the traceback and
            # carry on with the next club.
            log.exception(f"klub {kid}: {e}")
            continue
        total += n
        if n > 0:
            found_clubs += 1
    log.info(f"DONE: {total} igrača iz {found_clubs}/{len(kids)} klubova")
|
|
|
|
def _main(argv):
    """Dispatch the command line and return the process exit code.

    Args:
        argv: Full argument vector, program name first (``sys.argv``).

    Returns:
        0 on success, 1 when the mode or its required argument is missing
        (usage is printed), 2 for an unknown mode or a non-integer club id.
    """
    # __doc__ is None whenever the module docstring is not the first
    # statement of the file; fall back to a short usage line in that case.
    usage = __doc__ or "usage: hbs_bocar.py probe <slug> | klub <db_klub_id> | all"
    if len(argv) < 2:
        print(usage)
        return 1

    cmd = argv[1]
    # 'probe' and 'klub' need one argument; without this check they died
    # with an IndexError traceback.
    if cmd in ('probe', 'klub') and len(argv) < 3:
        print(usage)
        return 1

    if cmd == 'probe':
        import json
        html = fetch(f"{BASE}/klubovi/{argv[2]}/")
        out = parse_klub_page(html)
        print(json.dumps(out, ensure_ascii=False, indent=2))
    elif cmd == 'klub':
        try:
            klub_id = int(argv[2])
        except ValueError:
            print(f"klub id must be an integer: {argv[2]}")
            return 2
        cmd_klub(klub_id)
    elif cmd == 'all':
        cmd_all()
    else:
        print(f"unknown: {cmd}")
        return 2
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))
|