#!/usr/bin/env python3
"""Sports table scrapers - HOS, HKS, HRS, HVS, HBS.

Downloads league-standings pages, extracts one row per club from the HTML
tables and stores the rows in ``pgz_sport.natjecanje_tablica``.

The script runs its whole workflow at module level (there is no ``__main__``
guard), so importing it has the same effect as executing it.
"""
import os
import re
import time
from urllib.parse import urljoin

import psycopg2
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

# DB_PASSWORD must be present in the environment (a KeyError is raised otherwise).
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"

HBS_LEAGUES_URL = "https://hrvatski-bocarski-savez.hr/lige/"

# Competition that is scraped from a pinned URL with the HOS-specific parser.
HOS_NAT_ID = 409
HOS_URL = "https://hos-cvf.hr/natjecanje.php?id=2807"

# (URL substrings, source tag) pairs; the first pair with a matching substring wins.
SOURCE_BY_URL = (
    (('hos-cvf',), 'hos_cvf'),
    (('hks-cbf',), 'hks_cbf'),
    (('hrs.hr',), 'hrs'),
    (('hvs.hr', 'hnvs'), 'hvs'),
    (('bocarski', 'hbs'), 'hbs'),
)

_NON_DIGIT_RE = re.compile(r'[^\d]')
_RANK_PREFIX_RE = re.compile(r'^\d+\.?\s*')
_NUMERIC_CELL_RE = re.compile(r'^[\d\-+]+$')

_DELETE_SQL = "DELETE FROM pgz_sport.natjecanje_tablica WHERE natjecanje_id=%s AND source=%s"
_UPSERT_SQL = (
    "INSERT INTO pgz_sport.natjecanje_tablica "
    "(natjecanje_id,rang,klub_naziv,utakmica,pobjede,nerijeseno,porazi,golovi_za,golovi_protiv,bodovi,source) "
    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "
    "ON CONFLICT(natjecanje_id,klub_naziv) DO UPDATE SET "
    "rang=EXCLUDED.rang,utakmica=EXCLUDED.utakmica,pobjede=EXCLUDED.pobjede, "
    "nerijeseno=EXCLUDED.nerijeseno,porazi=EXCLUDED.porazi, "
    "golovi_za=EXCLUDED.golovi_za,golovi_protiv=EXCLUDED.golovi_protiv, "
    "bodovi=EXCLUDED.bodovi,source=EXCLUDED.source,scraped_at=NOW()"
)


def get(url):
    """Fetch *url* and return the response body parsed with ``html.parser``.

    Raises whatever ``requests`` raises for transport errors, and an HTTP
    error for non-success status codes (via ``raise_for_status``).
    """
    r = requests.get(url, headers={"User-Agent": UA}, timeout=20)
    r.raise_for_status()
    time.sleep(1.5)  # fixed pause after every successful request
    return BeautifulSoup(r.text, "html.parser")


def si(s):
    """Return the digits contained in *s* as an int, or 0 if there are none.

    Every non-digit character is removed before parsing, so signs and
    separators are dropped: ``"-5"`` gives 5 and ``"24-3"`` gives 243.
    """
    try:
        return int(_NON_DIGIT_RE.sub('', str(s)) or '0')
    except ValueError:
        return 0


def save(nat_id, rows, src):
    """Replace the stored standings of competition *nat_id* for source *src*.

    Existing rows with the same competition and source are deleted, then each
    entry of *rows* is upserted with its 1-based position as ``rang``.
    Returns the number of rows written; an empty *rows* writes nothing and
    returns 0.
    """
    if not rows:
        return 0
    params = [
        (nat_id, pos, r['k'], r.get('u', 0), r.get('p', 0), r.get('n', 0),
         r.get('i', 0), r.get('gf', 0), r.get('ga', 0), r.get('bod', 0), src)
        for pos, r in enumerate(rows, start=1)
    ]
    conn = psycopg2.connect(DSN)
    try:
        cur = conn.cursor()
        cur.execute(_DELETE_SQL, (nat_id, src))
        cur.executemany(_UPSERT_SQL, params)
        conn.commit()
    finally:
        # Closing without a commit discards the pending transaction, so a
        # failure never leaves the table half-replaced or the connection open.
        conn.close()
    return len(rows)


def _standing_row(klub, nums):
    """Build a standings row from a club name and the row's numeric cells.

    Cells are assigned by position (u, p, n, i, gf, ga); missing positions
    become 0 and ``bod`` is always the last numeric cell.
    """
    def at(idx):
        return nums[idx] if len(nums) > idx else 0

    return {
        'k': klub,
        'u': at(0),
        'p': at(1),
        'n': at(2),
        'i': at(3),
        'gf': at(4),
        'ga': at(5),
        'bod': nums[-1] if nums else 0,
    }


def _hos_cell_is_numeric(text):
    """Return True if a HOS table cell should be treated as a number."""
    if text.lstrip('-').lstrip('+').isdigit():
        return True
    return bool(text and _NUMERIC_CELL_RE.match(text))


# ─── HOS Odbojka ─────────────────────────────
def scrape_hos(nat_id, url):
    """Scrape a HOS standings page and store it under source ``hos_cvf``.

    The first table whose header row mentions "ekipa" or "klub" and that
    yields at least one club row is used. Returns the number of saved rows.
    """
    print(f"HOS {nat_id}: {url}")
    s = get(url)
    rows = []
    for tbl in s.find_all('table'):
        trs = tbl.find_all('tr')
        header_cells = (
            [c.get_text(strip=True) for c in trs[0].find_all(['th', 'td'])]
            if trs else []
        )
        # Find table with EKIPA column
        hdrs = ' '.join(t.lower() for t in header_cells)
        if 'ekipa' not in hdrs and 'klub' not in hdrs:
            continue

        # Find the club-name column; the numeric columns are taken by position.
        ths = [t.upper() for t in header_cells]
        ki = None
        for wanted in ('EKIPA', 'KLUB', 'TIM'):
            ki = next((i for i, h in enumerate(ths) if wanted in h), None)
            if ki is not None:
                break
        if ki is None:
            continue

        for tr in trs[1:]:
            tds = tr.find_all(['td', 'th'])
            if len(tds) <= ki:
                continue
            # Clean rank prefix
            klub = _RANK_PREFIX_RE.sub('', tds[ki].get_text(strip=True)).strip()
            if len(klub) < 3:
                continue
            cell_texts = [td.get_text(strip=True) for td in tds]
            nums = [si(t) for t in cell_texts if _hos_cell_is_numeric(t)]
            rows.append(_standing_row(klub, nums))
        if rows:
            break
    n = save(nat_id, rows, 'hos_cvf')
    print(f" => {n} ekipa")
    return n


def _generic_club_name(texts):
    """Pick the club name from the cell texts of one table row.

    Candidates are cells longer than 3 characters that are not plain
    numbers. The first candidate that is still longer than 3 characters after
    removing a leading rank ("12. ") wins.

    NOTE(review): if no candidate is long enough, the last shorter candidate
    (possibly an empty string) is returned; this mirrors the original loop.
    """
    klub = None
    for t in texts:
        if len(t) <= 3 or t.lstrip('+-').lstrip('.').isdigit():
            continue
        klub = _RANK_PREFIX_RE.sub('', t).strip()
        if len(klub) > 3:
            break
    return klub


# ─── Generic standings scraper ─────────────────
def scrape_generic(nat_id, url, src):
    """Scrape a standings page heuristically and store it under source *src*.

    Every table with at least 4 rows is parsed; the table that produces the
    most club rows is saved. Returns the number of saved rows.
    """
    print(f"{src} {nat_id}: {url}")
    s = get(url)
    best = []
    for tbl in s.find_all('table'):
        trs = tbl.find_all('tr')
        if len(trs) < 4:
            continue
        rows = []
        for tr in trs[1:]:
            tds = tr.find_all(['td', 'th'])
            if len(tds) < 5:
                continue
            texts = [t.get_text(strip=True) for t in tds]
            klub = _generic_club_name(texts)
            if not klub:
                continue
            # Only short (< 5 chars) cells made of digits and +/- count as numbers.
            nums = [si(t) for t in texts if _NUMERIC_CELL_RE.match(t) and len(t) < 5]
            if len(nums) < 2:
                continue
            rows.append(_standing_row(klub, nums))
        if len(rows) > len(best):
            best = rows
    n = save(nat_id, best, src)
    print(f" => {n} ekipa")
    return n


# ─── HBS Bocanje - get all leagues ─────────────
def scrape_hbs_all():
    """List league links from the HBS leagues page.

    Returns the ``(link text, absolute URL)`` pairs whose text mentions one
    of the PGZ-related keywords. Nothing is written to the database.
    """
    print("HBS Bocanje - all leagues")
    s = get(HBS_LEAGUES_URL)
    links = []
    for a in s.find_all('a', href=True):
        href = a['href']
        if 'lig' not in href.lower():
            continue
        text = a.get_text(strip=True)
        if len(text) > 3:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against the page they were found on.
            links.append((text, urljoin(HBS_LEAGUES_URL, href)))
    print(f" Found {len(links)} HBS leagues")
    # Find PGZ-relevant ones
    keywords = ('PGZ', 'PGŽ', 'KVARN', 'RIJECKA', 'PRIMORJE')
    pgz_links = [(t, h) for t, h in links if any(k in t.upper() for k in keywords)]
    print(f" PGZ relevant: {pgz_links[:5]}")
    return pgz_links


def _source_for(url):
    """Return the source tag for *url*, or None when no scraper applies."""
    for needles, tag in SOURCE_BY_URL:
        if any(needle in url for needle in needles):
            return tag
    return None


# ─── Main ──────────────────────────────────────
conn = psycopg2.connect(DSN)
try:
    cur = conn.cursor()
    cur.execute("SELECT id,naziv,sport,external_url FROM pgz_sport.natjecanja WHERE external_url IS NOT NULL ORDER BY sport")
    nats = cur.fetchall()
finally:
    conn.close()

# Scrape specific known URLs
try:
    scrape_hos(HOS_NAT_ID, HOS_URL)
except Exception as e:
    # Same best-effort policy as the loop below: one failing site must not
    # prevent the remaining competitions from being scraped.
    print(f" SKIP {HOS_NAT_ID}: {e}")

# Scrape all with external URLs
for nat_id, naziv, sport, url in nats:
    if not url or nat_id == HOS_NAT_ID:
        continue
    src = _source_for(url)
    if not src:
        continue
    try:
        scrape_generic(nat_id, url, src)
    except Exception as e:
        print(f" SKIP {nat_id}: {e}")

# HBS all leagues
try:
    scrape_hbs_all()
except Exception as e:
    print(f"HBS all: {e}")

# Final count
conn = psycopg2.connect(DSN)
try:
    cur = conn.cursor()
    cur.execute("SELECT source, count(*) FROM pgz_sport.natjecanje_tablica GROUP BY source ORDER BY count DESC")
    for row in cur.fetchall():
        print(f" {row[0]}: {row[1]} teams")
finally:
    conn.close()
print("ALL DONE")