161 lines
6.8 KiB
Python
161 lines
6.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Sports table scrapers - HOS, HKS, HRS, HVS, HBS"""
|
|
import os
import re
import time
from urllib.parse import urljoin

import psycopg2
import requests
from bs4 import BeautifulSoup
|
|
|
|
# SECURITY: the fallback literal below embeds a database password in source
# control. Set the RINET_DSN environment variable to override it, rotate the
# credential, and remove the literal once every deployment supplies the
# variable.
DSN = os.environ.get("RINET_DSN") or (
    "host=127.0.0.1 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent with every scraper request.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
|
|
|
|
def get(url):
    """Fetch *url* and return its body parsed with BeautifulSoup.

    The request carries the module's User-Agent and a 20 second timeout.
    An HTTP error status raises via ``raise_for_status``. After a successful
    request the function sleeps 1.5 seconds before parsing the body with
    the ``html.parser`` backend.
    """
    request_headers = {"User-Agent": UA}
    response = requests.get(url, headers=request_headers, timeout=20)
    response.raise_for_status()
    # Fixed pause after each successful request (presumably to avoid
    # hammering the scraped sites).
    time.sleep(1.5)
    markup = response.text
    return BeautifulSoup(markup, "html.parser")
|
|
|
|
def si(s):
    """Convert *s* to an int built from its digit characters only.

    Every non-digit character is dropped before conversion, so signs and
    separators are ignored: ``"-5"`` gives 5 and ``"1.234"`` gives 1234.
    Input without any digit (including ``None`` and ``""``) gives 0.

    Returns 0 instead of raising when the conversion fails, e.g. when
    ``str(s)`` misbehaves or the digit run is too long for ``int`` to parse.
    """
    try:
        digits = re.sub(r'[^\d]', '', str(s))
        return int(digits or '0')
    except (TypeError, ValueError):
        # Narrow on purpose: a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return 0
|
|
|
|
def save(nat_id, rows, src):
    """Replace the stored standings of one competition/source with *rows*.

    Args:
        nat_id: value written to ``natjecanje_id``.
        rows: list of dicts. Key ``'k'`` (club name) is required; ``'u'``,
            ``'p'``, ``'n'``, ``'i'``, ``'gf'``, ``'ga'`` and ``'bod'`` are
            optional and default to 0. The 1-based list position is stored
            as ``rang``.
        src: value written to ``source``.

    Returns:
        The number of rows passed in, or 0 when *rows* is empty (in which
        case the database is not touched at all).

    The delete and all upserts run in a single transaction: it is committed
    when every statement succeeds and rolled back when any of them raises.
    The connection is closed in both cases.
    """
    if not rows:
        return 0

    upsert_sql = """INSERT INTO pgz_sport.natjecanje_tablica
        (natjecanje_id,rang,klub_naziv,utakmica,pobjede,nerijeseno,porazi,golovi_za,golovi_protiv,bodovi,source)
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON CONFLICT(natjecanje_id,klub_naziv) DO UPDATE SET
        rang=EXCLUDED.rang,utakmica=EXCLUDED.utakmica,pobjede=EXCLUDED.pobjede,
        nerijeseno=EXCLUDED.nerijeseno,porazi=EXCLUDED.porazi,
        golovi_za=EXCLUDED.golovi_za,golovi_protiv=EXCLUDED.golovi_protiv,
        bodovi=EXCLUDED.bodovi,source=EXCLUDED.source,scraped_at=NOW()"""

    # Build the parameters first so a malformed row (missing 'k') fails
    # before a connection is opened or anything is deleted.
    params = [
        (nat_id, rank, r['k'], r.get('u', 0), r.get('p', 0), r.get('n', 0),
         r.get('i', 0), r.get('gf', 0), r.get('ga', 0), r.get('bod', 0), src)
        for rank, r in enumerate(rows, start=1)
    ]

    conn = psycopg2.connect(DSN)
    try:
        # ``with conn`` commits on success and rolls back on an exception;
        # it does not close the connection, hence the ``finally`` below.
        with conn, conn.cursor() as cur:
            cur.execute(
                "DELETE FROM pgz_sport.natjecanje_tablica WHERE natjecanje_id=%s AND source=%s",
                (nat_id, src),
            )
            cur.executemany(upsert_sql, params)
    finally:
        conn.close()
    return len(rows)
|
|
|
|
# ─── HOS Odbojka ─────────────────────────────
|
|
def scrape_hos(nat_id, url):
    """Scrape a HOS (volleyball) standings page and store it via ``save``.

    The first table whose header row mentions "ekipa" or "klub" and that
    yields at least one team row is used. For each data row:

    * the club name comes from the EKIPA/KLUB/TIM column, with a leading
      rank prefix such as "1." removed; names shorter than 3 characters
      are skipped;
    * the numeric cells to the right of the club column are mapped by
      position to ``u, p, n, i, gf, ga``, so a separate rank cell to the
      left of the club is not counted as a statistic;
    * when the header names a matches-played column (UT./UTA/IGR) or a
      points column (BODOV/BOD/BN) and that cell is numeric, it is used for
      ``u`` / ``bod``; otherwise ``u`` falls back to the first numeric cell
      and ``bod`` to the last one.

    Args:
        nat_id: competition id passed through to ``save``.
        url: page to fetch.

    Returns:
        The value returned by ``save`` (number of teams stored, 0 if none).
    """
    print(f"HOS {nat_id}: {url}")
    s = get(url)
    numeric = re.compile(r'^[\d\-+]+$')

    def ci(headers, names):
        """Return the index of the first header containing one of *names*."""
        for name in names:
            for idx, header in enumerate(headers):
                if name in header:
                    return idx
        return None

    def numeric_at(texts, idx):
        """Return the int in cell *idx*, or None if absent or not numeric."""
        if idx is None or idx >= len(texts) or not numeric.match(texts[idx]):
            return None
        return si(texts[idx])

    def nth(values, pos):
        """Return ``values[pos]``, or 0 when the list is too short."""
        return values[pos] if len(values) > pos else 0

    rows = []
    for tbl in s.find_all('table'):
        trs = tbl.find_all('tr')
        if not trs:
            continue
        header_texts = [c.get_text(strip=True) for c in trs[0].find_all(['th', 'td'])]
        hdrs = ' '.join(h.lower() for h in header_texts)
        if 'ekipa' not in hdrs and 'klub' not in hdrs:
            continue

        # Resolve column positions from the header row.
        ths = [h.upper() for h in header_texts]
        ki = ci(ths, ['EKIPA', 'KLUB', 'TIM'])
        ui = ci(ths, ['UT.', 'UTA', 'IGR'])
        bi = ci(ths, ['BODOV', 'BOD', 'BN'])
        if ki is None:
            continue

        for tr in trs[1:]:
            texts = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
            if len(texts) <= ki:
                continue
            # Clean rank prefix
            klub = re.sub(r'^\d+\.?\s*', '', texts[ki]).strip()
            if len(klub) < 3:
                continue

            # Only cells after the club column are statistics; anything
            # before it (typically the rank) must not shift the mapping.
            nums = [si(t) for t in texts[ki + 1:] if numeric.match(t)]
            played = numeric_at(texts, ui)
            points = numeric_at(texts, bi)
            rows.append({
                'k': klub,
                'u': played if played is not None else nth(nums, 0),
                'p': nth(nums, 1),
                'n': nth(nums, 2),
                'i': nth(nums, 3),
                'gf': nth(nums, 4),
                'ga': nth(nums, 5),
                'bod': points if points is not None else (nums[-1] if nums else 0),
            })
        if rows:
            break

    n = save(nat_id, rows, 'hos_cvf')
    print(f" => {n} ekipa")
    return n
|
|
|
|
# ─── Generic standings scraper ─────────────────
|
|
def scrape_generic(nat_id, url, src):
    """Scrape a standings table of unknown layout and store it via ``save``.

    Every ``<table>`` with at least 4 rows is parsed heuristically, skipping
    its first row, and the table that yields the most team rows wins.
    A data row needs at least 5 cells. Within a row:

    * the club name is the first cell longer than 3 characters that is not
      a plain number and that is still longer than 3 characters after a
      leading rank prefix ("1." etc.) is removed;
    * numeric cells shorter than 5 characters are mapped by position to
      ``u, p, n, i, gf, ga`` and the last one to ``bod``; rows with fewer
      than two such cells are skipped.

    Args:
        nat_id: competition id passed through to ``save``.
        url: page to fetch.
        src: source tag passed through to ``save``.

    Returns:
        The value returned by ``save`` (number of teams stored, 0 if none).
    """
    print(f"{src} {nat_id}: {url}")
    s = get(url)
    numeric = re.compile(r'^[\d\-+]+$')

    def nth(values, pos):
        """Return ``values[pos]``, or 0 when the list is too short."""
        return values[pos] if len(values) > pos else 0

    best = []
    for tbl in s.find_all('table'):
        trs = tbl.find_all('tr')
        if len(trs) < 4:
            continue
        rows = []
        for tr in trs[1:]:
            tds = tr.find_all(['td', 'th'])
            if len(tds) < 5:
                continue
            texts = [t.get_text(strip=True) for t in tds]

            # Find club name (non-numeric, longer than 3 chars, not rank).
            klub = None
            for t in texts:
                if len(t) <= 3 or t.lstrip('+-').lstrip('.').isdigit():
                    continue
                candidate = re.sub(r'^\d+\.?\s*', '', t).strip()
                # Assign only an accepted candidate: a rejected fragment
                # (e.g. ":20" left over from "25:20") must never survive
                # the loop and be stored as a club name.
                if len(candidate) > 3:
                    klub = candidate
                    break
            if klub is None:
                continue

            nums = [si(t) for t in texts if numeric.match(t) and len(t) < 5]
            if len(nums) < 2:
                continue
            rows.append({
                'k': klub,
                'u': nums[0],
                'p': nums[1],
                'n': nth(nums, 2),
                'i': nth(nums, 3),
                'gf': nth(nums, 4),
                'ga': nth(nums, 5),
                'bod': nums[-1],
            })
        if len(rows) > len(best):
            best = rows

    n = save(nat_id, best, src)
    print(f" => {n} ekipa")
    return n
|
|
|
|
# ─── HBS Bocanje - get all leagues ─────────────
|
|
def scrape_hbs_all():
    """List league links on the HBS (bocce) leagues page and pick PGZ ones.

    Every ``<a>`` whose href contains "lig" (case-insensitive) and whose
    text is longer than 3 characters is collected as ``(text, url)``.
    Hrefs are resolved against the page URL, so relative links with or
    without a leading slash become valid absolute URLs; links that do not
    resolve to http(s) (``mailto:`` and the like) are dropped.

    Returns:
        The collected ``(text, url)`` pairs whose upper-cased text contains
        one of the regional keywords (PGZ, PGŽ, KVARN, RIJECKA, RIJEČKA,
        PRIMORJE). Nothing is written to the database.
    """
    print("HBS Bocanje - all leagues")
    page_url = "https://hrvatski-bocarski-savez.hr/lige/"
    s = get(page_url)

    links = []
    for a in s.find_all('a', href=True):
        href = a['href']
        # 'lig' also covers 'liga', 'lige', ...
        if 'lig' not in href.lower():
            continue
        href = urljoin(page_url, href)
        if not href.startswith(('http://', 'https://')):
            continue
        text = a.get_text(strip=True)
        if text and len(text) > 3:
            links.append((text, href))
    print(f" Found {len(links)} HBS leagues")

    # Find PGZ-relevant ones. Both spellings of "Riječka" are listed because
    # upper-casing keeps the diacritic and 'RIJECKA' alone never matches it.
    keywords = ['PGZ', 'PGŽ', 'KVARN', 'RIJECKA', 'RIJEČKA', 'PRIMORJE']
    pgz_links = [(t, h) for t, h in links if any(k in t.upper() for k in keywords)]
    print(f" PGZ relevant: {pgz_links[:5]}")
    return pgz_links
|
|
|
|
# ─── Main ──────────────────────────────────────
|
|
# Load every competition that has an external URL. The connection is closed
# even when the query fails.
conn = psycopg2.connect(DSN)
try:
    cur = conn.cursor()
    cur.execute("SELECT id,naziv,sport,external_url FROM pgz_sport.natjecanja WHERE external_url IS NOT NULL ORDER BY sport")
    nats = cur.fetchall()
finally:
    conn.close()

# Scrape specific known URLs. Guarded like every other scrape below so that
# one unreachable page does not abort the whole run.
try:
    scrape_hos(409, "https://hos-cvf.hr/natjecanje.php?id=2807")
except Exception as e:
    print(f" SKIP 409: {e}")

# URL substrings -> source tag; the first matching entry wins.
_SOURCE_MARKERS = (
    (('hos-cvf',), 'hos_cvf'),
    (('hks-cbf',), 'hks_cbf'),
    (('hrs.hr',), 'hrs'),
    (('hvs.hr', 'hnvs'), 'hvs'),
    (('bocarski', 'hbs'), 'hbs'),
)

# Scrape all with external URLs (409 was already handled above).
for nat_id, naziv, sport, url in nats:
    if not url or nat_id == 409:
        continue
    src = next(
        (tag for markers, tag in _SOURCE_MARKERS if any(m in url for m in markers)),
        None,
    )
    if src:
        try:
            scrape_generic(nat_id, url, src)
        except Exception as e:
            print(f" SKIP {nat_id}: {e}")

# HBS all leagues
try:
    scrape_hbs_all()
except Exception as e:
    print(f"HBS all: {e}")

# Final count of stored rows per source.
conn = psycopg2.connect(DSN)
try:
    cur = conn.cursor()
    cur.execute("SELECT source, count(*) FROM pgz_sport.natjecanje_tablica GROUP BY source ORDER BY count DESC")
    for row in cur.fetchall():
        print(f" {row[0]}: {row[1]} teams")
finally:
    conn.close()

print("ALL DONE")
|