#!/usr/bin/env python3
"""
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.

Path: /opt/pgz-sport/scrapers/hos_scraper.py

Downloads the https://hos-cvf.hr/ home page, extracts the league standings
tables with regular expressions and stores them in the ``pgz_sport`` schema
(tables ``natjecanja``, ``natjecanja_tablice`` and ``audit_feed``).
"""
import html as ihtml
import json
import os
import re

import psycopg2
import requests
from dotenv import load_dotenv

load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

# NOTE(review): this dict previously listed both port=6432 and port=5432,
# which Python rejects ("keyword argument repeated"). The first value (6432)
# is kept here - confirm that this is the port the scraper should use.
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname='rinet_v3',
    user='rinet',
    password=os.environ["DB_PASSWORD"],
)
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

# Patterns are compiled once. ``\b`` after the tag name keeps ``<tr`` from
# matching ``<track>`` and ``<th`` from matching ``<thead>``.
_HTML_FLAGS = re.DOTALL | re.IGNORECASE
_TABLE_RE = re.compile(r'<table\b[^>]*>(.+?)</table>', _HTML_FLAGS)
_ROW_RE = re.compile(r'<tr\b[^>]*>(.+?)</tr>', _HTML_FLAGS)
_CELL_RE = re.compile(r'<t[dh]\b[^>]*>(.*?)</t[dh]>', _HTML_FLAGS)
_HEADING_RE = re.compile(r'<(h[1-5])\b[^>]*>(.*?)</\1>', _HTML_FLAGS)
_TAG_RE = re.compile(r'<[^>]+>')
_LEADING_INT_RE = re.compile(r'(\d+)')

# Manual mapping (requested by Damir): which table on the home page holds
# which league. Fields: table index, competition name, level, gender.
LEAGUES_2025_26 = [
    (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
    (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
    (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
    (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
    (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
]


def _to_text(fragment):
    """Strip tags from an HTML fragment, decode entities and trim whitespace."""
    return ihtml.unescape(_TAG_RE.sub('', fragment)).strip()


def parse_table(html, table_idx):
    """Parse a single standings table and return its rows.

    Args:
        html: Full HTML text of the page.
        table_idx: Zero-based index of the ``<table>`` element to parse.

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``uk``, ``pob``, ``por``,
        ``bod`` and ``ner`` (``ner`` is always 0). Rows that do not look like
        a standings row are skipped. An out-of-range index yields ``[]``.
    """
    tables = _TABLE_RE.findall(html)
    if not -len(tables) <= table_idx < len(tables):
        return []

    out = []
    for row in _ROW_RE.findall(tables[table_idx]):
        clean = [_to_text(cell) for cell in _CELL_RE.findall(row)]
        if not clean or not clean[0]:
            continue
        # Skip header rows.
        second = clean[1] if len(clean) > 1 else ''
        if clean[0] in ('#', 'Pos') or 'Utakmice' in second:
            continue
        # Expected format: ['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']
        poz_match = _LEADING_INT_RE.match(clean[0])
        if not poz_match or len(clean) < 6:
            continue
        # The club name is in the third cell; fall back to the second one
        # when the third is empty.
        klub = clean[2] or clean[1]
        if not klub:
            continue
        try:
            out.append({
                "poz": int(poz_match.group(1)),
                "klub": klub,
                "uk": int(clean[3]),
                "pob": int(clean[4]),
                "por": int(clean[5]),
                "bod": int(clean[6]) if len(clean) > 6 else 0,
                "ner": 0,
            })
        except ValueError:
            # A non-numeric statistics cell: not a standings row.
            continue
    return out


def find_table_titles(html):
    """Find h1-h5 headings so they can be associated with following tables.

    Returns:
        A list of ``(offset, title)`` tuples in document order, where
        ``offset`` is the position of the heading in ``html``. Headings whose
        text is 5 characters or shorter are ignored.
    """
    out = []
    for m in _HEADING_RE.finditer(html):
        title = _to_text(m.group(2))
        if title and len(title) > 5:
            out.append((m.start(), title))
    return out


def _title_before(title_positions, table_pos):
    """Return the last heading located before ``table_pos`` or "Unknown"."""
    title = "Unknown"
    for pos, text in title_positions:  # ascending offsets (finditer order)
        if pos >= table_pos:
            break
        title = text
    return title


def _get_or_create_natjecanje(cu, natj_naziv, razina):
    """Return the id of the competition named ``natj_naziv``, creating it if missing."""
    cu.execute(
        "SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1",
        (natj_naziv,),
    )
    found = cu.fetchone()
    if found:
        return found[0]
    cu.execute(
        """INSERT INTO pgz_sport.natjecanja
               (naziv, sport, razina, sezona, source, source_url)
           VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
           RETURNING id""",
        (natj_naziv, razina),
    )
    natj_id = cu.fetchone()[0]
    print(f" Created natjecanje id={natj_id}")
    return natj_id


def _find_klub(cu, klub_naziv):
    """Look up a club by exact or substring name match.

    Returns ``(id, region)`` of the best match (active clubs, then volleyball
    clubs, then PGŽ clubs, then lowest id) or ``(None, None)`` if none match.
    """
    cu.execute(
        """SELECT id, region FROM pgz_sport.klubovi
           WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
           ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
                    CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
                    CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                    id ASC
           LIMIT 1""",
        (klub_naziv, f"%{klub_naziv}%"),
    )
    found = cu.fetchone()
    return (found[0], found[1]) if found else (None, None)


def _store_league(cu, natj_id, rows, spol, pgz_seen):
    """Replace the stored hos_cvf standings of one competition with ``rows``.

    Names of matched clubs whose region is 'PGŽ' are added to ``pgz_seen``.
    Returns the number of inserted rows.
    """
    cu.execute(
        "DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'",
        (natj_id,),
    )
    inserted = 0
    for row in rows:
        klub_id, region = _find_klub(cu, row['klub'])
        if region == 'PGŽ':
            pgz_seen.add(row['klub'])
        cu.execute(
            """INSERT INTO pgz_sport.natjecanja_tablice
                   (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano,
                    pobjede, nerijeseno, porazi, gol_z, gol_p, gol_razlika,
                    bodovi, source, source_url, updated_at, extra_data)
               VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
                       'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
            (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'],
             row['por'], row['bod'], json.dumps({"spol": spol})),
        )
        inserted += 1
    return inserted


def _print_report(cu):
    """Print per-league row counts and the stored rows of PGŽ clubs."""
    cu.execute(
        """SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
           FROM pgz_sport.natjecanja n
           LEFT JOIN pgz_sport.natjecanja_tablice t
                  ON n.id=t.natjecanje_id AND t.source='hos_cvf'
           WHERE n.source='hos_cvf'
           GROUP BY n.id, n.naziv
           ORDER BY n.id"""
    )
    print("\n=== HOS lige ===")
    for rec in cu.fetchall():
        print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

    cu.execute(
        """SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
           FROM pgz_sport.natjecanja_tablice t
           JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
           LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
           WHERE t.source='hos_cvf' AND k.region='PGŽ'
           ORDER BY n.naziv, t.pozicija"""
    )
    print("\n=== PGŽ klubovi u HOS ===")
    for rec in cu.fetchall():
        print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")


def main():
    """Scrape the HOS-CVF home page and refresh the stored league standings.

    Raises:
        requests.HTTPError: if the home page responds with an error status.
    """
    # Fetch the main page first so that a failed download never touches the DB.
    resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
    resp.raise_for_status()
    html = resp.text
    print(f"Length: {len(html)}")

    # Use the same matches that parse_table() indexes into, so the titles
    # printed below line up with the indices in LEAGUES_2025_26.
    table_positions = [m.start() for m in _TABLE_RE.finditer(html)]
    print(f"Tables: {len(table_positions)}")

    # Best heuristic for naming a table: the closest heading before it.
    title_positions = find_table_titles(html)
    print("\n=== Table titles ===")
    for i, table_pos in enumerate(table_positions[:8]):
        title = _title_before(title_positions, table_pos)
        print(f" Table {i+1}: {title[:80]}")

    conn = psycopg2.connect(**DB)
    # Autocommit is kept as in the original script: every statement is
    # committed on its own.
    conn.autocommit = True
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_seen = set()

        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                # Nothing parsed: keep whatever is already stored.
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            natj_id = _get_or_create_natjecanje(cu, natj_naziv, razina)
            total_inserted += _store_league(cu, natj_id, rows, spol, pgz_seen)

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")
        _print_report(cu)

        cu.execute(
            """INSERT INTO pgz_sport.audit_feed
                   (table_name, action, source, source_url, details)
               VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
            (json.dumps({"inserted": total_inserted, "pgz_seen": sorted(pgz_seen)}),),
        )
        cu.close()
    finally:
        conn.close()


if __name__ == "__main__":
    main()