#!/usr/bin/env python3
"""
HNS Semafor league standings scraper - parses body.innerText (SPA, no tags)

Path:   /opt/pgz-sport/scrapers/hns_lige_standings.py
Author: Damir Radulić / Ri.NET
Source: https://semafor.hns.family/natjecanja/...
Output: pgz_sport.natjecanja_tablice (source=hns_semafor)
Run:    python3 hns_lige_standings.py
"""
import asyncio
import json
import os

import psycopg2
from dotenv import load_dotenv
from playwright.async_api import async_playwright

# auto-added by patch_scrapers_with_dotenv.sh
# Must run before DB is built below, because DB reads DB_PASSWORD from the
# environment at import time.
load_dotenv('/opt/rinet-gpu/.env.master')

# Keyword arguments passed straight to psycopg2.connect().
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ["DB_PASSWORD"])

# Competitions to scrape: display name, standings page URL and level label.
LIGE = [
    {"natj": "Supersport HNL 2025/26",
     "url": "https://semafor.hns.family/natjecanja/100391485/supersport-hnl",
     "razina": "1.HNL"},
    {"natj": "2.HNL 2025/26",
     "url": "https://semafor.hns.family/natjecanja/100413651/2-hnl",
     "razina": "2.HNL"},
    {"natj": "3.HNL 2025/26",
     "url": "https://semafor.hns.family/natjecanja/100418001/3-hnl",
     "razina": "3.HNL"},
    {"natj": "Magenta 1.HNL Juniori 2025/26",
     "url": "https://semafor.hns.family/natjecanja/100511683/magenta-1-hnl-juniori",
     "razina": "Juniori"},
    {"natj": "1.HNKŽ 2025/26",
     "url": "https://semafor.hns.family/natjecanja/100914995/1-hnk%C5%BE",
     "razina": "Žene 1.razred"},
]

# Tokens dropped before row parsing: single-letter form badges and the
# "Forma" column header. NOTE(review): exact meaning of each letter is
# presumed (result-form indicators) - confirm against the live page.
_FORM_TOKENS = frozenset(('P', 'N', 'X', 'V', 'D', 'W', 'L', 'F', 'Forma'))

# Both header markers ('\nBod\n' and '\nBod ') are 5 characters long.
_HEADER_LEN = 5
# Only this many characters after the header are scanned for table rows.
_TABLE_WINDOW = 8000
# A standings row is 10 consecutive tokens: position, club, played, won,
# drawn, lost, goals for, goals against, goal difference, points.
_ROW_WIDTH = 10
# Positions outside 1.._MAX_POSITION are treated as stray numbers.
_MAX_POSITION = 50

_SQL_FIND_NATJECANJE = (
    "SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1"
)
_SQL_CREATE_NATJECANJE = (
    "INSERT INTO pgz_sport.natjecanja "
    "(naziv, sport, razina, sezona, source, source_url) "
    "VALUES (%s, 'nogomet', %s, '2025/26', 'hns_semafor', %s) "
    "RETURNING id"
)
_SQL_CLEAR_TABLICA = (
    "DELETE FROM pgz_sport.natjecanja_tablice "
    "WHERE natjecanje_id=%s AND source='hns_semafor'"
)
_SQL_FIND_KLUB = (
    "SELECT id FROM pgz_sport.klubovi "
    "WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s) "
    "ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END, "
    "CASE WHEN sport='nogomet' THEN 0 ELSE 1 END, id ASC LIMIT 1"
)
_SQL_INSERT_TABLICA = (
    "INSERT INTO pgz_sport.natjecanja_tablice "
    "(natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, "
    "nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, "
    "source_url, updated_at) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
    "'hns_semafor', %s, now())"
)
_SQL_AUDIT = (
    "INSERT INTO pgz_sport.audit_feed "
    "(table_name, action, source, source_url, details) "
    "VALUES ('natjecanja_tablice', 'hns_lige_scrape', 'hns_semafor', "
    "NULL, %s::jsonb)"
)
_SQL_STATS = (
    "SELECT n.naziv, count(t.*) FROM pgz_sport.natjecanja n "
    "LEFT JOIN pgz_sport.natjecanja_tablice t "
    "ON n.id=t.natjecanje_id AND t.source='hns_semafor' "
    "WHERE n.source='hns_semafor' AND n.sezona='2025/26' "
    "GROUP BY n.id, n.naziv ORDER BY n.id"
)
# Executed without parameters, so the doubled percent signs reach the server
# unchanged; in LIKE, '%%' matches the same strings as '%'.
_SQL_PGZ_KLUBOVI = (
    "SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, "
    "t.gol_razlika, k.id, k.aktivan "
    "FROM pgz_sport.natjecanja_tablice t "
    "JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id "
    "LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id "
    "WHERE t.source='hns_semafor' AND ("
    "lower(t.klub_naziv) LIKE '%%rijeka%%' OR "
    "lower(t.klub_naziv) LIKE '%%opatija%%' OR "
    "lower(t.klub_naziv) LIKE '%%krk%%' OR "
    "lower(t.klub_naziv) LIKE '%%delnice%%' OR "
    "lower(t.klub_naziv) LIKE '%%zamet%%' OR "
    "lower(t.klub_naziv) LIKE '%%orijent%%' OR "
    "lower(t.klub_naziv) LIKE '%%cresnik%%' OR "
    "lower(t.klub_naziv) LIKE '%%goranin%%' OR "
    "lower(t.klub_naziv) LIKE '%%kvarner%%') "
    "ORDER BY n.naziv, t.pozicija"
)


def _parse_row(lines, start):
    """Try to read one standings row from ``lines`` beginning at ``start``.

    Returns the row as a dict, or ``None`` when the 10 tokens at ``start`` do
    not form a valid row (non-numeric field, implausible position or club
    name, or not enough tokens left).
    """
    try:
        poz = int(lines[start])
        if not 1 <= poz <= _MAX_POSITION:
            return None
        klub = lines[start + 1]
        # A purely numeric or very short token cannot be a club name.
        if klub.isdigit() or len(klub) < 3:
            return None
        uk, pob, ner, por, gp, gpr = (
            int(lines[start + offset]) for offset in range(2, 8)
        )
        # Goal difference may be rendered with an explicit leading '+'.
        gr = int(lines[start + 8].strip().replace('+', ''))
        bod = int(lines[start + 9])
    except (ValueError, IndexError):
        return None
    return {
        "poz": poz, "klub": klub, "uk": uk, "pob": pob, "ner": ner,
        "por": por, "gp": gp, "gpr": gpr, "gr": gr, "bod": bod,
    }


def _parse_standings(body_text):
    """Extract standings rows from the page's ``innerText``.

    The table is located by its last column header ("Bod"); the text after it
    is split into non-empty lines, form tokens are removed, and the remaining
    tokens are scanned for 10-token rows.

    Returns a list of row dicts (possibly empty), or ``None`` when no "Bod"
    header is present in ``body_text``.
    """
    idx = body_text.find('\nBod\n')
    if idx < 0:
        idx = body_text.find('\nBod ')
    if idx < 0:
        return None

    begin = idx + _HEADER_LEN
    window = body_text[begin:begin + _TABLE_WINDOW]
    stripped = (raw.strip() for raw in window.split('\n'))
    lines = [tok for tok in stripped if tok and tok not in _FORM_TOKENS]

    rows = []
    i = 0
    while i < len(lines):
        row = _parse_row(lines, i)
        if row is None:
            # Not a row start; resynchronise one token further on.
            i += 1
        else:
            rows.append(row)
            i += _ROW_WIDTH
    return rows


async def _dismiss_cookie_banner(page):
    """Click the cookie-consent accept button if present (best effort)."""
    try:
        btn = await page.query_selector(".cky-btn-accept")
        if btn:
            await btn.click()
            await page.wait_for_timeout(1500)
    except Exception as exc:
        # Deliberately non-fatal: the standings text may still be readable.
        # Unlike a bare ``except``, this lets cancellation and Ctrl-C through.
        print(f" cookie banner not dismissed: {exc}")


async def scrape_one(page, liga):
    """Load one competition page and return its parsed standings.

    Args:
        page: Playwright page used for navigation.
        liga: entry of ``LIGE`` (reads the ``natj`` and ``url`` keys).

    Returns:
        A list of row dicts with keys ``poz``, ``klub``, ``uk``, ``pob``,
        ``ner``, ``por``, ``gp``, ``gpr``, ``gr`` and ``bod``. The list is
        empty when the table is not found or any error occurs; errors are
        printed, not raised.
    """
    print(f"\n=== {liga['natj']} ===", flush=True)
    try:
        await page.goto(liga['url'], wait_until="domcontentloaded",
                        timeout=30000)
        await page.wait_for_timeout(2000)
        await _dismiss_cookie_banner(page)
        # Fixed wait for the SPA to render the table.
        await page.wait_for_timeout(4000)
        body_text = await page.evaluate("() => document.body.innerText")

        rows = _parse_standings(body_text)
        if rows is None:
            print(" No Ljestvica found")
            return []

        print(f" Parsed {len(rows)} klubova")
        for r in rows[:3]:
            print(f" {r['poz']:>2}. {r['klub']:<25} {r['bod']:>3} bod, GR={r['gr']:+}")
        return rows
    except Exception as e:
        # Per-league boundary: one failing league must not abort the others.
        print(f" ERR: {e}")
        return []


def _store_standings(cu, liga, rows):
    """Replace the stored standings of one competition with ``rows``.

    Looks up (or creates) the competition record, deletes its existing
    ``hns_semafor`` rows and inserts the new ones, linking each club to
    ``pgz_sport.klubovi`` by exact or substring name match when one exists.

    Returns the number of rows inserted.
    """
    cu.execute(_SQL_FIND_NATJECANJE, (liga['natj'],))
    found = cu.fetchone()
    if found:
        natj_id = found[0]
    else:
        cu.execute(_SQL_CREATE_NATJECANJE,
                   (liga['natj'], liga['razina'], liga['url']))
        natj_id = cu.fetchone()[0]
        print(f" Created natjecanje id={natj_id}")

    # Clear old rows for this natjecanje (no sezona col)
    cu.execute(_SQL_CLEAR_TABLICA, (natj_id,))

    inserted = 0
    for r in rows:
        cu.execute(_SQL_FIND_KLUB, (r['klub'], f"%{r['klub']}%"))
        klub_row = cu.fetchone()
        klub_id = klub_row[0] if klub_row else None
        cu.execute(_SQL_INSERT_TABLICA,
                   (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'],
                    r['ner'], r['por'], r['gp'], r['gpr'], r['gr'], r['bod'],
                    liga['url']))
        inserted += 1
    return inserted


def _print_reports(cu):
    """Print per-league row counts and the PGŽ-area clubs found."""
    cu.execute(_SQL_STATS)
    print("\n=== HNS lige stats ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3} klubova - {r[0]}")

    # PGŽ clubs present in the scraped tables
    cu.execute(_SQL_PGZ_KLUBOVI)
    print("\n=== PGŽ klubovi u HNS ligama 2025/26 ===")
    for r in cu.fetchall():
        match = f"klub_id={r[5]}" if r[5] else "❌ no match"
        print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} {match}")


async def run():
    """Scrape every league in ``LIGE`` and store the standings.

    Opens one autocommit DB connection and one headless Chromium page, stores
    each successfully scraped league, writes an audit record and prints
    summary reports. The browser and the connection are closed even when an
    error interrupts the run.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cu = conn.cursor()
        all_inserted = 0

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(
                    user_agent="Mozilla/5.0 Chrome/120 Safari/537.36")
                page = await ctx.new_page()
                for liga in LIGE:
                    rows = await scrape_one(page, liga)
                    if not rows:
                        continue
                    all_inserted += _store_standings(cu, liga, rows)
            finally:
                await browser.close()

        print(f"\n=== TOTAL inserted: {all_inserted} rows ===")
        # json.dumps yields valid JSON for the %s::jsonb cast.
        details = json.dumps({"inserted": all_inserted, "lige": len(LIGE)})
        cu.execute(_SQL_AUDIT, (details,))
        _print_reports(cu)
    finally:
        conn.close()


# Runs on import as well as on direct execution, as in the original script.
asyncio.run(run())