#!/usr/bin/env python3
from dotenv import load_dotenv
# Load the shared .env file before the rest of the module executes: the DB
# settings further down read os.environ["DB_PASSWORD"] at import time
# (presumably this file is what supplies that variable - confirm).
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
HNS Semafor league standings scraper - parses body.innerText (SPA, no
<table> tags)
Path: /opt/pgz-sport/scrapers/hns_lige_standings.py
Author: Damir Radulić / Ri.NET
Source: https://semafor.hns.family/natjecanja/...
Output: pgz_sport.natjecanja_tablice (source=hns_semafor)
Run: python3 hns_lige_standings.py
"""
import asyncio
import json
import os

import psycopg2
from playwright.async_api import async_playwright
# Keyword arguments for the PostgreSQL connection (expanded as **DB by the
# caller). The password is taken from the DB_PASSWORD environment variable
# when the module is imported; a missing variable raises KeyError right here.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}
# Leagues to scrape. Each entry is a dict with:
#   natj   - competition name (used to look up / create the competition row)
#   url    - standings page on semafor.hns.family
#   razina - level label stored with a newly created competition row
LIGE = [
    {"natj": naziv, "url": adresa, "razina": razina}
    for naziv, adresa, razina in (
        (
            "Supersport HNL 2025/26",
            "https://semafor.hns.family/natjecanja/100391485/supersport-hnl",
            "1.HNL",
        ),
        (
            "2.HNL 2025/26",
            "https://semafor.hns.family/natjecanja/100413651/2-hnl",
            "2.HNL",
        ),
        (
            "3.HNL 2025/26",
            "https://semafor.hns.family/natjecanja/100418001/3-hnl",
            "3.HNL",
        ),
        (
            "Magenta 1.HNL Juniori 2025/26",
            "https://semafor.hns.family/natjecanja/100511683/magenta-1-hnl-juniori",
            "Juniori",
        ),
        (
            "1.HNKŽ 2025/26",
            "https://semafor.hns.family/natjecanja/100914995/1-hnk%C5%BE",
            "Žene 1.razred",
        ),
    )
]
# Lines that carry no table data and are dropped before parsing: the
# single-letter form badges and the "Forma" column header.
_FORM_TOKENS = frozenset(('P', 'N', 'X', 'V', 'D', 'W', 'L', 'F', 'Forma'))

# A standings row is ten consecutive text lines:
# position, club, played, won, drawn, lost, goals for, goals against,
# goal difference, points.
_ROW_WIDTH = 10


def _parse_row(lines, start):
    """Try to read one standings row beginning at ``lines[start]``.

    Returns the row as a dict, or None when the ten lines starting at
    ``start`` do not look like a row (too few lines, non-numeric cell,
    position outside 1..50, or a club name that is numeric / shorter than
    three characters).
    """
    cells = lines[start:start + _ROW_WIDTH]
    if len(cells) < _ROW_WIDTH:
        return None
    try:
        poz = int(cells[0])
        if poz > 50 or poz < 1:
            return None
        klub = cells[1]
        if klub.isdigit() or len(klub) < 3:
            return None
        uk, pob, ner, por, gp, gpr = (int(c) for c in cells[2:8])
        # int() accepts only ASCII signs; normalise a typographic minus
        # (U+2212) so a negative goal difference is not silently rejected.
        gr = int(cells[8].strip().replace('+', '').replace('\u2212', '-'))
        bod = int(cells[9])
    except ValueError:
        return None
    return {
        "poz": poz, "klub": klub, "uk": uk, "pob": pob, "ner": ner, "por": por,
        "gp": gp, "gpr": gpr, "gr": gr, "bod": bod
    }


def _parse_standings(body_text):
    """Extract standings rows from the rendered page text.

    The table is located by its last column header ("Bod"); everything in
    the 8000 characters after it is split into non-empty lines and scanned
    for ten-line rows. Returns a list of row dicts, or None when the header
    marker is not present at all.
    """
    idx = body_text.find('\nBod\n')
    if idx < 0:
        idx = body_text.find('\nBod ')
    if idx < 0:
        return None
    # Both markers are five characters long, so +5 skips past either one.
    tail = body_text[idx + 5:idx + 5 + 8000]
    stripped = (raw.strip() for raw in tail.split('\n'))
    lines = [s for s in stripped if s and s not in _FORM_TOKENS]

    rows = []
    i = 0
    while i < len(lines):
        row = _parse_row(lines, i)
        if row is None:
            # Not a row start: slide forward one line and try again.
            i += 1
        else:
            rows.append(row)
            i += _ROW_WIDTH
    return rows


async def scrape_one(page, liga):
    """Load one league page and return its standings.

    Args:
        page: browser page object; must provide the async methods ``goto``,
            ``wait_for_timeout``, ``query_selector`` and ``evaluate``.
        liga: dict with at least the keys ``natj`` (name, used for logging)
            and ``url`` (page to load).

    Returns:
        A list of dicts with keys poz, klub, uk, pob, ner, por, gp, gpr, gr,
        bod (all ints except ``klub``). An empty list is returned when the
        table is not found or when any error occurs while loading/parsing;
        errors are printed, never raised.
    """
    print(f"\n=== {liga['natj']} ===", flush=True)
    try:
        await page.goto(liga['url'], wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        # Dismissing the cookie banner is best-effort: a failure here must
        # not abort the scrape, but it is reported instead of being hidden.
        try:
            btn = await page.query_selector(".cky-btn-accept")
            if btn:
                await btn.click()
                await page.wait_for_timeout(1500)
        except Exception as exc:
            print(f" cookie banner not dismissed: {exc}")
        # Fixed delay before reading the text; no explicit readiness signal
        # is awaited.
        await page.wait_for_timeout(4000)
        body_text = await page.evaluate("() => document.body.innerText")

        rows = _parse_standings(body_text)
        if rows is None:
            print(" No Ljestvica found")
            return []

        print(f" Parsed {len(rows)} klubova")
        for r in rows[:3]:
            print(f" {r['poz']:>2}. {r['klub']:<25} {r['bod']:>3} bod, GR={r['gr']:+}")
        return rows
    except Exception as e:
        print(f" ERR: {e}")
        return []
def _natjecanje_id(cu, liga):
    """Return the id of the competition row named ``liga['natj']``, creating it if absent."""
    cu.execute("""SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1""", (liga['natj'],))
    found = cu.fetchone()
    if found:
        return found[0]
    cu.execute("""INSERT INTO pgz_sport.natjecanja
                  (naziv, sport, razina, sezona, source, source_url)
                  VALUES (%s, 'nogomet', %s, '2025/26', 'hns_semafor', %s)
                  RETURNING id""",
               (liga['natj'], liga['razina'], liga['url']))
    natj_id = cu.fetchone()[0]
    print(f" Created natjecanje id={natj_id}")
    return natj_id


def _replace_standings(cu, natj_id, liga, rows):
    """Delete the stored hns_semafor rows of one competition and insert ``rows``; return the insert count."""
    # Clear old rows for this natjecanje (no sezona col)
    cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hns_semafor'",
               (natj_id,))
    for r in rows:
        # Resolve the scraped name to a known club: exact or substring match,
        # preferring active clubs, then football clubs, then an exact name
        # match over a mere substring match, then the lowest id.
        cu.execute("""SELECT id FROM pgz_sport.klubovi
                      WHERE LOWER(naziv) = LOWER(%s)
                         OR LOWER(naziv) LIKE LOWER(%s)
                      ORDER BY
                        CASE WHEN aktivan THEN 0 ELSE 1 END,
                        CASE WHEN sport='nogomet' THEN 0 ELSE 1 END,
                        CASE WHEN LOWER(naziv) = LOWER(%s) THEN 0 ELSE 1 END,
                        id ASC LIMIT 1""",
                   (r['klub'], f"%{r['klub']}%", r['klub']))
        kr = cu.fetchone()
        klub_id = kr[0] if kr else None  # unmatched clubs are stored with a NULL klub_id
        cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                      (natjecanje_id, klub_id, klub_naziv, pozicija,
                       odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                       gol_razlika, bodovi, source, source_url, updated_at)
                      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                              'hns_semafor', %s, now())""",
                   (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['ner'],
                    r['por'], r['gp'], r['gpr'], r['gr'], r['bod'], liga['url']))
    return len(rows)


def _print_report(cu):
    """Print per-competition row counts and the standings rows whose club name matches the PGŽ keyword list."""
    cu.execute("""SELECT n.naziv, count(t.*) FROM pgz_sport.natjecanja n
                  LEFT JOIN pgz_sport.natjecanja_tablice t
                    ON n.id=t.natjecanje_id AND t.source='hns_semafor'
                  WHERE n.source='hns_semafor' AND n.sezona='2025/26'
                  GROUP BY n.id, n.naziv ORDER BY n.id""")
    print("\n=== HNS lige stats ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3} klubova - {r[0]}")

    # PGŽ clubs in the standings.
    # NOTE(review): this statement is executed without a parameter tuple, so
    # the doubled '%%' presumably reaches PostgreSQL unchanged (two adjacent
    # LIKE wildcards, which match the same as one) - confirm before adding
    # parameters to this call.
    cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.aktivan
                  FROM pgz_sport.natjecanja_tablice t
                  JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                  LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                  WHERE t.source='hns_semafor'
                    AND (lower(t.klub_naziv) LIKE '%%rijeka%%' OR lower(t.klub_naziv) LIKE '%%opatija%%'
                      OR lower(t.klub_naziv) LIKE '%%krk%%' OR lower(t.klub_naziv) LIKE '%%delnice%%'
                      OR lower(t.klub_naziv) LIKE '%%zamet%%' OR lower(t.klub_naziv) LIKE '%%orijent%%'
                      OR lower(t.klub_naziv) LIKE '%%cresnik%%' OR lower(t.klub_naziv) LIKE '%%goranin%%'
                      OR lower(t.klub_naziv) LIKE '%%kvarner%%')
                  ORDER BY n.naziv, t.pozicija""")
    print("\n=== PGŽ klubovi u HNS ligama 2025/26 ===")
    for r in cu.fetchall():
        match = f"klub_id={r[5]}" if r[5] else "❌ no match"
        print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} {match}")


async def run():
    """Scrape every league in LIGE and store the standings in PostgreSQL.

    For each league that yields rows, the competition row is looked up (or
    created) and its stored hns_semafor standings are replaced. Each league
    is written inside its own transaction, so a failure part-way through
    rolls back and leaves that league's previous standings in place instead
    of a half-emptied table. Afterwards an audit_feed record is written and
    a summary is printed.

    Database errors propagate to the caller; the connection and the browser
    are closed in every case.
    """
    conn = psycopg2.connect(**DB)
    try:
        all_inserted = 0
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(user_agent="Mozilla/5.0 Chrome/120 Safari/537.36")
                page = await ctx.new_page()
                for liga in LIGE:
                    rows = await scrape_one(page, liga)
                    if not rows:
                        # Nothing parsed: keep whatever is already stored.
                        continue
                    # `with conn` commits on success and rolls back on error.
                    with conn, conn.cursor() as cu:
                        natj_id = _natjecanje_id(cu, liga)
                        all_inserted += _replace_standings(cu, natj_id, liga, rows)
            finally:
                await browser.close()

        print(f"\n=== TOTAL inserted: {all_inserted} rows ===")
        with conn, conn.cursor() as cu:
            cu.execute("""INSERT INTO pgz_sport.audit_feed
                          (table_name, action, source, source_url, details)
                          VALUES ('natjecanja_tablice', 'hns_lige_scrape', 'hns_semafor', NULL, %s::jsonb)""",
                       (json.dumps({"inserted": all_inserted, "lige": len(LIGE)}),))
            _print_report(cu)
    finally:
        conn.close()
# Start the scrape only when this file is executed as a script, so that
# importing the module does not launch a browser or write to the database.
if __name__ == "__main__":
    asyncio.run(run())