175 lines
8.0 KiB
Python
Executable File
175 lines
8.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
HNS Semafor ligaški scraper - parses body.innerText (SPA, no <table> tags)
|
|
Path: /opt/pgz-sport/scrapers/hns_lige_standings.py
|
|
Author: Damir Radulić / Ri.NET
|
|
Source: https://semafor.hns.family/natjecanja/...
|
|
Output: pgz_sport.natjecanja_tablice (source=hns_semafor)
|
|
Run: python3 hns_lige_standings.py
|
|
"""
|
|
import asyncio
import json
import os

import psycopg2
from playwright.async_api import async_playwright
|
|
|
|
# PostgreSQL connection keyword arguments, unpacked into psycopg2.connect().
# The password is read from the DB_PASSWORD environment variable at import
# time; a missing variable raises KeyError before anything else runs.
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': os.environ["DB_PASSWORD"],
}
|
|
|
|
# Leagues to scrape. Each entry becomes a dict with the keys:
#   natj   - competition name, used as the lookup/insert key in natjecanja
#   url    - Semafor page that shows the league's standings
#   razina - level label stored when the competition is first created
_LIGE_FIELDS = ("natj", "url", "razina")

LIGE = [
    dict(zip(_LIGE_FIELDS, entry))
    for entry in (
        ("Supersport HNL 2025/26",
         "https://semafor.hns.family/natjecanja/100391485/supersport-hnl",
         "1.HNL"),
        ("2.HNL 2025/26",
         "https://semafor.hns.family/natjecanja/100413651/2-hnl",
         "2.HNL"),
        ("3.HNL 2025/26",
         "https://semafor.hns.family/natjecanja/100418001/3-hnl",
         "3.HNL"),
        ("Magenta 1.HNL Juniori 2025/26",
         "https://semafor.hns.family/natjecanja/100511683/magenta-1-hnl-juniori",
         "Juniori"),
        ("1.HNKŽ 2025/26",
         "https://semafor.hns.family/natjecanja/100914995/1-hnk%C5%BE",
         "Žene 1.razred"),
    )
]
|
|
|
|
# Single-token lines that belong to the recent-form column (plus its header);
# they carry no standings data and would break the fixed 10-line row layout.
_FORM_TOKENS = frozenset(('P', 'N', 'X', 'V', 'D', 'W', 'L', 'F', 'Forma'))

# Only this many characters after the table header are inspected.
_MAX_TABLE_CHARS = 8000


def _parse_standings(body_text):
    """Extract standings rows from the page's plain text.

    The table is located by its last header cell ("Bod"). After that marker
    each club is expected as ten consecutive non-empty lines: position, club
    name, played, wins, draws, losses, goals for, goals against, goal
    difference, points.

    Returns a list of row dicts (keys: poz, klub, uk, pob, ner, por, gp, gpr,
    gr, bod), or None when the header marker is not present at all.
    """
    idx = body_text.find('\nBod\n')
    if idx < 0:
        idx = body_text.find('\nBod ')
    if idx < 0:
        return None

    # Both marker variants are 5 characters long, so skip exactly that many.
    start = idx + 5
    window = body_text[start:start + _MAX_TABLE_CHARS]
    stripped = (raw.strip() for raw in window.split('\n'))
    lines = [s for s in stripped if s and s not in _FORM_TOKENS]

    rows = []
    i = 0
    while i < len(lines):
        try:
            poz = int(lines[i])
            if not 1 <= poz <= 50:
                i += 1
                continue
            klub = lines[i + 1]
            # A numeric or very short "name" means we are not at a row start.
            if klub.isdigit() or len(klub) < 3:
                i += 1
                continue
            uk = int(lines[i + 2])
            pob = int(lines[i + 3])
            ner = int(lines[i + 4])
            por = int(lines[i + 5])
            gp = int(lines[i + 6])
            gpr = int(lines[i + 7])
            # Goal difference may carry an explicit '+' or a typographic
            # minus sign (U+2212), which int() does not accept.
            gr_raw = lines[i + 8].strip().replace('\u2212', '-')
            gr = int(gr_raw.replace('+', ''))
            bod = int(lines[i + 9])
        except (ValueError, IndexError):
            # Not a complete row at this offset; resynchronise one line later.
            i += 1
            continue
        rows.append({
            "poz": poz, "klub": klub, "uk": uk, "pob": pob, "ner": ner,
            "por": por, "gp": gp, "gpr": gpr, "gr": gr, "bod": bod,
        })
        i += 10
    return rows


async def scrape_one(page, liga):
    """Load one league page and return its standings rows.

    Args:
        page: a Playwright page used for navigation and text extraction.
        liga: dict with at least 'natj' (name, printed) and 'url' (page to load).

    Returns:
        A list of row dicts (keys: poz, klub, uk, pob, ner, por, gp, gpr, gr,
        bod). Returns an empty list when the table is not found or when any
        error occurs; errors are printed, never raised.
    """
    print(f"\n=== {liga['natj']} ===", flush=True)
    try:
        await page.goto(liga['url'], wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Dismissing the cookie banner is best-effort. Catch Exception rather
        # than everything so task cancellation and Ctrl-C still propagate.
        try:
            btn = await page.query_selector(".cky-btn-accept")
            if btn:
                await btn.click()
                await page.wait_for_timeout(1500)
        except Exception as exc:
            print(f" cookie banner not dismissed: {exc}")

        # Give the SPA time to render the table before reading the text.
        await page.wait_for_timeout(4000)

        body_text = await page.evaluate("() => document.body.innerText")
        rows = _parse_standings(body_text)
        if rows is None:
            print(" No Ljestvica found")
            return []

        print(f" Parsed {len(rows)} klubova")
        for r in rows[:3]:
            print(f" {r['poz']:>2}. {r['klub']:<25} {r['bod']:>3} bod, GR={r['gr']:+}")
        return rows
    except Exception as e:
        print(f" ERR: {e}")
        return []
|
|
|
|
async def run():
    """Scrape every league in LIGE and refresh its standings in the database.

    For each league that yields rows: look up the competition by name in
    pgz_sport.natjecanja (creating it if missing), delete its previous
    'hns_semafor' rows from pgz_sport.natjecanja_tablice, then insert the
    freshly scraped rows. Each club name is resolved to a pgz_sport.klubovi
    id when a match exists; otherwise klub_id is stored as NULL.

    Afterwards one record is written to pgz_sport.audit_feed and two summary
    reports are printed.

    The connection runs in autocommit mode, so every statement is committed
    on its own.
    NOTE(review): the delete and the re-insert for a league are therefore not
    atomic; a failure midway leaves that league partially loaded.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        all_inserted = 0

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(
                    user_agent="Mozilla/5.0 Chrome/120 Safari/537.36")
                page = await ctx.new_page()

                for liga in LIGE:
                    rows = await scrape_one(page, liga)
                    if not rows:
                        # Keep the existing standings when scraping failed.
                        continue

                    cu.execute(
                        """SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1""",
                        (liga['natj'],))
                    nr = cu.fetchone()
                    if nr:
                        natj_id = nr[0]
                    else:
                        cu.execute(
                            """INSERT INTO pgz_sport.natjecanja
                               (naziv, sport, razina, sezona, source, source_url)
                               VALUES (%s, 'nogomet', %s, '2025/26', 'hns_semafor', %s)
                               RETURNING id""",
                            (liga['natj'], liga['razina'], liga['url']))
                        natj_id = cu.fetchone()[0]
                        print(f" Created natjecanje id={natj_id}")

                    # Clear old rows for this competition (the table has no
                    # season column, so the competition id is the only scope).
                    cu.execute(
                        "DELETE FROM pgz_sport.natjecanja_tablice "
                        "WHERE natjecanje_id=%s AND source='hns_semafor'",
                        (natj_id,))

                    for r in rows:
                        # Exact (case-insensitive) or substring name match;
                        # active clubs first, then football clubs, then the
                        # lowest id.
                        cu.execute(
                            """SELECT id FROM pgz_sport.klubovi
                               WHERE LOWER(naziv) = LOWER(%s)
                                  OR LOWER(naziv) LIKE LOWER(%s)
                               ORDER BY
                                  CASE WHEN aktivan THEN 0 ELSE 1 END,
                                  CASE WHEN sport='nogomet' THEN 0 ELSE 1 END,
                                  id ASC LIMIT 1""",
                            (r['klub'], f"%{r['klub']}%"))
                        kr = cu.fetchone()
                        klub_id = kr[0] if kr else None

                        cu.execute(
                            """INSERT INTO pgz_sport.natjecanja_tablice
                               (natjecanje_id, klub_id, klub_naziv, pozicija,
                                odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                                gol_razlika, bodovi, source, source_url, updated_at)
                               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                                       'hns_semafor', %s, now())""",
                            (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'],
                             r['ner'], r['por'], r['gp'], r['gpr'], r['gr'], r['bod'],
                             liga['url']))
                        all_inserted += 1
            finally:
                # Close the browser even when a league fails with an error.
                await browser.close()

        print(f"\n=== TOTAL inserted: {all_inserted} rows ===")

        # Build the audit payload with json.dumps rather than by hand.
        audit_details = json.dumps({"inserted": all_inserted, "lige": len(LIGE)})
        cu.execute(
            """INSERT INTO pgz_sport.audit_feed
               (table_name, action, source, source_url, details)
               VALUES ('natjecanja_tablice', 'hns_lige_scrape', 'hns_semafor', NULL, %s::jsonb)""",
            (audit_details,))

        # Report 1: number of standings rows per scraped competition.
        cu.execute(
            """SELECT n.naziv, count(t.*) FROM pgz_sport.natjecanja n
               LEFT JOIN pgz_sport.natjecanja_tablice t
                      ON n.id=t.natjecanje_id AND t.source='hns_semafor'
               WHERE n.source='hns_semafor' AND n.sezona='2025/26'
               GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HNS lige stats ===")
        for r in cu.fetchall():
            print(f" {r[1]:>3} klubova - {r[0]}")

        # Report 2: PGŽ clubs in the standings, selected by name fragments.
        # This execute() has no parameters, so the doubled percent signs reach
        # the server as written; consecutive LIKE wildcards behave like one.
        cu.execute(
            """SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.aktivan
               FROM pgz_sport.natjecanja_tablice t
               JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
               LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
               WHERE t.source='hns_semafor'
                 AND (lower(t.klub_naziv) LIKE '%%rijeka%%' OR lower(t.klub_naziv) LIKE '%%opatija%%'
                   OR lower(t.klub_naziv) LIKE '%%krk%%' OR lower(t.klub_naziv) LIKE '%%delnice%%'
                   OR lower(t.klub_naziv) LIKE '%%zamet%%' OR lower(t.klub_naziv) LIKE '%%orijent%%'
                   OR lower(t.klub_naziv) LIKE '%%cresnik%%' OR lower(t.klub_naziv) LIKE '%%goranin%%'
                   OR lower(t.klub_naziv) LIKE '%%kvarner%%')
               ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HNS ligama 2025/26 ===")
        for r in cu.fetchall():
            match = f"klub_id={r[5]}" if r[5] else "❌ no match"
            print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} {match}")
    finally:
        # Release the connection on every exit path, including errors.
        conn.close()
|
|
|
|
# Script entry point: start an event loop and run the whole scrape-and-load job.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also triggers a full run.
asyncio.run(run())
|