feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
+174
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HNS Semafor league standings scraper - parses body.innerText (SPA, no <table> tags)
|
||||
Path: /opt/pgz-sport/scrapers/hns_lige_standings.py
|
||||
Author: Damir Radulić / Ri.NET
|
||||
Source: https://semafor.hns.family/natjecanja/...
|
||||
Output: pgz_sport.natjecanja_tablice (source=hns_semafor)
|
||||
Run: python3 hns_lige_standings.py
|
||||
"""
|
||||
import asyncio
import json
import os

import psycopg2
from playwright.async_api import async_playwright
|
||||
|
||||
# Keyword arguments passed to psycopg2.connect(**DB).
# The password is read from the DB_PASSWORD environment variable when the
# module is loaded; a missing variable raises KeyError at that point.
DB = {
    "host": 'localhost',
    "port": 5432,
    "dbname": 'rinet_v3',
    "user": 'rinet',
    "password": os.environ["DB_PASSWORD"],
}
|
||||
|
||||
# Leagues to scrape. Each entry carries:
#   natj   - competition name, used as the lookup key in pgz_sport.natjecanja
#   url    - HNS Semafor page that is loaded and parsed
#   razina - level label stored when the competition row is first created
LIGE = [
    {"natj": natj, "url": url, "razina": razina}
    for natj, url, razina in (
        (
            "Supersport HNL 2025/26",
            "https://semafor.hns.family/natjecanja/100391485/supersport-hnl",
            "1.HNL",
        ),
        (
            "2.HNL 2025/26",
            "https://semafor.hns.family/natjecanja/100413651/2-hnl",
            "2.HNL",
        ),
        (
            "3.HNL 2025/26",
            "https://semafor.hns.family/natjecanja/100418001/3-hnl",
            "3.HNL",
        ),
        (
            "Magenta 1.HNL Juniori 2025/26",
            "https://semafor.hns.family/natjecanja/100511683/magenta-1-hnl-juniori",
            "Juniori",
        ),
        (
            "1.HNKŽ 2025/26",
            "https://semafor.hns.family/natjecanja/100914995/1-hnk%C5%BE",
            "Žene 1.razred",
        ),
    )
]
|
||||
|
||||
# Lines consisting solely of one of these tokens are dropped before parsing:
# they are single-letter form indicators and the "Forma" column header, not
# numeric standings cells.
_FORM_TOKENS = frozenset(('P', 'N', 'X', 'V', 'D', 'W', 'L', 'F', 'Forma'))

# Header markers that end the standings header row; the table body is taken
# to start right after the first one found.
_TABLE_MARKERS = ('\nBod\n', '\nBod ')

# Only this many characters after the marker are parsed.
_TABLE_WINDOW = 8000

# A leading integer outside 1.._MAX_POSITION is not treated as a table position.
_MAX_POSITION = 50


def _table_start(body_text):
    """Return the index just past the first table marker, or -1 if none is present."""
    for marker in _TABLE_MARKERS:
        idx = body_text.find(marker)
        if idx >= 0:
            return idx + len(marker)
    return -1


def _signed_int(cell):
    """Parse a signed integer cell such as '+12', '-3' or one using U+2212 as minus."""
    # int() rejects the Unicode minus sign, so normalise it to ASCII first.
    return int(cell.strip().replace('\u2212', '-').replace('+', ''))


def _parse_standings(table_text):
    """Turn the innerText that follows the table header into a list of row dicts.

    The text is scanned line by line. A row is recognised as ten consecutive
    non-empty lines: position, club name, six integer cells, a signed goal
    difference and the points. Whenever the lines at the current offset do not
    fit that shape, the scan advances by one line and tries again.
    """
    stripped = (ln.strip() for ln in table_text.split('\n'))
    lines = [ln for ln in stripped if ln and ln not in _FORM_TOKENS]

    rows = []
    i = 0
    while i < len(lines):
        try:
            poz = int(lines[i])
            if not 1 <= poz <= _MAX_POSITION:
                i += 1
                continue
            klub = lines[i + 1]
            # A numeric or very short "name" means this offset is not a row start.
            if klub.isdigit() or len(klub) < 3:
                i += 1
                continue
            # Too few remaining lines makes the unpacking raise ValueError,
            # which is handled the same way as a non-numeric cell.
            uk, pob, ner, por, gp, gpr = (int(c) for c in lines[i + 2:i + 8])
            gr = _signed_int(lines[i + 8])
            bod = int(lines[i + 9])
        except (ValueError, IndexError):
            i += 1
            continue
        rows.append({
            "poz": poz, "klub": klub, "uk": uk, "pob": pob, "ner": ner, "por": por,
            "gp": gp, "gpr": gpr, "gr": gr, "bod": bod,
        })
        i += 10
    return rows


async def scrape_one(page, liga):
    """Load one league page and return its standings as a list of dicts.

    Args:
        page: object offering the awaitable methods ``goto``,
            ``wait_for_timeout``, ``query_selector`` and ``evaluate`` (the
            caller passes a Playwright page).
        liga: mapping with at least the keys ``'natj'`` (printed as a heading)
            and ``'url'`` (the page to load).

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``uk``, ``pob``,
        ``ner``, ``por``, ``gp``, ``gpr``, ``gr`` and ``bod``. An empty list is
        returned when no table marker is found or when any ``Exception`` is
        raised while loading or parsing; the error is printed, not re-raised.
    """
    print(f"\n=== {liga['natj']} ===", flush=True)
    try:
        await page.goto(liga['url'], wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Accepting the cookie banner is best-effort. Only ordinary errors are
        # tolerated here; cancellation and KeyboardInterrupt must propagate.
        try:
            btn = await page.query_selector(".cky-btn-accept")
            if btn:
                await btn.click()
                await page.wait_for_timeout(1500)
        except Exception as exc:
            print(f" Cookie banner not handled: {exc}")

        # Fixed wait before the page text is read.
        await page.wait_for_timeout(4000)

        body_text = await page.evaluate("() => document.body.innerText")
        start = _table_start(body_text)
        if start < 0:
            print(" No Ljestvica found")
            return []

        rows = _parse_standings(body_text[start:start + _TABLE_WINDOW])

        print(f" Parsed {len(rows)} klubova")
        for r in rows[:3]:
            print(f" {r['poz']:>2}. {r['klub']:<25} {r['bod']:>3} bod, GR={r['gr']:+}")
        return rows
    except Exception as e:
        print(f" ERR: {e}")
        return []
|
||||
|
||||
def _get_or_create_natjecanje(cu, liga):
    """Return the id of the pgz_sport.natjecanja row named liga['natj'], inserting it if absent."""
    cu.execute("""SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1""", (liga['natj'],))
    found = cu.fetchone()
    if found:
        return found[0]

    cu.execute("""INSERT INTO pgz_sport.natjecanja
                      (naziv, sport, razina, sezona, source, source_url)
                  VALUES (%s, 'nogomet', %s, '2025/26', 'hns_semafor', %s)
                  RETURNING id""",
               (liga['natj'], liga['razina'], liga['url']))
    natj_id = cu.fetchone()[0]
    print(f" Created natjecanje id={natj_id}")
    return natj_id


def _find_klub_id(cu, klub):
    """Return the id of the best-matching pgz_sport.klubovi row for a club name, or None."""
    # Exact (case-insensitive) name or substring match; rows with aktivan set
    # sort first, then rows with sport='nogomet', then the lowest id.
    cu.execute("""SELECT id FROM pgz_sport.klubovi
                  WHERE LOWER(naziv) = LOWER(%s)
                     OR LOWER(naziv) LIKE LOWER(%s)
                  ORDER BY
                      CASE WHEN aktivan THEN 0 ELSE 1 END,
                      CASE WHEN sport='nogomet' THEN 0 ELSE 1 END,
                      id ASC LIMIT 1""",
               (klub, f"%{klub}%"))
    hit = cu.fetchone()
    return hit[0] if hit else None


def _replace_standings(cu, natj_id, liga, rows):
    """Delete the stored hns_semafor rows of one competition and insert *rows*; return the insert count."""
    # Clear old rows for this natjecanje (no sezona col)
    cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hns_semafor'",
               (natj_id,))

    inserted = 0
    for r in rows:
        klub_id = _find_klub_id(cu, r['klub'])
        cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                          (natjecanje_id, klub_id, klub_naziv, pozicija,
                           odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                           gol_razlika, bodovi, source, source_url, updated_at)
                      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                              'hns_semafor', %s, now())""",
                   (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['ner'],
                    r['por'], r['gp'], r['gpr'], r['gr'], r['bod'], liga['url']))
        inserted += 1
    return inserted


def _print_report(cu):
    """Print per-competition row counts and the rows whose club name matches the PGŽ name filters."""
    cu.execute("""SELECT n.naziv, count(t.*) FROM pgz_sport.natjecanja n
                  LEFT JOIN pgz_sport.natjecanja_tablice t
                         ON n.id=t.natjecanje_id AND t.source='hns_semafor'
                  WHERE n.source='hns_semafor' AND n.sezona='2025/26'
                  GROUP BY n.id, n.naziv ORDER BY n.id""")
    print("\n=== HNS lige stats ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3} klubova - {r[0]}")

    # PGŽ clubs present in the stored tables (matched by name fragments)
    cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.aktivan
                  FROM pgz_sport.natjecanja_tablice t
                  JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                  LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                  WHERE t.source='hns_semafor'
                    AND (lower(t.klub_naziv) LIKE '%%rijeka%%' OR lower(t.klub_naziv) LIKE '%%opatija%%'
                      OR lower(t.klub_naziv) LIKE '%%krk%%' OR lower(t.klub_naziv) LIKE '%%delnice%%'
                      OR lower(t.klub_naziv) LIKE '%%zamet%%' OR lower(t.klub_naziv) LIKE '%%orijent%%'
                      OR lower(t.klub_naziv) LIKE '%%cresnik%%' OR lower(t.klub_naziv) LIKE '%%goranin%%'
                      OR lower(t.klub_naziv) LIKE '%%kvarner%%')
                  ORDER BY n.naziv, t.pozicija""")
    print("\n=== PGŽ klubovi u HNS ligama 2025/26 ===")
    for r in cu.fetchall():
        matched = f"klub_id={r[5]}" if r[5] else "❌ no match"
        print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} {matched}")


async def run():
    """Scrape every league in LIGE and replace its rows in pgz_sport.natjecanja_tablice.

    For each league that yields rows, the competition row is looked up or
    created, the previously stored 'hns_semafor' rows are deleted and the new
    ones inserted. These steps are committed together per league; if any of
    them raises, that league's changes are rolled back (so its old rows
    remain) and the exception propagates. Leagues for which scrape_one
    returns no rows are skipped. Afterwards one audit_feed row is written and
    a summary is printed.

    The database connection and the browser are closed even when an error
    occurs.
    """
    # The connection stays in psycopg2's default transactional mode so that
    # the DELETE + INSERTs of one league can be committed or rolled back as a unit.
    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()
        all_inserted = 0

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(user_agent="Mozilla/5.0 Chrome/120 Safari/537.36")
                page = await ctx.new_page()

                for liga in LIGE:
                    rows = await scrape_one(page, liga)
                    if not rows:
                        continue

                    try:
                        natj_id = _get_or_create_natjecanje(cu, liga)
                        all_inserted += _replace_standings(cu, natj_id, liga, rows)
                        conn.commit()
                    except Exception:
                        conn.rollback()
                        raise
            finally:
                await browser.close()

        print(f"\n=== TOTAL inserted: {all_inserted} rows ===")

        cu.execute("""INSERT INTO pgz_sport.audit_feed
                          (table_name, action, source, source_url, details)
                      VALUES ('natjecanja_tablice', 'hns_lige_scrape', 'hns_semafor', NULL, %s::jsonb)""",
                   (json.dumps({"inserted": all_inserted, "lige": len(LIGE)}),))
        conn.commit()

        _print_report(cu)
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: execute the whole scrape on a new asyncio event loop.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also triggers a full run.
asyncio.run(run())
|
||||
Reference in New Issue
Block a user