#!/usr/bin/env python3
"""
HKS-CBF scraper - parses Genius Sports embed standings for Croatian basketball leagues.

Path:   /opt/pgz-sport/scrapers/hks_scraper.py
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
Output: pgz_sport.natjecanja_tablice + pgz_sport.klubovi (matching)
"""

import html as ihtml
import json
import os
import re

import psycopg2
import requests

# PostgreSQL connection settings passed to psycopg2.connect(**DB).
#
# The previous definition passed ``port`` twice (6432 and 5432), which Python
# rejects as a repeated keyword argument. The port is now a single value taken
# from the DB_PORT environment variable, defaulting to 5432 (the value that was
# listed last). NOTE(review): if 6432 was the intended port, set DB_PORT=6432.
#
# DB_PASSWORD is mandatory: a missing variable raises KeyError at import time
# instead of attempting a connection without credentials.
DB = dict(
    host="10.10.0.2",
    port=int(os.environ.get("DB_PORT", "5432")),
    dbname="rinet_v3",
    user="rinet",
    password=os.environ["DB_PASSWORD"],
)

# User-Agent string identifying this scraper to the remote service.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Headers sent with every standings request.
HDR = {
    "User-Agent": UA,
    "Accept": "application/json",
    "Referer": "https://www.hks-cbf.hr/",
}

# Competition lookup for the 2025/26 season. One entry per scraped league:
#   natj    - competition name; looked up / stored as pgz_sport.natjecanja.naziv
#   razina  - level label stored in pgz_sport.natjecanja.razina
#   comp_id - competition id interpolated into the Genius Sports standings URL
#   spol    - "M" or "Ž" marker written into each row's extra_data JSON
#   url     - federation page stored as source_url
COMPS = [
    {
        "natj": "Supersport Premijer Liga (M) 2025/26",
        "razina": "Premijer M",
        "comp_id": 42186,
        "spol": "M",
        "url": "https://www.hks-cbf.hr/supersport-premijer/",
    },
    {
        "natj": "Supersport Premijer Liga (Ž) 2025/26",
        "razina": "Premijer Ž",
        "comp_id": 42187,
        "spol": "Ž",
        "url": "https://www.hks-cbf.hr/premijer-zenska-liga/",
    },
    {
        "natj": "1.Muška liga 2025/26",
        "razina": "1.M liga",
        "comp_id": 42259,
        "spol": "M",
        "url": "https://www.hks-cbf.hr/prva-muska-liga/",
    },
]

# Patterns are compiled once; parse_standings runs them for every row and cell.
_TABLE_RE = re.compile(r'<table[^>]*>(.+?)</table>', re.DOTALL)
_ROW_RE = re.compile(r'<tr[^>]*>(.+?)</tr>', re.DOTALL)
_CELL_RE = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
# Team cell = club name with a 2-4 letter upper-case abbreviation glued on.
_TEAM_RE = re.compile(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$')

# Highest column read below is index 9, so a usable row needs 10 cells.
_MIN_CELLS = 10


def _to_int(text):
    """Convert a standings cell to int.

    Thousands separators (",") and an explicit "+" are removed, and the
    Unicode minus sign (U+2212) is mapped to "-". Raises ValueError when the
    remaining text is not an integer.
    """
    return int(text.replace(',', '').replace('+', '').replace('\u2212', '-'))


def parse_standings(html):
    """Parse a Genius Sports standings HTML fragment into a list of row dicts.

    Only the first ``<table>`` is read and its first ``<tr>`` is treated as
    the header. Expected cell layout per data row, e.g.
    ``['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']``.

    Args:
        html: HTML text (the ``html`` field of the embed response).

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``abbrev``, ``gp``,
        ``bod``, ``pob``, ``por``, ``for_pts``, ``ag_pts``, ``gd`` and ``ner``.
        Returns ``[]`` when there is no table or no data row. Rows that are
        too short or contain non-numeric values are skipped.
    """
    if not html:
        return []
    tables = _TABLE_RE.findall(html)
    if not tables:
        return []
    rows_html = _ROW_RE.findall(tables[0])
    if len(rows_html) < 2:
        return []

    standings = []
    for row in rows_html[1:]:  # skip header
        cells = _CELL_RE.findall(row)
        if len(cells) < _MIN_CELLS:
            continue
        # Drop nested markup first, then decode entities such as &amp;.
        clean = [ihtml.unescape(_TAG_RE.sub('', c)).strip() for c in cells]

        # The name cell carries name + abbreviation without a separator
        # (e.g. "KK ZadarZAD"); assume the trailing 2-4 upper-case letters
        # are the abbreviation.
        team_raw = clean[2]
        m = _TEAM_RE.match(team_raw)
        if m:
            klub = m.group(1).strip()
            abbrev = m.group(2)
        else:
            klub = team_raw
            abbrev = None

        try:
            standings.append({
                "poz": int(clean[0]),
                "klub": klub,
                "abbrev": abbrev,
                "gp": _to_int(clean[3]),
                "bod": _to_int(clean[4]),
                "pob": _to_int(clean[5]),
                "por": _to_int(clean[6]),
                "for_pts": _to_int(clean[7]),
                "ag_pts": _to_int(clean[8]),
                "gd": _to_int(clean[9]),
                "ner": 0,  # basketball has no draws
            })
        except ValueError:
            # Non-numeric cell (sub-header, separator row, ...): skip the row.
            continue
    return standings

# Lower-cased substrings of klubovi.grad that mark a club as PGŽ-based even
# when its region column is not 'PGŽ'.
_PGZ_GRAD_HINTS = (
    'rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab', 'lošinj',
    'losinj', 'vrbnik', 'novi vinodolski', 'čavle', 'cavle', 'kraljevica',
    'kostrena', 'kastav',
)


def _fetch_standings_html(comp):
    """Download the standings embed for one competition and return its HTML.

    Raises requests.RequestException on transport/HTTP errors and ValueError
    when the body is not JSON. A missing or null ``html`` field yields ''.
    """
    api_url = f"https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
    resp = requests.get(api_url, headers=HDR, timeout=20)
    # An error page must not be mistaken for an empty standings table.
    resp.raise_for_status()
    return resp.json().get('html') or ''


def _get_or_create_natjecanje(cu, comp):
    """Return the pgz_sport.natjecanja id for ``comp``, inserting it if absent."""
    cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
    found = cu.fetchone()
    if found:
        return found[0]
    cu.execute("""INSERT INTO pgz_sport.natjecanja
                  (naziv, sport, razina, sezona, source, source_url)
                  VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
                  RETURNING id""",
               (comp['natj'], comp['razina'], comp['url']))
    natj_id = cu.fetchone()[0]
    print(f" Created natjecanje id={natj_id}")
    return natj_id


def _match_klub(cu, naziv):
    """Find the best pgz_sport.klubovi match for a scraped club name.

    Returns ``(klub_id, is_pgz)``; ``(None, False)`` when nothing matches.
    Candidates match on case-insensitive equality or substring and are ranked
    active first, then basketball, then region 'PGŽ', then lowest id.
    """
    if not naziv:
        # An empty name would become LIKE '%%' and match an arbitrary club.
        return None, False
    cu.execute("""SELECT id, region, grad FROM pgz_sport.klubovi
                  WHERE LOWER(naziv) = LOWER(%s)
                     OR LOWER(naziv) LIKE LOWER(%s)
                  ORDER BY
                      CASE WHEN aktivan THEN 0 ELSE 1 END,
                      CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                      CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                      id ASC LIMIT 1""",
               (naziv, f"%{naziv}%"))
    hit = cu.fetchone()
    if not hit:
        return None, False
    klub_id, region, grad = hit
    grad_lc = (grad or '').lower()
    is_pgz = region == 'PGŽ' or any(hint in grad_lc for hint in _PGZ_GRAD_HINTS)
    return klub_id, is_pgz


def _print_report(cu):
    """Print per-league row counts and the PGŽ clubs currently stored."""
    cu.execute("""SELECT n.naziv, count(t.*),
                         count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
                  FROM pgz_sport.natjecanja n
                  LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hks_genius'
                  WHERE n.source='hks_genius'
                  GROUP BY n.id, n.naziv ORDER BY n.id""")
    print("\n=== HKS lige stats ===")
    for rec in cu.fetchall():
        print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

    cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.naziv AS db_naziv, k.aktivan
                  FROM pgz_sport.natjecanja_tablice t
                  JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                  LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                  WHERE t.source='hks_genius' AND k.region='PGŽ'
                  ORDER BY n.naziv, t.pozicija""")
    print("\n=== PGŽ klubovi u HKS ===")
    for rec in cu.fetchall():
        print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b GR{rec[4]:+} -> {rec[5]} '{rec[6]}'")


def main():
    """Scrape every competition in COMPS and refresh its stored standings.

    For each competition the embed is fetched and parsed, then the stored
    'hks_genius' rows are replaced inside a single transaction, so a failure
    part-way through leaves the previous table intact. A competition whose
    fetch fails or parses to zero rows is skipped without touching stored
    data. Finally a summary is printed and one pgz_sport.audit_feed row is
    written. Database errors propagate to the caller.
    """
    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_klubovi_seen = set()

        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            try:
                html = _fetch_standings_html(comp)
            except (requests.RequestException, ValueError, AttributeError) as e:
                print(f" ERR fetch: {e}")
                continue

            rows = parse_standings(html)
            print(f" Parsed {len(rows)} klubova")
            if not rows:
                # Nothing usable upstream: do not wipe the existing table.
                print(" SKIP: no rows parsed, keeping stored standings")
                continue

            # ``with conn`` commits on success and rolls back on exception,
            # making the delete + inserts atomic per competition.
            with conn:
                natj_id = _get_or_create_natjecanje(cu, comp)
                cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'", (natj_id,))

                for row in rows:
                    klub_id, is_pgz = _match_klub(cu, row['klub'])
                    if is_pgz:
                        pgz_klubovi_seen.add(row['klub'])

                    cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                                  (natjecanje_id, klub_id, klub_naziv, pozicija,
                                   odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                                   gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                                  VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s,
                                          'hks_genius', %s, now(), %s::jsonb)""",
                               (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'],
                                row['por'], row['for_pts'], row['ag_pts'], row['gd'], row['bod'],
                                comp['url'],
                                json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
                    total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")

        with conn:
            _print_report(cu)
            cu.execute("""INSERT INTO pgz_sport.audit_feed
                          (table_name, action, source, source_url, details)
                          VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
                       (json.dumps({"inserted": total_inserted, "comps": len(COMPS),
                                    # sorted: stable audit payload across runs
                                    "pgz_seen": sorted(pgz_klubovi_seen)}),))
    finally:
        # Always release the connection, including on error paths.
        conn.close()

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()