Files
pgz-sport/scrapers/hks_scraper.py_prije_env_deepseek
T

175 lines
8.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
HKS-CBF scraper - parses Genius Sports embedded standings for Croatian basketball (košarka) leagues.
Path: /opt/pgz-sport/scrapers/hks_scraper.py
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
Output: pgz_sport.natjecanja_tablice (standings rows, matched against pgz_sport.klubovi);
        missing pgz_sport.natjecanja rows are created and each run is logged to pgz_sport.audit_feed
"""
import os
import requests, re, json, psycopg2, html as ihtml
# Connection parameters, passed as psycopg2.connect(**DB).
# The previous call listed `port` twice (6432 and 5432); Python rejects that with
# "SyntaxError: keyword argument repeated", so the module could not be imported at all.
# NOTE(review): 6432 (the first value that was listed) is kept as the default -
# confirm which port is intended. Set DB_PORT to override it without editing this file.
# DB_PASSWORD is mandatory: a missing variable raises KeyError at import time.
DB = dict(
    host="10.10.0.2",
    port=int(os.environ.get("DB_PORT", "6432")),
    dbname='rinet_v3',
    user='rinet',
    password=os.environ["DB_PASSWORD"],
)
# Identifies this scraper to the remote server.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# HTTP headers sent with every standings request.
HDR = {
    "User-Agent": UA,
    "Accept": "application/json",
    "Referer": "https://www.hks-cbf.hr/",
}

# Competition ID lookup - 2025/26 season.
# natj    -> competition name (looked up / stored as natjecanja.naziv)
# razina  -> league level label
# comp_id -> Genius Sports competition id interpolated into the standings URL
# spol    -> "M" / "Ž", stored in the row's extra_data
# url     -> public league page recorded as source_url
COMPS = [
    {
        "natj": "Supersport Premijer Liga (M) 2025/26",
        "razina": "Premijer M",
        "comp_id": 42186,
        "spol": "M",
        "url": "https://www.hks-cbf.hr/supersport-premijer/",
    },
    {
        "natj": "Supersport Premijer Liga (Ž) 2025/26",
        "razina": "Premijer Ž",
        "comp_id": 42187,
        "spol": "Ž",
        "url": "https://www.hks-cbf.hr/premijer-zenska-liga/",
    },
    {
        "natj": "1.Muška liga 2025/26",
        "razina": "1.M liga",
        "comp_id": 42259,
        "spol": "M",
        "url": "https://www.hks-cbf.hr/prva-muska-liga/",
    },
]
# Patterns are compiled once at import time instead of on every call.
_TABLE_RE = re.compile(r'<table[^>]*>(.+?)</table>', re.DOTALL)
_ROW_RE = re.compile(r'<tr[^>]*>(.+?)</tr>', re.DOTALL)
_CELL_RE = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
# Team cell = club name with the abbreviation glued on (e.g. "KK ZadarZAD").
_TEAM_RE = re.compile(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$')
# Columns 0..9 are read below, so a usable row needs at least ten cells.
_MIN_CELLS = 10


def _to_int(text):
    """Convert a numeric cell such as '2,669', '+460' or '\u2212 12' to int."""
    cleaned = (text.replace(',', '')
                   .replace('+', '')
                   .replace('\u2212', '-')  # typographic minus -> ASCII minus
                   .replace(' ', ''))
    return int(cleaned)


def _split_team(team_raw):
    """Split 'KK ZadarZAD' into ('KK Zadar', 'ZAD'); abbrev is None if absent."""
    m = _TEAM_RE.match(team_raw)
    if m:
        return m.group(1).strip(), m.group(2)
    return team_raw, None


def parse_standings(html):
    """Parse a Genius Sports standings HTML fragment into a list of row dicts.

    Only the first <table> is read and its first <tr> is treated as the header.
    Expected cell layout per data row (taken from a sample row):
    ['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']
    i.e. position, (unused), team, played, points, wins, losses,
    points for, points against, point difference.

    Args:
        html: HTML markup containing the standings table; None or an empty
            string is accepted and yields an empty list.

    Returns:
        A list of dicts with keys poz, klub, abbrev, gp, bod, pob, por,
        for_pts, ag_pts, gd and ner (always 0 - basketball has no draws).
        Rows with fewer than ten cells or with non-numeric values in a
        numeric column are skipped.
    """
    if not html:
        return []
    table = _TABLE_RE.search(html)
    if table is None:
        return []
    rows_html = _ROW_RE.findall(table.group(1))
    if len(rows_html) < 2:
        return []

    standings = []
    for row in rows_html[1:]:  # skip header
        cells = _CELL_RE.findall(row)
        if len(cells) < _MIN_CELLS:
            continue
        # Drop nested tags, decode entities, trim whitespace.
        clean = [ihtml.unescape(_TAG_RE.sub('', c)).strip() for c in cells]
        # NOTE(review): the abbreviation is assumed to be the trailing 2-4
        # uppercase letters; a club name that itself ends in capitals would be
        # split in the wrong place - verify against the real markup.
        klub, abbrev = _split_team(clean[2])
        try:
            standings.append({
                "poz": int(clean[0]),
                "klub": klub,
                "abbrev": abbrev,
                "gp": _to_int(clean[3]),
                "bod": _to_int(clean[4]),
                "pob": _to_int(clean[5]),
                "por": _to_int(clean[6]),
                "for_pts": _to_int(clean[7]),
                "ag_pts": _to_int(clean[8]),
                "gd": _to_int(clean[9]),
                "ner": 0,  # basketball has no draws
            })
        except ValueError:
            # Non-numeric cell (sub-header, separator row, ...): not a team row.
            continue
    return standings
def main():
    """Scrape every competition in COMPS and refresh its standings in the DB.

    For each competition: fetch the Genius Sports standings JSON, parse the
    embedded HTML with parse_standings(), look up (or create) the matching
    pgz_sport.natjecanja row, then replace that competition's 'hks_genius'
    rows in pgz_sport.natjecanja_tablice. Afterwards print summary reports
    and append one row to pgz_sport.audit_feed.

    Old standings are deleted only when new rows were actually parsed, and
    the delete + inserts of one competition run in a single transaction, so
    a failed fetch, changed markup or mid-run error cannot leave a league
    table empty or half-filled.

    Raises:
        psycopg2.Error: database errors are not handled here; the open
            transaction is rolled back and the connection is closed.
    """
    # A club counts as PGŽ when region is 'PGŽ' or its town contains one of these.
    pgz_grad_hints = (
        'rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab', 'lošinj',
        'losinj', 'vrbnik', 'novi vinodolski', 'čavle', 'cavle', 'kraljevica',
        'kostrena', 'kastav',
    )

    # Default (non-autocommit) mode: each `with conn:` block below is one
    # transaction, committed on success and rolled back on an exception.
    # NOTE(review): if the connection goes through a pooler, it must allow
    # multi-statement transactions - confirm for the configured port.
    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_klubovi_seen = set()

        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            api_url = f"https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
            try:
                resp = requests.get(api_url, headers=HDR, timeout=20)
                resp.raise_for_status()  # treat 4xx/5xx as a failed fetch
                payload = resp.json()    # invalid JSON raises a ValueError subclass
            except (requests.RequestException, ValueError) as e:
                print(f" ERR fetch: {e}")
                continue
            # The standings markup is expected under the 'html' key of the JSON object.
            html = payload.get('html') if isinstance(payload, dict) else None
            rows = parse_standings(html or '')
            print(f" Parsed {len(rows)} klubova")

            with conn:
                # Get/create natjecanje
                cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
                found = cu.fetchone()
                if found:
                    natj_id = found[0]
                else:
                    cu.execute("""INSERT INTO pgz_sport.natjecanja
                        (naziv, sport, razina, sezona, source, source_url)
                        VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
                        RETURNING id""",
                        (comp['natj'], comp['razina'], comp['url']))
                    natj_id = cu.fetchone()[0]
                    print(f" Created natjecanje id={natj_id}")

                if not rows:
                    # Nothing parsed: keep the previous standings instead of wiping them.
                    print(" SKIP: 0 rows parsed, existing rows kept")
                    continue

                # Clear old rows
                cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'", (natj_id,))
                for row in rows:
                    # Match klub: exact name or substring, preferring active /
                    # basketball / PGŽ clubs. region and grad come back in the
                    # same query so no second lookup is needed.
                    # NOTE(review): an exact name match is not ranked above a
                    # substring match - confirm that is intended.
                    klub_id = None
                    cu.execute("""SELECT id, region, grad FROM pgz_sport.klubovi
                        WHERE LOWER(naziv) = LOWER(%s)
                        OR LOWER(naziv) LIKE LOWER(%s)
                        ORDER BY
                        CASE WHEN aktivan THEN 0 ELSE 1 END,
                        CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                        CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                        id ASC LIMIT 1""",
                        (row['klub'], f"%{row['klub']}%"))
                    match = cu.fetchone()
                    if match:
                        klub_id, region, grad = match
                        # Check if PGŽ
                        grad_lc = (grad or '').lower()
                        if region == 'PGŽ' or any(g in grad_lc for g in pgz_grad_hints):
                            pgz_klubovi_seen.add(row['klub'])
                    cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija,
                        odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                        gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                        VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s,
                        'hks_genius', %s, now(), %s::jsonb)""",
                        (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'],
                         row['por'], row['for_pts'], row['ag_pts'], row['gd'], row['bod'],
                         comp['url'],
                         json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
                    total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")

        with conn:
            # Stats
            cu.execute("""SELECT n.naziv, count(t.*),
                count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
                FROM pgz_sport.natjecanja n
                LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hks_genius'
                WHERE n.source='hks_genius'
                GROUP BY n.id, n.naziv ORDER BY n.id""")
            print("\n=== HKS lige stats ===")
            for stat in cu.fetchall():
                print(f" {stat[1]:>3} klubova ({stat[2]} matched) {stat[0]}")

            # PGŽ clubs
            cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.naziv AS db_naziv, k.aktivan
                FROM pgz_sport.natjecanja_tablice t
                JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                WHERE t.source='hks_genius' AND k.region='PGŽ'
                ORDER BY n.naziv, t.pozicija""")
            print("\n=== PGŽ klubovi u HKS ===")
            for r in cu.fetchall():
                print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} -> {r[5]} '{r[6]}'")

            # Audit trail; the club list is sorted so the stored JSON is stable.
            cu.execute("""INSERT INTO pgz_sport.audit_feed
                (table_name, action, source, source_url, details)
                VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
                (json.dumps({"inserted": total_inserted, "comps": len(COMPS), "pgz_seen": sorted(pgz_klubovi_seen)}),))
    finally:
        conn.close()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()