PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
Executable
+102
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
|
||||
import html as h_unesc
import os
import re

import psycopg2
import requests
|
||||
|
||||
# Connection settings passed to psycopg2.connect().
# Every value can be overridden through the environment so credentials do not
# have to live in the source tree; the literals are kept only as fallbacks so
# the script behaves exactly as before when no override is set.
# NOTE(review): a database password is committed here in plain text — rotate it
# and drop the fallback once the environment variable is provisioned.
DB = dict(
    host=os.environ.get("RINET_DB_HOST", "10.10.0.2"),
    port=int(os.environ.get("RINET_DB_PORT", "6432")),
    dbname=os.environ.get("RINET_DB_NAME", "rinet_v3"),
    user=os.environ.get("RINET_DB_USER", "rinet"),
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent header value sent with every request made through the session.
UA = "RiNET-Civic/1.0"

# Shared HTTP session used by fetch(); carries the User-Agent header.
s = requests.Session()
s.headers.update({"User-Agent": UA})
|
||||
|
||||
def fetch(url, timeout=15):
    """Download *url* through the shared session and return the body text.

    This is a best-effort helper: it returns ``None`` when the server answers
    with anything other than HTTP 200 or when the request fails.

    Args:
        url: Address to request with GET.
        timeout: Seconds passed to ``requests`` as the request timeout.

    Returns:
        The response body as text, or ``None`` on failure.
    """
    try:
        r = s.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-layer failures are swallowed; KeyboardInterrupt and
        # programming errors are no longer hidden as they were by a bare except.
        return None
    return r.text if r.status_code == 200 else None
|
||||
|
||||
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose first row contains a marker.

    The page is scanned with regular expressions (no HTML parser). For each
    ``<table>`` the first ``<tr>`` is treated as the header; its cells are
    stripped of tags, entity-unescaped and trimmed before the substring test.

    Args:
        html_text: Page markup; ``None`` or an empty string yields ``None``.
        header_marker: Substring that must occur in one of the header cells.

    Returns:
        A list with the inner HTML of every ``<tr>`` of the matching table
        (header row first), or ``None`` when no table matches.
    """
    if not html_text:
        return None

    # HTML tag names are case-insensitive, so <TABLE>/<TR>/<TD> must match too.
    flags = re.DOTALL | re.IGNORECASE
    for table in re.findall(r'<table\b[^>]*>.*?</table>', html_text, flags):
        rows = re.findall(r'<tr\b[^>]*>(.*?)</tr>', table, flags)
        if not rows:
            continue
        header_cells = re.findall(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', rows[0], flags)
        # Strip tags first, then decode entities, so the header text is
        # compared in the same cleaned form the row parser produces.
        header_text = [
            h_unesc.unescape(re.sub(r'<[^>]+>', '', cell)).strip()
            for cell in header_cells
        ]
        if any(header_marker in text for text in header_text):
            return rows
    return None
|
||||
|
||||
def parse_standings_rows(rows):
    """Turn standings table rows into a list of dictionaries.

    The first element of *rows* is treated as the header and skipped. Every
    other row must provide at least ten cells, read in this order: position,
    club name, played, wins, draws, losses, goals for, goals against, goal
    difference, points. Rows with fewer cells or non-numeric values are
    skipped silently.

    Args:
        rows: Inner HTML of each ``<tr>``, header first. ``None`` or an empty
            list yields an empty result.

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``odigrano``,
        ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl``, ``bod``.
    """
    out = []
    if not rows:
        return out

    cell_flags = re.DOTALL | re.IGNORECASE
    for row in rows[1:]:
        cells = re.findall(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', row, cell_flags)
        # Strip tags BEFORE decoding entities: decoding first can turn an
        # escaped '&gt;' inside an attribute into a real '>' and cut the tag
        # match short, leaving attribute debris in the cell text.
        cleaned = [
            h_unesc.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells
        ]
        # Indices 0..9 are read below, so ten cells are required.
        if len(cleaned) < 10:
            continue
        try:
            poz = int(cleaned[0].rstrip('.'))
            klub = cleaned[1]
            odigrano = int(cleaned[2])
            pobjede = int(cleaned[3])
            nerij = int(cleaned[4])
            porazi = int(cleaned[5])
            gz = int(cleaned[6])
            gp = int(cleaned[7])
            # Normalise typographic minus signs (U+2212 minus, U+2013 en dash)
            # to ASCII '-' so int() accepts the goal difference.
            razl = int(
                cleaned[8].replace('+', '').replace('−', '-').replace('–', '-')
            )
            bod = int(cleaned[9])
        except (ValueError, IndexError):
            continue
        out.append({
            "poz": poz, "klub": klub, "odigrano": odigrano, "pobjede": pobjede,
            "nerij": nerij, "porazi": porazi, "gz": gz, "gp": gp,
            "razl": razl, "bod": bod,
        })
    return out
|
||||
|
||||
# === Sources ===
# One tuple per league page, unpacked by the import loop as
# (url, naziv, razina, ext_id, sezona). ext_id is matched against
# natjecanja.external_id to find the competition row to refresh.
LIGE = [
    (
        "https://hnl.hr/",
        "SuperSport HNL",
        "1.HNL",
        "supersport_hnl_2025_2026",
        "2025/2026",
    ),
]

# Module-level connection and cursor shared by find_klub() and the import
# loop; autocommit means each statement takes effect immediately.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cr = conn.cursor()
|
||||
|
||||
def find_klub(naziv):
    """Look up the id of the club whose name matches *naziv*.

    Football clubs are tried first, with the patterns ``HNK <name>``,
    ``NK <name>``, the bare name and finally ``%<name>%``. If none matches,
    the substring search is repeated without the sport filter. Uses the
    module-level cursor ``cr``.

    Args:
        naziv: Club name as scraped from the standings table.

    Returns:
        The id of the first matching row, or ``None`` when nothing matches or
        the name is empty.
    """
    naziv = (naziv or "").strip()
    if not naziv:
        # An empty name would make the wildcard pattern '%%', which matches
        # every row and would return an arbitrary club.
        return None

    # Escape LIKE metacharacters so a scraped '%' or '_' is matched literally
    # (backslash is PostgreSQL's default LIKE escape character).
    safe = naziv.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    # NOTE(review): id 4426 is excluded explicitly; the reason is not visible
    # in this file — confirm it is still required.
    # HNK / NK prefixes
    for q in (f"HNK {safe}", f"NK {safe}", safe, f"%{safe}%"):
        cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1", (q,))
        r = cr.fetchone()
        if r:
            return r[0]

    # Fallback: same substring search without the sport filter.
    cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{safe}%",))
    r = cr.fetchone()
    return r[0] if r else None
|
||||
|
||||
# Import loop: for every configured league, fetch the page, parse the standings
# table and replace the stored rows of the matching competition.
# NOTE(review): the connection runs in autocommit mode, so the DELETE and the
# INSERTs below are not atomic — readers can briefly see a partial table.
try:
    for url, naziv, razina, ext_id, sezona in LIGE:
        print(f"=== {naziv} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = find_table_with_header(body, "Klub")
        if not rows:
            print(" no table found")
            continue
        parsed = parse_standings_rows(rows)
        print(f" {len(parsed)} rows parsed")
        if not parsed:
            # Nothing usable was parsed: skip the DELETE so the previously
            # stored standings are not wiped and left empty.
            print(" nothing parsed, keeping existing rows")
            continue

        cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
        found = cr.fetchone()
        if found is None:
            # Without this guard fetchone()[0] raises TypeError on a missing row.
            print(f" natjecanje not found for external_id={ext_id}")
            continue
        nid = found[0]
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))

        matched = 0
        for r in parsed:
            kid = find_klub(r["klub"])
            if kid:
                matched += 1
            # klub_id may be NULL when no club matched; the scraped name is
            # stored regardless, truncated to 200 characters.
            cr.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                porazi, gol_z, gol_p, gol_razlika, bodovi)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                 r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
        print(f" matched klub_id: {matched}/{len(parsed)}")

    # Verify Rijeka
    cr.execute("""SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
        WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
    for r in cr.fetchall():
        print(f" {r}")
finally:
    # Release the connection even when a query above raises.
    conn.close()
|
||||
Reference in New Issue
Block a user