107 lines
4.2 KiB
Python
Executable File
107 lines
4.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
from dotenv import load_dotenv
|
||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||
# auto-added by patch_scrapers_with_dotenv.sh
|
||
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
|
||
import os
|
||
import requests, re, html as h_unesc, psycopg2
|
||
|
||
# PostgreSQL connection parameters. The password is read from the
# DB_PASSWORD environment variable; a missing variable raises KeyError here.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# User-Agent string sent with every request made through the shared session.
UA = "RiNET-Civic/1.0"

# One shared HTTP session for all fetches.
s = requests.Session()
s.headers["User-Agent"] = UA
|
||
|
||
def fetch(url):
    """Download *url* through the shared session and return the body text.

    Returns the response text when the server answers with HTTP 200.
    Returns ``None`` for any other status code, or when the request itself
    fails at the HTTP/network layer (connection error, timeout, bad URL).
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        # Only request-level failures mean "fetch failed". A bare ``except:``
        # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return None
    return r.text if r.status_code == 200 else None
|
||
|
||
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose first row mentions *header_marker*.

    Every ``<table>...</table>`` in *html_text* is examined in document
    order. For each one, the inner HTML of its ``<tr>`` elements is
    collected; the cells of the first row are stripped of tags and
    surrounding whitespace and checked for *header_marker* as a substring.

    Returns the list of ``<tr>`` inner-HTML strings (header row included) of
    the first matching table, or ``None`` when no table matches.
    """
    for table_html in re.findall(r'<table[^>]*>.*?</table>', html_text, re.DOTALL):
        row_bodies = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL)
        if row_bodies:
            # Only the first row is treated as the header.
            header_cells = re.findall(
                r'<t[hd][^>]*>(.*?)</t[hd]>', row_bodies[0], re.DOTALL
            )
            for cell in header_cells:
                label = re.sub(r'<[^>]+>', '', cell).strip()
                if header_marker in label:
                    return row_bodies
    return None
|
||
|
||
def parse_standings_rows(rows):
    """Convert standings-table rows into a list of per-club dicts.

    *rows* is a list of ``<tr>`` inner-HTML strings; the first entry is
    treated as the header and skipped. Each remaining row must provide at
    least ten cells, read positionally as: position, club name, played,
    wins, draws, losses, goals for, goals against, goal difference, points.

    Rows with fewer than ten cells, or with a non-integer value in any
    numeric column, are silently skipped.

    Returns a list of dicts with keys ``poz``, ``klub``, ``odigrano``,
    ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl``, ``bod``.
    """
    out = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        # Strip markup first and decode entities afterwards: decoding first
        # would turn escaped text such as "&lt;U19&gt;" into something that
        # looks like a tag and gets deleted from the club name.
        cleaned = [h_unesc.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        # Ten columns are read below (indices 0..9).
        if len(cleaned) < 10:
            continue
        try:
            poz = int(cleaned[0].rstrip('.'))
            odigrano, pobjede, nerij, porazi, gz, gp = (int(v) for v in cleaned[2:8])
            # Goal difference may carry an explicit "+" or a Unicode minus
            # sign (U+2212), neither of which int() accepts as-is here.
            razl = int(cleaned[8].replace('+', '').replace('−', '-'))
            bod = int(cleaned[9])
        except ValueError:
            # Not a data row (sub-header, separator, malformed numbers).
            continue
        out.append({"poz": poz, "klub": cleaned[1], "odigrano": odigrano,
                    "pobjede": pobjede, "nerij": nerij, "porazi": porazi,
                    "gz": gz, "gp": gp, "razl": razl, "bod": bod})
    return out
|
||
|
||
# === Sources ===
# One tuple per league, in the order the main loop unpacks them:
# (url, naziv, razina, ext_id, sezona).
LIGE = [
    (
        "https://hnl.hr/",
        "SuperSport HNL",
        "1.HNL",
        "supersport_hnl_2025_2026",
        "2025/2026",
    ),
]

# Single autocommit connection and cursor shared by the rest of the script.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cr = conn.cursor()
|
||
|
||
def find_klub(naziv):
    """Look up the ``pgz_sport.klubovi`` id that best matches a club name.

    Candidates are tried in order against football clubs only (``sport``
    containing "nogomet"): ``"HNK <naziv>"``, ``"NK <naziv>"``, the bare
    name, then a substring match. If none hits, a final substring match is
    run across all sports. Row id 4426 is always excluded.
    NOTE(review): the reason 4426 is excluded is not visible here — confirm.

    Returns the first matching id, or ``None`` when nothing matches or when
    *naziv* is empty/blank.
    """
    naziv = (naziv or "").strip()
    if not naziv:
        # A blank name would build the pattern "%%", which matches every
        # row and would attach an arbitrary club to the standings entry.
        return None

    # HNK / NK prefixes first, then exact name, then substring.
    for q in (f"HNK {naziv}", f"NK {naziv}", naziv, f"%{naziv}%"):
        cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1", (q,))
        r = cr.fetchone()
        if r:
            return r[0]

    # Fallback: substring match without the sport filter.
    cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{naziv}%",))
    r = cr.fetchone()
    return r[0] if r else None
|
||
|
||
# Main run: for each configured league, scrape the standings table and
# replace that competition's rows in pgz_sport.natjecanja_tablice.
try:
    for url, naziv, razina, ext_id, sezona in LIGE:
        print(f"=== {naziv} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = find_table_with_header(body, "Klub")
        if not rows:
            print(" no table found")
            continue
        parsed = parse_standings_rows(rows)
        print(f" {len(parsed)} rows parsed")
        if not parsed:
            # The connection is in autocommit mode, so the DELETE below is
            # permanent. Never wipe the stored table when the page yielded
            # nothing usable (layout change, empty table, ...).
            print(" nothing parsed, keeping existing rows")
            continue

        cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
        found = cr.fetchone()
        if found is None:
            # Without this check a missing competition row crashed the run
            # with a TypeError on ``None[0]``.
            print(f" natjecanje not found for external_id={ext_id}")
            continue
        nid = found[0]

        # Replace strategy: drop the competition's current rows, then insert
        # the freshly scraped ones.
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))

        matched = 0
        for r in parsed:
            kid = find_klub(r["klub"])  # may be None; the row is stored anyway
            if kid:
                matched += 1
            cr.execute(
                """INSERT INTO pgz_sport.natjecanja_tablice
                   (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                    porazi, gol_z, gol_p, gol_razlika, bodovi)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                 r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]),
            )
        print(f" matched klub_id: {matched}/{len(parsed)}")

    # Verify Rijeka: print every stored standings row whose club name
    # contains "Rijeka" for the manually-sourced competitions.
    cr.execute("""SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
                  WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
    for r in cr.fetchall():
        print(f" {r}")
finally:
    # Release the connection even when a query or fetch raises mid-run.
    conn.close()
|