Files
pgz-sport/scrapers/hnl_scraper.py_prije_env_deepseek
T

104 lines
4.1 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
import os
import requests, re, html as h_unesc, psycopg2
# Connection settings for the target PostgreSQL database. The password is
# read from the DB_PASSWORD environment variable (KeyError if it is unset).
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}
# User-Agent header sent with every request made through the session below.
UA = "RiNET-Civic/1.0"
# Shared HTTP session reused for all page downloads.
s = requests.Session()
s.headers.update({"User-Agent": UA})
def fetch(url):
    """Download *url* with the shared session and return the response body.

    Args:
        url: Absolute URL to request.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, timeout, connection or other request
        error).
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        # Best-effort scrape: network problems are reported as "no data"
        # rather than crashing. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit and programming errors propagate.
        return None
    return r.text if r.status_code == 200 else None
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose first row contains *header_marker*.

    Args:
        html_text: HTML document to search. ``None`` or an empty string
            yields ``None``.
        header_marker: Substring looked for (case-sensitively) in the
            tag-stripped cells of each table's first row.

    Returns:
        A list with the inner HTML of every ``<tr>`` of the matching table
        (header row first), or ``None`` when no table matches.
    """
    if not html_text:
        return None
    # HTML tag names are case-insensitive, so match <TABLE>/<TR>/<TH> too.
    # The \b keeps e.g. <thead> from being mistaken for a <th> cell.
    flags = re.DOTALL | re.IGNORECASE
    for table in re.findall(r'<table\b[^>]*>.*?</table>', html_text, flags):
        rows = re.findall(r'<tr\b[^>]*>(.*?)</tr>', table, flags)
        if not rows:
            continue
        header_cells = re.findall(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', rows[0], flags)
        header_text = [re.sub(r'<[^>]+>', '', cell).strip() for cell in header_cells]
        if any(header_marker in text for text in header_text):
            return rows
    return None
def parse_standings_rows(rows):
    """Parse standings table rows into a list of per-club dicts.

    The first element of *rows* is treated as the header and skipped. Each
    remaining row must provide at least ten cells, read by position as:
    position, club name, played, wins, draws, losses, ``gz``, ``gp``
    (presumably goals for / goals against), goal difference, points.

    Args:
        rows: Inner HTML of the table's ``<tr>`` elements, header first.
            ``None`` or an empty list yields ``[]``.

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``odigrano``,
        ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and
        ``bod``. Rows with too few cells or non-numeric values are skipped.
    """
    if not rows:
        return []
    # Pages may render a negative goal difference with a typographic minus
    # (U+2212) or a dash, which int() rejects; map those to ASCII '-'.
    minus_variants = str.maketrans({"\u2212": "-", "\u2013": "-", "\u2014": "-"})
    out = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        cleaned = [re.sub(r'<[^>]+>', '', h_unesc.unescape(c)).strip() for c in cells]
        # Indices 0..9 are read below, so ten cells are required.
        if len(cleaned) < 10:
            continue
        try:
            poz = int(cleaned[0].rstrip('.'))
            klub = cleaned[1]
            odigrano = int(cleaned[2])
            pobjede = int(cleaned[3])
            nerij = int(cleaned[4])
            porazi = int(cleaned[5])
            gz = int(cleaned[6])
            gp = int(cleaned[7])
            razl = int(cleaned[8].translate(minus_variants).replace('+', ''))
            bod = int(cleaned[9])
        except (ValueError, IndexError):
            # Separator, legend or otherwise malformed row: ignore it.
            continue
        out.append({"poz": poz, "klub": klub, "odigrano": odigrano, "pobjede": pobjede,
                    "nerij": nerij, "porazi": porazi, "gz": gz, "gp": gp, "razl": razl, "bod": bod})
    return out
# === Sources ===
# One tuple per league, unpacked by the scrape loop as
# (url, naziv, razina, ext_id, sezona).
LIGE = [
    (
        "https://hnl.hr/",
        "SuperSport HNL",
        "1.HNL",
        "supersport_hnl_2025_2026",
        "2025/2026",
    ),
]
# Single connection in autocommit mode and one cursor, shared by the rest of
# the script.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cr = conn.cursor()
def find_klub(naziv):
    """Look up the id of the ``pgz_sport.klubovi`` row matching *naziv*.

    Tries, in order, ``HNK <naziv>``, ``NK <naziv>``, the bare name and a
    substring pattern, restricted to rows whose ``sport`` contains
    "nogomet". If none of those hit, falls back to a substring match over
    any sport. Row id 4426 is always excluded (the reason is not visible in
    this file).

    Args:
        naziv: Club name as scraped from the standings table.

    Returns:
        The id of the first matching row, or ``None`` when nothing matches
        or *naziv* is empty/blank.
    """
    # An empty name would turn the substring pattern into '%%', which matches
    # every row and would attach an arbitrary club to the standings entry.
    if not naziv or not naziv.strip():
        return None
    # NOTE(review): '%' or '_' inside naziv act as ILIKE wildcards; they are
    # not escaped here.
    # HNK / NK prefixes first, then the bare name, then a substring match.
    for q in [f"HNK {naziv}", f"NK {naziv}", naziv, f"%{naziv}%"]:
        cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1", (q,))
        r = cr.fetchone()
        if r:
            return r[0]
    # Last resort: substring match without the sport filter.
    cr.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{naziv}%",))
    r = cr.fetchone()
    return r[0] if r else None
# Refresh the stored standings of every configured league: download the page,
# locate and parse the standings table, then replace the league's rows in
# pgz_sport.natjecanja_tablice.
for url, naziv, razina, ext_id, sezona in LIGE:
    print(f"=== {naziv} ===")
    body = fetch(url)
    if not body:
        print(" fetch failed")
        continue
    rows = find_table_with_header(body, "Klub")
    if not rows:
        print(" no table found")
        continue
    parsed = parse_standings_rows(rows)
    print(f" {len(parsed)} rows parsed")
    if not parsed:
        # Nothing usable was scraped: keep the previously stored table
        # instead of deleting it and inserting nothing.
        print(" no rows parsed, keeping existing table")
        continue
    cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
    found = cr.fetchone()
    if found is None:
        # fetchone() returns None when the competition row does not exist;
        # skip this league instead of failing on found[0].
        print(f" natjecanje not found for external_id={ext_id}")
        continue
    nid = found[0]
    # NOTE(review): the connection is in autocommit mode, so this DELETE and
    # the INSERTs below are not one atomic transaction.
    cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))
    matched = 0
    for r in parsed:
        kid = find_klub(r["klub"])
        if kid:
            matched += 1
        # klub_id may be None when no club matched; klub_naziv is cut to
        # 200 characters before insertion.
        cr.execute("""INSERT INTO pgz_sport.natjecanja_tablice
(natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
porazi, gol_z, gol_p, gol_razlika, bodovi)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                   (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                    r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
    print(f" matched klub_id: {matched}/{len(parsed)}")
# Verify Rijeka: print every stored standings row whose club name contains
# "Rijeka" across all 'hns_manual' competitions, then close the connection.
cr.execute(
    """SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')"""
)
rijeka_rows = cr.fetchall()
for rijeka_row in rijeka_rows:
    print(f" {rijeka_row}")
conn.close()