300 lines
12 KiB
Python
Executable File
300 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
HNS PGŽ FULL SCRAPER – svi klubovi, igrači, sezone, utakmice
|
||
─────────────────────────────────────────────────────────────
|
||
Sprema u:
|
||
pgz_sport.clanovi
|
||
pgz_sport.hns_player_seasons
|
||
pgz_sport.hns_player_matches
|
||
Autor: Damir Radulić / dradulic@outlook.com
|
||
Datum: 2026-05-15 (robustna verzija)
|
||
"""
|
||
|
||
import os, re, sys, time, logging, json
|
||
from datetime import datetime, timedelta
|
||
import requests
|
||
import psycopg2
|
||
from psycopg2.extras import execute_values
|
||
|
||
# ─── LOG ───────────────────────────────────────────
|
||
# Directory that receives the scraper's log file; created at import time if missing.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)

# Every record goes to two places: a UTF-8 log file and stdout.
_file_handler = logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8")
_stdout_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[_file_handler, _stdout_handler],
)

# Logger used by the rest of this script.
log = logging.getLogger("hns_full")
|
||
|
||
# ─── DB CONN ────────────────────────────────────────
|
||
from dotenv import load_dotenv

# Load settings from the master env file into os.environ
# (presumably this is where DB_PASSWORD comes from -- verify on the host).
load_dotenv('/opt/rinet-gpu/.env.master')


def _quote_conninfo(value):
    """Quote *value* for use in a libpq ``keyword=value`` connection string.

    libpq requires values that are empty or contain spaces to be wrapped in
    single quotes, with embedded backslashes and single quotes escaped by a
    backslash. Quoting unconditionally is always valid.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Connection string for the target database. Raises KeyError at import time
# when DB_PASSWORD is not set in the environment.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_quote_conninfo(os.environ['DB_PASSWORD'])}"
)


def get_conn():
    """Open a new database connection in autocommit mode.

    Returns:
        A psycopg2 connection created from ``DSN`` with ``autocommit`` enabled,
        so every statement is committed as soon as it executes.
    """
    c = psycopg2.connect(DSN)
    c.autocommit = True
    return c
|
||
|
||
# ─── HTTP FETCH ─────────────────────────────────────
|
||
# User-Agent header sent with every request; names the bot and a contact address.
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot; contact dradulic@outlook.com)"


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` when the server answers 404 or when
        every attempt fails.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                # A missing page is a definitive answer; retrying cannot help.
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            log.warning("fetch failed (%d/%d) %s: %s", attempt, retries, url, exc)
            # Linear back-off between attempts; no point sleeping after the last one.
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None
|
||
|
||
# ─── PARSIRANJE ─────────────────────────────────────
|
||
def parse_roster(html, klub_hns_id):
    """Extract the players linked from a club roster page.

    Args:
        html: HTML source of the roster page.
        klub_hns_id: HNS id of the club. Kept for interface compatibility;
            the parser does not use it.

    Returns:
        A list of ``(hns_igrac_id, ime, prezime, url)`` tuples in page order,
        at most one per player id. ``ime`` is the first whitespace-separated
        token of the link text and ``prezime`` is the remainder.
    """
    igraci = []
    # A player is often linked more than once (e.g. photo + name). Returning
    # the same id twice would make the batched ON CONFLICT DO UPDATE upsert fail.
    seen_ids = set()
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen_ids:
            continue
        # Strip nested tags and collapse whitespace to get the visible link text.
        raw_name = re.sub(r'<[^>]+>', ' ', m.group(3)).strip()
        raw_name = re.sub(r'\s+', ' ', raw_name)
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        prezime = prezime.strip()
        # Links without a plausible surname are treated as non-player links.
        if len(prezime) < 2:
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
|
||
|
||
def parse_player_seasons(html, hns_igrac_id):
    """Parse the per-season statistics table of a player page.

    The table is located by class name ("playerSeason", then "career"); if
    neither exists the first table on the page is used.
    NOTE(review): that fallback may pick an unrelated table -- rows are only
    kept when they carry a season label and a club link.

    Expected column order: season (e.g. "2025/26"), club (link), competition,
    appearances, goals, assists, yellow cards, red cards, minutes.

    Args:
        html: HTML source of the player page.
        hns_igrac_id: HNS player id copied into every returned record.

    Returns:
        A list of dicts with keys ``hns_igrac_id``, ``sezona``, ``klub_hns_id``
        (as str), ``klub_naziv``, ``natjecanje``, ``nastupi``, ``golovi``,
        ``asistencije``, ``zuti``, ``crveni`` and ``minute``. Statistics that
        are missing or not numeric are reported as 0.
    """
    def text_of(fragment):
        """Drop tags from an HTML fragment and trim surrounding whitespace."""
        return re.sub(r'<[^>]+>', '', fragment).strip()

    table_patterns = (
        r'<table[^>]*class="[^"]*playerSeason[^"]*"[^>]*>(.*?)</table>',
        r'<table[^>]*class="[^"]*career[^"]*"[^>]*>(.*?)</table>',
        r'<table[^>]*>(.*?)</table>',
    )
    table_match = None
    for pattern in table_patterns:
        table_match = re.search(pattern, html, re.DOTALL)
        if table_match:
            break
    if not table_match:
        return []

    # Statistic stored in cells 3..8, in this order.
    stat_names = ("nastupi", "golovi", "asistencije", "zuti", "crveni", "minute")

    seasons = []
    for row in re.finditer(r'<tr[^>]*>(.*?)</tr>', table_match.group(1), re.DOTALL):
        row_html = row.group(1)
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_html, re.DOTALL)
        if len(cells) < 3:
            continue

        # A usable row starts with a season label ...
        sezona = text_of(cells[0])
        if not re.match(r'\d{4}/\d{2,4}', sezona):
            continue

        # ... and links to a club with a non-zero id.
        club_link = re.search(r'<a[^>]*href="(/klubovi/(\d+)/[^"]*)"[^>]*>(.*?)</a>', row_html, re.DOTALL)
        if not club_link:
            continue
        klub_hns_id = int(club_link.group(2))
        if not klub_hns_id:
            continue

        stats = dict.fromkeys(stat_names, 0)
        for name, cell in zip(stat_names, cells[3:9]):
            text = text_of(cell)
            # isdecimal(), not isdigit(): the latter also accepts characters
            # such as superscript digits that int() cannot convert.
            if text.isdecimal():
                stats[name] = int(text)

        seasons.append({
            "hns_igrac_id": hns_igrac_id,
            "sezona": sezona,
            "klub_hns_id": str(klub_hns_id),
            "klub_naziv": text_of(club_link.group(3)),
            "natjecanje": text_of(cells[2]),
            **stats,
        })
    return seasons
|
||
|
||
def parse_player_matches(html, hns_igrac_id):
    """Parse the recent-matches table of a player page.

    The table is located by a class name containing "match"; if none exists
    the first table on the page is used. Because that fallback can land on the
    season-statistics table, rows whose first cell is a season label such as
    "2025/26" are rejected instead of being reported as matches.

    Expected column order: date, home team, away team, result, then at least
    one further column (rows with fewer than five cells are ignored).

    Args:
        html: HTML source of the player page.
        hns_igrac_id: HNS player id copied into every returned record.

    Returns:
        A list of dicts with keys ``hns_igrac_id``, ``datum``, ``domacin``,
        ``gost`` and ``rezultat``; all values except the id are the cell text
        as found on the page.
    """
    table_match = re.search(r'<table[^>]*class="[^"]*match[^"]*"[^>]*>(.*?)</table>', html, re.DOTALL)
    if not table_match:
        table_match = re.search(r'<table[^>]*>(.*?)</table>', html, re.DOTALL)
    if not table_match:
        return []

    matches = []
    for row in re.finditer(r'<tr[^>]*>(.*?)</tr>', table_match.group(1), re.DOTALL):
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row.group(1), re.DOTALL)
        if len(cells) < 5:
            continue
        datum, domacin, gost, rezultat = (
            re.sub(r'<[^>]+>', '', cell).strip() for cell in cells[:4]
        )
        if not datum or not domacin:
            continue
        # A season label in the date column means this is a season-statistics
        # row picked up by the table fallback, not a match.
        if re.fullmatch(r'\d{4}/\d{2,4}', datum):
            continue
        matches.append({
            "hns_igrac_id": hns_igrac_id,
            "datum": datum,
            "domacin": domacin,
            "gost": gost,
            "rezultat": rezultat,
        })
    return matches
|
||
|
||
# ─── UPSERT U BAZU ──────────────────────────────────
|
||
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi``.

    Args:
        conn: Open psycopg2 connection.
        players: Iterable of ``(hns_igrac_id, ime, prezime, source_url,
            klub_hns_id)`` tuples.

    Existing rows (matched on ``hns_igrac_id``) get their name, source URL and
    club id overwritten. Nothing is executed when *players* is empty.
    """
    if not players:
        return
    # PostgreSQL rejects an INSERT ... ON CONFLICT DO UPDATE that would touch
    # the same row twice in one statement, so keep one tuple per player id
    # (the last occurrence wins, as it would with row-by-row upserts).
    unique_players = list({p[0]: p for p in players}.values())
    sql = """
        INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url, klub_hns_id)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
            ime = EXCLUDED.ime,
            prezime = EXCLUDED.prezime,
            source_url = EXCLUDED.source_url,
            klub_hns_id = EXCLUDED.klub_hns_id
    """
    with conn.cursor() as cur:
        execute_values(cur, sql, unique_players)
|
||
|
||
def upsert_seasons(conn, seasons):
    """Insert or update per-season statistics in ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of dicts as produced by ``parse_player_seasons``.

    Rows are matched on ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)``.
    On a match the statistics and club name are overwritten and ``scraped_at``
    is refreshed, so the caller's "scraped within the last 7 days" check sees
    the new scrape. ``source_url`` is always written as an empty string.
    Nothing is executed when *seasons* is empty.
    """
    if not seasons:
        return
    sql = """
        INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
             nastupi, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
            nastupi = EXCLUDED.nastupi,
            golovi = EXCLUDED.golovi,
            asistencije = EXCLUDED.asistencije,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            minute = EXCLUDED.minute,
            klub_naziv = EXCLUDED.klub_naziv,
            scraped_at = NOW()
    """
    # PostgreSQL rejects an ON CONFLICT DO UPDATE statement that would touch
    # the same row twice, so keep one row per conflict key (last one wins).
    rows_by_key = {}
    for s in seasons:
        key = (s["hns_igrac_id"], s["sezona"], s["klub_hns_id"], s["natjecanje"])
        rows_by_key[key] = (
            s["hns_igrac_id"], s["sezona"], s["natjecanje"], s["klub_hns_id"],
            s["klub_naziv"], s["nastupi"], s["golovi"], s["asistencije"],
            s["zuti"], s["crveni"], s["minute"], "",
        )
    with conn.cursor() as cur:
        execute_values(cur, sql, list(rows_by_key.values()), page_size=100)
|
||
|
||
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Args:
        conn: Open psycopg2 connection.
        matches: List of dicts with keys ``hns_igrac_id``, ``datum``,
            ``domacin``, ``gost`` and ``rezultat``.

    Rows that collide on ``(hns_igrac_id, datum, domacin, gost)`` are left
    untouched. Nothing is executed when *matches* is empty.
    """
    if not matches:
        return
    columns = ("hns_igrac_id", "datum", "domacin", "gost", "rezultat")
    rows = [tuple(match[column] for column in columns) for match in matches]
    sql = """
        INSERT INTO pgz_sport.hns_player_matches
            (hns_igrac_id, datum, domacin, gost, rezultat)
        VALUES %s
        ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING
    """
    with conn.cursor() as cur:
        execute_values(cur, sql, rows, page_size=100)
|
||
|
||
# ─── MAIN ───────────────────────────────────────────
|
||
def _scraped_recently(last, max_age=timedelta(days=7)):
    """Return True when timestamp *last* is younger than *max_age*.

    ``None`` (never scraped) counts as not recent.
    """
    if last is None:
        return False
    # Build "now" with the same tzinfo as the stored value so the subtraction
    # works for both naive and timezone-aware timestamps.
    return datetime.now(tz=last.tzinfo) - last < max_age


def _scrape_player(conn, hns_id, url):
    """Fetch one player page and store its seasons and matches.

    Returns:
        ``(seasons_stored, matches_stored)``; ``(0, 0)`` when the page could
        not be fetched.
    """
    html = fetch(url)
    if not html:
        return 0, 0
    seasons = parse_player_seasons(html, hns_id)
    if seasons:
        upsert_seasons(conn, seasons)
    matches = parse_player_matches(html, hns_id)
    if matches:
        upsert_matches(conn, matches)
    time.sleep(0.3)  # be polite to the server
    return len(seasons), len(matches)


def main():
    """Scrape rosters, season statistics and matches for every eligible club.

    Clubs are read from ``pgz_sport.klubovi`` (``savez_id = 10`` with a
    non-null ``hns_klub_id``). For each club the roster is fetched and
    upserted; each player whose newest ``scraped_at`` in
    ``pgz_sport.hns_player_seasons`` is at least 7 days old (or missing) is
    then fetched and stored. The database connection is closed even when the
    run aborts with an exception.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE ===")
    conn = get_conn()
    total_players = 0
    total_seasons = 0
    total_matches = 0

    try:
        with conn.cursor() as cur:
            # 1. Load every club that has an HNS id.
            cur.execute("""
                SELECT id, naziv, hns_klub_id
                FROM pgz_sport.klubovi
                WHERE savez_id = 10 AND hns_klub_id IS NOT NULL
            """)
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, klub_naziv, hns_klub_id in klubovi:
                log.info(f"🏟️ {klub_naziv} (HNS {hns_klub_id})")

                # 2. Roster
                roster_url = f"https://semafor.hns.family/klubovi/{hns_klub_id}/igraci/"
                html = fetch(roster_url)
                if not html:
                    log.warning(f" ⚠️ Nema rostera za {klub_naziv}")
                    continue
                players = parse_roster(html, hns_klub_id)
                if not players:
                    log.warning(" ⚠️ Nema igrača")
                    continue

                # Attach the club id to every player tuple for the upsert.
                players_with_klub = [(p[0], p[1], p[2], p[3], str(hns_klub_id)) for p in players]
                upsert_players(conn, players_with_klub)
                log.info(f" 👥 {len(players)} igrača")

                # 3. Seasons and matches for players not scraped recently.
                for hns_id, _ime, _prezime, url in players:
                    cur.execute("""
                        SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons
                        WHERE hns_igrac_id = %s
                    """, (hns_id,))
                    if _scraped_recently(cur.fetchone()[0]):
                        continue  # skip players with fresh data
                    n_seasons, n_matches = _scrape_player(conn, hns_id, url)
                    total_seasons += n_seasons
                    total_matches += n_matches

                total_players += len(players)
                time.sleep(1)  # short pause between clubs
    finally:
        conn.close()

    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()