Files
pgz-sport/scripts/hns_pgz_full.py
T

300 lines
12 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER — all clubs, players, seasons, matches
─────────────────────────────────────────────────────────────
Writes to:
  pgz_sport.clanovi
  pgz_sport.hns_player_seasons
  pgz_sport.hns_player_matches
Author: Damir Radulić / dradulic@outlook.com
Date: 2026-05-15 (robust version)
"""
import os, re, sys, time, logging, json
from datetime import datetime, timedelta
import requests
import psycopg2
from psycopg2.extras import execute_values
# ─── LOG ───────────────────────────────────────────
# Directory that receives the scraper's log file; created at import time.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)

# Every record goes both to a UTF-8 log file and to stdout.
_file_handler = logging.FileHandler(
    os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"
)
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_file_handler, _stdout_handler],
    format="%(asctime)s [FULL] %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("hns_full")
# ─── DB CONN ────────────────────────────────────────
from dotenv import load_dotenv

# Load DB_PASSWORD (and any other secrets) before the DSN is built.
load_dotenv('/opt/rinet-gpu/.env.master')


def _quote_conninfo(value):
    """Quote *value* for use in a libpq ``key=value`` connection string.

    libpq requires values containing spaces (or empty values) to be wrapped
    in single quotes, with backslashes and single quotes backslash-escaped.
    Quoting unconditionally is valid for simple values as well.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Raises KeyError at import time when DB_PASSWORD is not set.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_quote_conninfo(os.environ['DB_PASSWORD'])}"
)
def get_conn():
    """Open a new PostgreSQL connection to ``DSN`` with autocommit enabled."""
    connection = psycopg2.connect(DSN)
    # With autocommit on, each statement is committed as soon as it runs.
    connection.autocommit = True
    return connection
# ─── HTTP FETCH ─────────────────────────────────────
# Header values are sent latin-1 encoded, so the User-Agent must stay within
# that range; a non-latin-1 character (the former "Ž") makes every request
# raise UnicodeEncodeError before it is sent.
UA = "Mozilla/5.0 (Ri.NET PGZ Sport Bot; contact dradulic@outlook.com)"


def fetch(url, retries=3):
    """Download *url* with a GET request and return the body as text.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response text, or ``None`` when the server answers 404 or every
        attempt fails with a ``requests`` error.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                # A missing page is a definitive answer; do not retry.
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            # Log instead of swallowing so systematic failures are visible.
            log.warning(
                "Dohvat %s nije uspio (pokušaj %d/%d): %s",
                url, attempt, retries, exc,
            )
            if attempt < retries:
                # Linear back-off; sleeping after the last attempt is wasted.
                time.sleep(1.5 * attempt)
    return None
# ─── PARSING ────────────────────────────────────────
# Anchor pointing at a player page: group 1 = path, 2 = player id, 3 = inner HTML.
_ROSTER_LINK_RE = re.compile(
    r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', re.DOTALL
)


def parse_roster(html, klub_hns_id):
    """Extract the players linked from a club roster page.

    Args:
        html: Raw HTML of the roster page.
        klub_hns_id: HNS id of the club. Kept for interface compatibility;
            it is not used while parsing.

    Returns:
        A list of ``(hns_igrac_id, ime, prezime, url)`` tuples in page order,
        with at most one entry per player id.
    """
    # Local import: the ``html`` parameter shadows the module name here.
    from html import unescape

    igraci = []
    seen_ids = set()
    for link in _ROSTER_LINK_RE.finditer(html):
        hns_id = int(link.group(2))
        if hns_id in seen_ids:
            # The same player linked twice would put duplicate keys into one
            # ON CONFLICT DO UPDATE batch, which PostgreSQL rejects.
            continue
        # Strip tags, decode entities (e.g. &#263;), collapse whitespace.
        text = unescape(re.sub(r'<[^>]+>', ' ', link.group(3)))
        full_name = ' '.join(text.split())
        ime, _, prezime = full_name.partition(' ')
        # Links without a plausible surname are treated as administrative
        # links and skipped.
        if len(prezime) < 2:
            continue
        seen_ids.add(hns_id)
        igraci.append(
            (hns_id, ime, prezime, "https://semafor.hns.family" + link.group(1))
        )
    return igraci
# Table lookups in priority order; the last one is a catch-all fallback.
_SEASON_TABLE_PATTERNS = (
    r'<table[^>]*class="[^"]*playerSeason[^"]*"[^>]*>(.*?)</table>',
    r'<table[^>]*class="[^"]*career[^"]*"[^>]*>(.*?)</table>',
    r'<table[^>]*>(.*?)</table>',
)

# Cell index -> statistic stored from that cell.
_SEASON_STAT_COLUMNS = {
    3: "nastupi",
    4: "golovi",
    5: "asistencije",
    6: "zuti",
    7: "crveni",
    8: "minute",
}


def _season_cell_text(fragment):
    """Return *fragment* without tags, with entities decoded and ends trimmed."""
    from html import unescape

    return unescape(re.sub(r'<[^>]+>', '', fragment)).strip()


def parse_player_seasons(html, hns_igrac_id):
    """Parse the per-season statistics table of a player page.

    A row is kept only when its first cell starts with a season label such as
    ``2025/26`` and the row contains a link to a club page. The third cell is
    taken as the competition and cells 3-8 as the numeric statistics;
    non-numeric or missing statistic cells count as 0.

    Args:
        html: Raw HTML of the player page.
        hns_igrac_id: HNS player id copied into every returned record.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``sezona``,
        ``klub_hns_id`` (as a string), ``klub_naziv``, ``natjecanje``,
        ``nastupi``, ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and
        ``minute``. Empty when no table is found.
    """
    table = None
    for pattern in _SEASON_TABLE_PATTERNS:
        table = re.search(pattern, html, re.DOTALL)
        if table:
            break
    if not table:
        return []

    seasons = []
    for row in re.finditer(r'<tr[^>]*>(.*?)</tr>', table.group(1), re.DOTALL):
        row_html = row.group(1)
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_html, re.DOTALL)
        if len(cells) < 3:
            continue

        sezona = _season_cell_text(cells[0])
        if not re.match(r'\d{4}/\d{2,4}', sezona):
            continue

        club = re.search(
            r'<a[^>]*href="(/klubovi/(\d+)/[^"]*)"[^>]*>(.*?)</a>',
            row_html,
            re.DOTALL,
        )
        if not club:
            continue
        klub_hns_id = int(club.group(2))
        if not klub_hns_id:
            # A zero club id is treated the same as a missing club.
            continue

        stats = dict.fromkeys(_SEASON_STAT_COLUMNS.values(), 0)
        for index, field in _SEASON_STAT_COLUMNS.items():
            if index >= len(cells):
                continue
            text = _season_cell_text(cells[index])
            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can parse, so e.g. a superscript digit cannot raise.
            if text.isdecimal():
                stats[field] = int(text)

        seasons.append({
            "hns_igrac_id": hns_igrac_id,
            "sezona": sezona,
            "klub_hns_id": str(klub_hns_id),
            "klub_naziv": _season_cell_text(club.group(3)),
            "natjecanje": _season_cell_text(cells[2]),
            **stats,
        })
    return seasons
# Numeric date shapes accepted in the first cell of a match row, e.g.
# "15.05.2026.", "15/5/26" or "2026-05-15".
# NOTE(review): assumes the site prints numeric dates — confirm against a
# real player page.
_MATCH_DATE_RE = re.compile(
    r'\d{1,2}[./-]\s?\d{1,2}[./-]\s?\d{2,4}|\d{4}[./-]\d{1,2}[./-]\d{1,2}'
)


def parse_player_matches(html, hns_igrac_id):
    """Parse the match table of a player page.

    Rows need at least five cells; the first four are read as date, home
    team, away team and result. A row is kept only when the home team is
    non-empty and the date cell contains something shaped like a date. The
    date check matters because the lookup falls back to the first table on
    the page, which may be a season table whose first cell is a label such
    as ``2025/26``.

    Args:
        html: Raw HTML of the player page.
        hns_igrac_id: HNS player id copied into every returned record.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``datum``,
        ``domacin``, ``gost`` and ``rezultat`` (all cell values as text).
    """
    # Local import: the ``html`` parameter shadows the module name here.
    from html import unescape

    def cell_text(fragment):
        return unescape(re.sub(r'<[^>]+>', '', fragment)).strip()

    table = re.search(
        r'<table[^>]*class="[^"]*match[^"]*"[^>]*>(.*?)</table>', html, re.DOTALL
    ) or re.search(r'<table[^>]*>(.*?)</table>', html, re.DOTALL)
    if not table:
        return []

    matches = []
    for row in re.finditer(r'<tr[^>]*>(.*?)</tr>', table.group(1), re.DOTALL):
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row.group(1), re.DOTALL)
        if len(cells) < 5:
            continue
        datum, domacin, gost, rezultat = (cell_text(c) for c in cells[:4])
        if not domacin or not _MATCH_DATE_RE.search(datum):
            continue
        matches.append({
            "hns_igrac_id": hns_igrac_id,
            "datum": datum,
            "domacin": domacin,
            "gost": gost,
            "rezultat": rezultat,
        })
    return matches
# ─── DB UPSERT ──────────────────────────────────────
def upsert_players(conn, players):
    """Insert or update player rows in ``pgz_sport.clanovi``.

    Args:
        conn: Open psycopg2 connection.
        players: Sequence of ``(hns_igrac_id, ime, prezime, source_url,
            klub_hns_id)`` tuples.

    Does nothing for an empty sequence. When the same ``hns_igrac_id``
    appears more than once, only the last tuple is sent: PostgreSQL rejects
    an ``ON CONFLICT DO UPDATE`` statement that would touch the same row
    twice.
    """
    if not players:
        return
    unique_players = list({p[0]: p for p in players}.values())
    sql = """
        INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url, klub_hns_id)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
            ime = EXCLUDED.ime,
            prezime = EXCLUDED.prezime,
            source_url = EXCLUDED.source_url,
            klub_hns_id = EXCLUDED.klub_hns_id
    """
    with conn.cursor() as cur:
        execute_values(cur, sql, unique_players)
def upsert_seasons(conn, seasons):
    """Insert or update season rows in ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of dicts with the keys ``hns_igrac_id``, ``sezona``,
            ``natjecanje``, ``klub_hns_id``, ``klub_naziv``, ``nastupi``,
            ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and ``minute``.

    Does nothing for an empty list. Records sharing the conflict key
    ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)`` are collapsed to the
    last one, because PostgreSQL rejects an ``ON CONFLICT DO UPDATE``
    statement that would touch the same row twice. ``source_url`` is stored
    as an empty string.
    """
    if not seasons:
        return
    # scraped_at is refreshed on update so that a freshness check based on
    # MAX(scraped_at) sees re-scraped players as recent.
    # NOTE(review): assumes scraped_at is a timestamp column — confirm
    # against the table definition.
    sql = """
        INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
             nastupi, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
            nastupi = EXCLUDED.nastupi,
            golovi = EXCLUDED.golovi,
            asistencije = EXCLUDED.asistencije,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            minute = EXCLUDED.minute,
            klub_naziv = EXCLUDED.klub_naziv,
            scraped_at = NOW()
    """
    rows_by_key = {}
    for s in seasons:
        key = (s["hns_igrac_id"], s["sezona"], s["klub_hns_id"], s["natjecanje"])
        rows_by_key[key] = (
            s["hns_igrac_id"], s["sezona"], s["natjecanje"], s["klub_hns_id"],
            s["klub_naziv"], s["nastupi"], s["golovi"], s["asistencije"],
            s["zuti"], s["crveni"], s["minute"], "",
        )
    with conn.cursor() as cur:
        execute_values(cur, sql, list(rows_by_key.values()), page_size=100)
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Rows that collide on ``(hns_igrac_id, datum, domacin, gost)`` are left
    untouched. An empty ``matches`` list is a no-op.
    """
    if matches:
        # Runtime value is the same SQL text, assembled line by line.
        sql = (
            "\n"
            "INSERT INTO pgz_sport.hns_player_matches\n"
            "(hns_igrac_id, datum, domacin, gost, rezultat)\n"
            "VALUES %s\n"
            "ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING\n"
        )
        columns = ("hns_igrac_id", "datum", "domacin", "gost", "rezultat")
        rows = []
        for match in matches:
            rows.append(tuple(match[column] for column in columns))
        with conn.cursor() as cursor:
            execute_values(cursor, sql, rows, page_size=100)
# ─── MAIN ───────────────────────────────────────────
def _scraped_recently(last_scraped, max_age):
    """Return True when *last_scraped* is set and younger than *max_age*."""
    if last_scraped is None:
        return False
    # Build "now" with the same tzinfo as the stored value so that both
    # naive and timezone-aware timestamps can be subtracted without TypeError.
    now = datetime.now(tz=last_scraped.tzinfo)
    return (now - last_scraped) < max_age


def main():
    """Scrape rosters, seasons and matches for every PGŽ club.

    For each club row selected from ``pgz_sport.klubovi`` the roster page is
    fetched and its players upserted. Each player whose newest
    ``scraped_at`` in ``pgz_sport.hns_player_seasons`` is missing or at
    least 7 days old then has their page fetched and its seasons and matches
    upserted. The connection is closed even when an error interrupts the run.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE ===")
    conn = get_conn()
    total_players = 0
    total_seasons = 0
    total_matches = 0
    max_age = timedelta(days=7)
    try:
        with conn.cursor() as cur:
            # 1. Fetch all PGŽ clubs that have an HNS club id.
            cur.execute("""
                SELECT id, naziv, hns_klub_id
                FROM pgz_sport.klubovi
                WHERE savez_id = 10 AND hns_klub_id IS NOT NULL
            """)
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, klub_naziv, hns_klub_id in klubovi:
                log.info(f"🏟️ {klub_naziv} (HNS {hns_klub_id})")

                # 2. Roster
                roster_url = f"https://semafor.hns.family/klubovi/{hns_klub_id}/igraci/"
                html = fetch(roster_url)
                if not html:
                    log.warning(f" ⚠️ Nema rostera za {klub_naziv}")
                    continue
                players = parse_roster(html, hns_klub_id)
                if not players:
                    log.warning(f" ⚠️ Nema igrača")
                    continue
                # Append the club id to each player tuple for the upsert.
                players_with_klub = [(*p, str(hns_klub_id)) for p in players]
                upsert_players(conn, players_with_klub)
                log.info(f" 👥 {len(players)} igrača")

                # 3. Per player: fetch seasons and matches unless done recently.
                for hns_id, _ime, _prezime, url in players:
                    cur.execute("""
                        SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons
                        WHERE hns_igrac_id = %s
                    """, (hns_id,))
                    last = cur.fetchone()[0]
                    if _scraped_recently(last, max_age):
                        continue  # skip freshly scraped players
                    html = fetch(url)
                    if not html:
                        log.warning(f" ⚠️ Nema stranice igrača {url}")
                        continue
                    seasons = parse_player_seasons(html, hns_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)
                    matches = parse_player_matches(html, hns_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)
                    time.sleep(0.3)  # be polite to the server

                total_players += len(players)
                time.sleep(1)  # short pause between clubs
    finally:
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()