#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER – all clubs, players, seasons, matches
─────────────────────────────────────────────────────────────
Writes to:
    pgz_sport.clanovi
    pgz_sport.hns_player_seasons
    pgz_sport.hns_player_matches

Author: Damir Radulić / dradulic@outlook.com
Date:   2026-05-15 (robust version)
"""
import os, re, sys, time, logging, json
from datetime import datetime, timedelta
import requests
import psycopg2
from psycopg2.extras import execute_values
# ─── LOG ───────────────────────────────────────────
# Directory that receives the scraper's log file; created at import time if
# it does not exist yet.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
# Root logging configuration: INFO level, every record tagged "[FULL]", and
# emitted both to LOG_DIR/hns_full.log (UTF-8) and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
        logging.StreamHandler(sys.stdout)
    ]
)
# Named logger used by the rest of this file.
log = logging.getLogger("hns_full")
# ─── DB CONN ────────────────────────────────────────
from dotenv import load_dotenv
from psycopg2.extensions import make_dsn

# Populate os.environ (DB_PASSWORD in particular) from the shared env file.
load_dotenv('/opt/rinet-gpu/.env.master')

# Build the libpq connection string with make_dsn() instead of an f-string:
# make_dsn quotes/escapes every value, so a password containing spaces,
# quotes or backslashes can no longer corrupt the DSN.
# A missing DB_PASSWORD still raises KeyError at import time, as before.
DSN = make_dsn(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ['DB_PASSWORD'],
)
def get_conn():
    """Open and return a new psycopg2 connection built from ``DSN``.

    Autocommit is switched on, so each statement executed through the
    returned connection is committed immediately.
    """
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
# ─── HTTP FETCH ─────────────────────────────────────
# User-Agent sent with every request so the site operator can identify the bot.
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot; contact dradulic@outlook.com)"


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts.

    Returns:
        The decoded response text, or ``None`` when the server answers 404
        or when every attempt fails.

    A 404 is treated as a definitive "not there" and is not retried. Any
    other request failure (network error, timeout, non-2xx status) is logged
    and retried after a linearly growing pause (1.5 s, 3 s, ...). No pause is
    taken after the final attempt because nothing follows it.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            log.warning("  ⚠️ Dohvat neuspješan (%d/%d) %s: %s", attempt, retries, url, e)
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None
# ─── PARSIRANJE ─────────────────────────────────────
def parse_roster(html, klub_hns_id):
    """Extract the players linked from a club roster page.

    Args:
        html: HTML source of the roster page.
        klub_hns_id: HNS id of the club; accepted for interface
            compatibility, not used by the parser.

    Returns:
        A list of ``(hns_igrac_id, ime, prezime, url)`` tuples in page order,
        at most one per player id.

    Every ``<a href="/igraci/<id>/...">`` anchor is considered. The anchor
    text is stripped of tags, HTML entities are decoded, and the first word
    becomes ``ime`` while the remainder becomes ``prezime``. Anchors without
    text (e.g. image links) or without a surname of at least two characters
    are skipped. A player id is emitted only once, because the same id twice
    in one batched ``INSERT ... ON CONFLICT DO UPDATE`` is rejected by
    PostgreSQL.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    link_pattern = r'<a\b[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>'
    igraci = []
    seen_ids = set()
    for m in re.finditer(link_pattern, html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen_ids:
            continue
        # Replace tags with a space so "<b>Ivan</b><b>Horvat</b>" keeps a gap.
        raw_name = unescape(re.sub(r'<[^>]+>', ' ', m.group(3)))
        raw_name = re.sub(r'\s+', ' ', raw_name).strip()
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        prezime = prezime.strip()
        # Skip administrative links (single-word or near-empty labels).
        if len(prezime) < 2:
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
def parse_player_seasons(html, hns_igrac_id):
    """Parse the per-season statistics table from a player page.

    Args:
        html: HTML source of the player page.
        hns_igrac_id: HNS id of the player; copied into every result row.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``sezona``,
        ``klub_hns_id`` (as a string), ``klub_naziv``, ``natjecanje``,
        ``nastupi``, ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and
        ``minute``. Empty when no usable table or row is found.

    Table lookup order: a ``<table>`` whose class contains ``playerSeason``,
    then one whose class contains ``career``, then the first table on the
    page. A row is kept only if it has at least three ``<td>`` cells, its
    first cell starts with a season label such as ``2025/26`` and it contains
    a ``/klubovi/<id>/`` link. Statistics are read positionally from cells
    3-8; a cell that is not a plain decimal number counts as 0.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    def cell_text(fragment):
        """Return *fragment* with tags removed and entities decoded."""
        return unescape(re.sub(r'<[^>]+>', '', fragment)).strip()

    table_patterns = (
        r'<table\b[^>]*class="[^"]*playerSeason[^"]*"[^>]*>(.*?)</table>',
        r'<table\b[^>]*class="[^"]*career[^"]*"[^>]*>(.*?)</table>',
        r'<table\b[^>]*>(.*?)</table>',  # fallback: any table
    )
    table_html = None
    for pattern in table_patterns:
        table_match = re.search(pattern, html, re.DOTALL)
        if table_match:
            table_html = table_match.group(1)
            break
    if table_html is None:
        return []

    # Expected layout: season, club (link), competition, then the statistics.
    stat_columns = (
        (3, "nastupi"),
        (4, "golovi"),
        (5, "asistencije"),
        (6, "zuti"),
        (7, "crveni"),
        (8, "minute"),
    )
    club_pattern = r'<a\b[^>]*href="(/klubovi/(\d+)/[^"]*)"[^>]*>(.*?)</a>'

    seasons = []
    for row in re.finditer(r'<tr\b[^>]*>(.*?)</tr>', table_html, re.DOTALL):
        row_html = row.group(1)
        cells = re.findall(r'<td\b[^>]*>(.*?)</td>', row_html, re.DOTALL)
        if len(cells) < 3:
            continue  # header rows (<th>) and spacer rows

        sezona = cell_text(cells[0])
        if not re.match(r'\d{4}/\d{2,4}', sezona):
            continue

        club_link = re.search(club_pattern, row_html, re.DOTALL)
        if not club_link:
            continue
        klub_hns_id = int(club_link.group(2))
        if not klub_hns_id:
            continue  # id 0 is treated as "no club"

        entry = {
            "hns_igrac_id": hns_igrac_id,
            "sezona": sezona,
            "klub_hns_id": str(klub_hns_id),
            "klub_naziv": cell_text(club_link.group(3)),
            # With a club present the competition sits in the third cell.
            "natjecanje": cell_text(cells[2]),
        }
        for index, key in stat_columns:
            text = cell_text(cells[index]) if index < len(cells) else ""
            # isdecimal() (not isdigit()) so int() can never raise on e.g. "²".
            entry[key] = int(text) if text.isdecimal() else 0
        seasons.append(entry)
    return seasons
def parse_player_matches(html, hns_igrac_id):
    """Parse the recent-matches table from a player page.

    Args:
        html: HTML source of the player page.
        hns_igrac_id: HNS id of the player; copied into every result row.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``datum``,
        ``domacin``, ``gost`` and ``rezultat`` (all text exactly as shown on
        the page, tags removed and entities decoded).

    The first ``<table>`` whose class contains ``match`` is used, falling
    back to the first table on the page. Rows need at least five ``<td>``
    cells; the first four are read as date, home team, away team and result.
    Rows whose first cell is a season label (``2025/26``) are skipped: when
    the fallback picks up the career table instead, those rows are season
    statistics, not matches.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    def cell_text(fragment):
        """Return *fragment* with tags removed and entities decoded."""
        return unescape(re.sub(r'<[^>]+>', '', fragment)).strip()

    table_match = re.search(
        r'<table\b[^>]*class="[^"]*match[^"]*"[^>]*>(.*?)</table>', html, re.DOTALL
    )
    if not table_match:
        table_match = re.search(r'<table\b[^>]*>(.*?)</table>', html, re.DOTALL)
    if not table_match:
        return []

    matches = []
    for row in re.finditer(r'<tr\b[^>]*>(.*?)</tr>', table_match.group(1), re.DOTALL):
        cells = re.findall(r'<td\b[^>]*>(.*?)</td>', row.group(1), re.DOTALL)
        if len(cells) < 5:
            continue
        # Layout: date, home, away, result, (possibly minutes, goals, ...).
        datum, domacin, gost, rezultat = (cell_text(c) for c in cells[:4])
        if not datum or not domacin:
            continue
        if re.fullmatch(r'\d{4}/\d{2,4}', datum):
            continue  # season row from a career table, not a match
        matches.append({
            "hns_igrac_id": hns_igrac_id,
            "datum": datum,
            "domacin": domacin,
            "gost": gost,
            "rezultat": rezultat,
        })
    return matches
# ─── UPSERT U BAZU ──────────────────────────────────
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi``.

    Args:
        conn: Open psycopg2 connection.
        players: Iterable of ``(hns_igrac_id, ime, prezime, source_url,
            klub_hns_id)`` tuples.

    Rows are keyed on ``hns_igrac_id``; an existing row gets its name,
    source URL and club overwritten. Nothing is executed for empty input.
    Duplicate ids inside one call are collapsed (last one wins), because
    PostgreSQL refuses an ``ON CONFLICT DO UPDATE`` statement that would
    touch the same row twice.
    """
    if not players:
        return
    unique_rows = list({p[0]: tuple(p) for p in players}.values())
    sql = """
        INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url, klub_hns_id)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
            ime = EXCLUDED.ime,
            prezime = EXCLUDED.prezime,
            source_url = EXCLUDED.source_url,
            klub_hns_id = EXCLUDED.klub_hns_id
    """
    with conn.cursor() as cur:
        execute_values(cur, sql, unique_rows)
def upsert_seasons(conn, seasons):
    """Insert or update season rows in ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of dicts with the keys ``hns_igrac_id``, ``sezona``,
            ``natjecanje``, ``klub_hns_id``, ``klub_naziv``, ``nastupi``,
            ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and ``minute``.
            An optional ``source_url`` key is stored when present (empty
            string otherwise).

    Rows are keyed on ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)``.
    Duplicates of that key inside one call are collapsed (last one wins),
    since PostgreSQL rejects an ``ON CONFLICT DO UPDATE`` that hits the same
    row twice. On update ``scraped_at`` is set to ``NOW()`` so callers that
    read ``MAX(scraped_at)`` see when the row was last refreshed.
    """
    if not seasons:
        return
    sql = """
        INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
             nastupi, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
            nastupi = EXCLUDED.nastupi,
            golovi = EXCLUDED.golovi,
            asistencije = EXCLUDED.asistencije,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            minute = EXCLUDED.minute,
            klub_naziv = EXCLUDED.klub_naziv,
            scraped_at = NOW()
    """
    by_key = {}
    for s in seasons:
        key = (s["hns_igrac_id"], s["sezona"], s["klub_hns_id"], s["natjecanje"])
        by_key[key] = (
            s["hns_igrac_id"], s["sezona"], s["natjecanje"], s["klub_hns_id"],
            s["klub_naziv"], s["nastupi"], s["golovi"], s["asistencije"],
            s["zuti"], s["crveni"], s["minute"], s.get("source_url", ""),
        )
    with conn.cursor() as cur:
        execute_values(cur, sql, list(by_key.values()), page_size=100)
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Each item of *matches* is a dict with the keys ``hns_igrac_id``,
    ``datum``, ``domacin``, ``gost`` and ``rezultat``. Rows that collide on
    ``(hns_igrac_id, datum, domacin, gost)`` are left untouched. Nothing is
    executed when *matches* is empty.
    """
    if not matches:
        return
    columns = ("hns_igrac_id", "datum", "domacin", "gost", "rezultat")
    rows = []
    for match in matches:
        rows.append(tuple(match[column] for column in columns))
    statement = """
INSERT INTO pgz_sport.hns_player_matches
(hns_igrac_id, datum, domacin, gost, rezultat)
VALUES %s
ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING
"""
    with conn.cursor() as cursor:
        execute_values(cursor, statement, rows, page_size=100)
# ─── MAIN ───────────────────────────────────────────
def _recently_scraped(cur, hns_id, max_age=timedelta(days=7)):
    """Return True if the player's season rows were scraped within *max_age*."""
    cur.execute("""
        SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons
        WHERE hns_igrac_id = %s
    """, (hns_id,))
    last = cur.fetchone()[0]
    if last is None:
        return False
    # Use the same tz-awareness as the stored value, so both `timestamp` and
    # `timestamptz` columns can be compared without a TypeError.
    return (datetime.now(last.tzinfo) - last) < max_age


def _scrape_player(conn, hns_id, url):
    """Fetch one player page and store its seasons and matches.

    Returns ``(season_count, match_count)``, or ``None`` if the page could
    not be downloaded.
    """
    html = fetch(url)
    if not html:
        return None
    seasons = parse_player_seasons(html, hns_id)
    if seasons:
        upsert_seasons(conn, seasons)
    matches = parse_player_matches(html, hns_id)
    if matches:
        upsert_matches(conn, matches)
    return len(seasons), len(matches)


def main():
    """Scrape every PGŽ club roster and each player's seasons and matches.

    Steps:
      1. Read all clubs with ``savez_id = 10`` and a known ``hns_klub_id``.
      2. Download each club roster and upsert its players.
      3. For every player not scraped within the last 7 days, download the
         player page and upsert the parsed seasons and matches.

    A database error while processing a single player is logged and the run
    continues with the next player. The connection is always closed, even
    when the run aborts with an exception.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE ===")
    conn = get_conn()
    total_players = 0
    total_seasons = 0
    total_matches = 0
    try:
        with conn.cursor() as cur:
            # 1. All PGŽ clubs that have an HNS club id
            cur.execute("""
                SELECT id, naziv, hns_klub_id
                FROM pgz_sport.klubovi
                WHERE savez_id = 10 AND hns_klub_id IS NOT NULL
            """)
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, klub_naziv, hns_klub_id in klubovi:
                log.info(f"🏟️ {klub_naziv} (HNS {hns_klub_id})")

                # 2. Roster
                roster_url = f"https://semafor.hns.family/klubovi/{hns_klub_id}/igraci/"
                html = fetch(roster_url)
                if not html:
                    log.warning(f" ⚠️ Nema rostera za {klub_naziv}")
                    continue
                players = parse_roster(html, hns_klub_id)
                if not players:
                    log.warning(f" ⚠️ Nema igrača")
                    continue
                # Attach the club id to each player tuple for the upsert.
                players_with_klub = [(p[0], p[1], p[2], p[3], str(hns_klub_id)) for p in players]
                upsert_players(conn, players_with_klub)
                log.info(f" 👥 {len(players)} igrača")

                # 3. Seasons and matches for every player that is not fresh
                for hns_id, _ime, _prezime, url in players:
                    try:
                        if _recently_scraped(cur, hns_id):
                            continue
                        counts = _scrape_player(conn, hns_id, url)
                    except psycopg2.Error:
                        # Autocommit is on, so a failed statement leaves the
                        # connection usable; skip this player and carry on.
                        log.exception(" ❌ Greška pri obradi igrača %s", hns_id)
                        continue
                    if counts is None:
                        continue
                    total_seasons += counts[0]
                    total_matches += counts[1]
                    time.sleep(0.3)  # be polite to the server

                total_players += len(players)
                time.sleep(1)  # short pause between clubs
    finally:
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()