#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER - all clubs, players, seasons, matches
─────────────────────────────────────────────────────────────
Writes to:
    pgz_sport.clanovi
    pgz_sport.hns_player_seasons
    pgz_sport.hns_player_matches

Author: Damir Radulić / dradulic@outlook.com
Date:   2026-05-15 (robust version)
"""
import json
import logging
import os
import re
import sys
import time
from datetime import datetime, timedelta

import psycopg2
import requests
from dotenv import load_dotenv
from psycopg2.extras import execute_values

# ─── LOG ───────────────────────────────────────────
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger("hns_full")

# ─── DB CONN ────────────────────────────────────────
load_dotenv('/opt/rinet-gpu/.env.master')
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"


def get_conn():
    """Open a new autocommit psycopg2 connection using the module-level DSN."""
    c = psycopg2.connect(DSN)
    c.autocommit = True
    return c


# ─── HTTP FETCH ─────────────────────────────────────
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot; contact dradulic@outlook.com)"
BASE_URL = "https://semafor.hns.family"


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Returns None when the server answers 404 or when every attempt fails.
    Failed attempts are logged and retried with a linearly growing pause
    (1.5 s, 3 s, ...); there is no pause after the final attempt.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            log.warning("fetch %s failed (attempt %d/%d): %s", url, attempt, retries, exc)
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None


# ─── PARSING ────────────────────────────────────────
# NOTE(review): the HTML tag anchors in these patterns (<a>, <table>, <tr>,
# <td>) are reconstructed; confirm them against the live page markup.
_TAG_RE = re.compile(r'<[^>]+>')
_PLAYER_LINK_RE = re.compile(
    r'<a\b[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', re.DOTALL)
_CLUB_LINK_RE = re.compile(
    r'<a\b[^>]*href="(/klubovi/(\d+)/[^"]*)"[^>]*>(.*?)</a>', re.DOTALL)
_ANY_TABLE_RE = re.compile(r'<table[^>]*>(.*?)</table>', re.DOTALL)
_TR_RE = re.compile(r'<tr[^>]*>(.*?)</tr>', re.DOTALL)
_TD_RE = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
_SEASON_RE = re.compile(r'\d{4}/\d{2,4}')

# Cell index -> statistic name in a season row.
_STAT_COLUMNS = {
    3: "nastupi",
    4: "golovi",
    5: "asistencije",
    6: "zuti",
    7: "crveni",
    8: "minute",
}


def _strip_tags(fragment, repl=''):
    """Replace every HTML tag in *fragment* with *repl* and trim whitespace."""
    return _TAG_RE.sub(repl, fragment).strip()


def _find_table(html, class_hints):
    """Return the inner HTML of the first matching table, or None.

    Tables whose class attribute contains one of *class_hints* are preferred,
    in the order given; if none matches, the first table of any kind is used.
    """
    for hint in class_hints:
        pattern = (r'<table[^>]*class="[^"]*' + re.escape(hint)
                   + r'[^"]*"[^>]*>(.*?)</table>')
        found = re.search(pattern, html, re.DOTALL)
        if found:
            return found.group(1)
    found = _ANY_TABLE_RE.search(html)
    return found.group(1) if found else None


def parse_roster(html, klub_hns_id):
    """Return a list of (hns_igrac_id, ime, prezime, url) from a roster page.

    The first word of the link text is taken as the first name and the rest
    as the surname. Links without a surname of at least two characters are
    skipped, and each player id is returned only once. *klub_hns_id* is
    accepted for interface compatibility and is not used.
    """
    igraci = []
    seen = set()
    for m in _PLAYER_LINK_RE.finditer(html):
        hns_id = int(m.group(2))
        raw_name = re.sub(r'\s+', ' ', _strip_tags(m.group(3), ' '))
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        ime = ime.strip()
        prezime = prezime.strip()
        # Skip administrative links (no usable surname).
        if len(prezime) < 2:
            continue
        # The same player may be linked more than once; duplicates would also
        # break the ON CONFLICT DO UPDATE upsert.
        if hns_id in seen:
            continue
        seen.add(hns_id)
        igraci.append((hns_id, ime, prezime, BASE_URL + m.group(1)))
    return igraci


def parse_player_seasons(html, hns_igrac_id):
    """Return one dict per season row found on a player page.

    Keys: hns_igrac_id, sezona, klub_hns_id (as str), klub_naziv, natjecanje,
    nastupi, golovi, asistencije, zuti, crveni, minute.

    Expected row layout: season (e.g. "2025/26"), club (link), competition,
    appearances, goals, assists, yellow cards, red cards, minutes. A row is
    kept only if its first cell looks like a season and it links to a club.
    """
    table_html = _find_table(html, ("playerSeason", "career"))
    if table_html is None:
        return []

    seasons = []
    for row_html in _TR_RE.findall(table_html):
        cells = _TD_RE.findall(row_html)
        if len(cells) < 3:
            continue

        sezona = _strip_tags(cells[0])
        if not _SEASON_RE.match(sezona):
            continue
        club_link = _CLUB_LINK_RE.search(row_html)
        if not club_link:
            continue
        klub_hns_id = int(club_link.group(2))
        if not klub_hns_id:
            continue

        record = {
            "hns_igrac_id": hns_igrac_id,
            "sezona": sezona,
            "klub_hns_id": str(klub_hns_id),
            "klub_naziv": _strip_tags(club_link.group(3)),
            "natjecanje": _strip_tags(cells[2]),
            "nastupi": 0,
            "golovi": 0,
            "asistencije": 0,
            "zuti": 0,
            "crveni": 0,
            "minute": 0,
        }
        for idx, key in _STAT_COLUMNS.items():
            if idx >= len(cells):
                break
            text = _strip_tags(cells[idx])
            # isdecimal(), unlike isdigit(), only accepts what int() can parse.
            if text.isdecimal():
                record[key] = int(text)
        seasons.append(record)
    return seasons


def parse_player_matches(html, hns_igrac_id):
    """Return one dict per match row found on a player page.

    Keys: hns_igrac_id, datum, domacin, gost, rezultat. Expected row layout:
    date, home team, away team, result, then optional extra columns. Rows
    with fewer than five cells, or without a date and home team, are skipped.
    """
    table_html = _find_table(html, ("match",))
    if table_html is None:
        return []

    matches = []
    for row_html in _TR_RE.findall(table_html):
        cells = _TD_RE.findall(row_html)
        if len(cells) < 5:
            continue
        datum, domacin, gost, rezultat = (_strip_tags(c) for c in cells[:4])
        if datum and domacin:
            matches.append({
                "hns_igrac_id": hns_igrac_id,
                "datum": datum,
                "domacin": domacin,
                "gost": gost,
                "rezultat": rezultat,
            })
    return matches


# ─── DB UPSERTS ─────────────────────────────────────
def upsert_players(conn, players):
    """Insert or update players in pgz_sport.clanovi.

    *players* is a sequence of
    (hns_igrac_id, ime, prezime, source_url, klub_hns_id) tuples.
    """
    if not players:
        return
    # PostgreSQL rejects an ON CONFLICT DO UPDATE statement that touches the
    # same key twice, so keep only the last tuple per player id.
    unique = list({p[0]: p for p in players}.values())
    sql = """
        INSERT INTO pgz_sport.clanovi
            (hns_igrac_id, ime, prezime, source_url, klub_hns_id)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
            ime = EXCLUDED.ime,
            prezime = EXCLUDED.prezime,
            source_url = EXCLUDED.source_url,
            klub_hns_id = EXCLUDED.klub_hns_id
    """
    with conn.cursor() as cur:
        execute_values(cur, sql, unique)


def upsert_seasons(conn, seasons):
    """Insert or update season rows in pgz_sport.hns_player_seasons.

    *seasons* is a list of dicts as produced by parse_player_seasons().
    """
    if not seasons:
        return
    # scraped_at is refreshed on update because main() reads MAX(scraped_at)
    # to decide whether a player is fresh.
    # NOTE(review): assumes scraped_at is a timestamp column - confirm.
    sql = """
        INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
             nastupi, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
            nastupi = EXCLUDED.nastupi,
            golovi = EXCLUDED.golovi,
            asistencije = EXCLUDED.asistencije,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            minute = EXCLUDED.minute,
            klub_naziv = EXCLUDED.klub_naziv,
            scraped_at = NOW()
    """
    # De-duplicate on the conflict key (last row wins); see upsert_players.
    by_key = {}
    for s in seasons:
        key = (s["hns_igrac_id"], s["sezona"], s["klub_hns_id"], s["natjecanje"])
        by_key[key] = (
            s["hns_igrac_id"], s["sezona"], s["natjecanje"], s["klub_hns_id"],
            s["klub_naziv"], s["nastupi"], s["golovi"], s["asistencije"],
            s["zuti"], s["crveni"], s["minute"], "",
        )
    with conn.cursor() as cur:
        execute_values(cur, sql, list(by_key.values()), page_size=100)


def upsert_matches(conn, matches):
    """Insert match rows into pgz_sport.hns_player_matches, ignoring duplicates.

    *matches* is a list of dicts as produced by parse_player_matches().
    """
    if not matches:
        return
    sql = """
        INSERT INTO pgz_sport.hns_player_matches
            (hns_igrac_id, datum, domacin, gost, rezultat)
        VALUES %s
        ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING
    """
    vals = [
        (m["hns_igrac_id"], m["datum"], m["domacin"], m["gost"], m["rezultat"])
        for m in matches
    ]
    with conn.cursor() as cur:
        execute_values(cur, sql, vals, page_size=100)


# ─── MAIN ───────────────────────────────────────────
REFRESH_AFTER = timedelta(days=7)


def _is_fresh(last):
    """Return True if timestamp *last* is younger than REFRESH_AFTER."""
    if not last:
        return False
    # Match the awareness of the stored value: subtracting a naive datetime
    # from an aware one (or vice versa) raises TypeError.
    return (datetime.now(last.tzinfo) - last) < REFRESH_AFTER


def _scrape_player(conn, cur, hns_id, url):
    """Scrape one player's seasons and matches.

    Returns (season_count, match_count) for the rows handed to the upserts,
    or None if the player was skipped (recently scraped or fetch failed).
    """
    cur.execute("""
        SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons
        WHERE hns_igrac_id = %s
    """, (hns_id,))
    if _is_fresh(cur.fetchone()[0]):
        return None  # skip recently scraped players

    html = fetch(url)
    if not html:
        return None

    seasons = parse_player_seasons(html, hns_id)
    if seasons:
        upsert_seasons(conn, seasons)
    matches = parse_player_matches(html, hns_id)
    if matches:
        upsert_matches(conn, matches)
    return len(seasons), len(matches)


def main():
    """Scrape rosters, seasons and matches for every PGŽ club with an HNS id."""
    log.info("=== START FULL PGŽ HNS SCRAPE ===")
    total_players = 0
    total_seasons = 0
    total_matches = 0

    conn = get_conn()
    try:
        with conn.cursor() as cur:
            # 1. Fetch all PGŽ clubs that have an hns_klub_id.
            cur.execute("""
                SELECT id, naziv, hns_klub_id
                FROM pgz_sport.klubovi
                WHERE savez_id = 10 AND hns_klub_id IS NOT NULL
            """)
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, klub_naziv, hns_klub_id in klubovi:
                log.info(f"🏟️ {klub_naziv} (HNS {hns_klub_id})")

                # 2. Roster
                roster_url = f"{BASE_URL}/klubovi/{hns_klub_id}/igraci/"
                html = fetch(roster_url)
                if not html:
                    log.warning(f" ⚠️ Nema rostera za {klub_naziv}")
                    continue

                players = parse_roster(html, hns_klub_id)
                if not players:
                    log.warning(f" ⚠️ Nema igrača")
                    continue
                # Append the club id to each player tuple for the upsert.
                upsert_players(conn, [(*p, str(hns_klub_id)) for p in players])
                log.info(f" 👥 {len(players)} igrača")

                # 3. Seasons and matches for each player not scraped recently.
                for hns_id, _ime, _prezime, url in players:
                    counts = _scrape_player(conn, cur, hns_id, url)
                    if counts is None:
                        continue
                    total_seasons += counts[0]
                    total_matches += counts[1]
                    time.sleep(0.3)  # be polite to the server

                total_players += len(players)
                time.sleep(1)  # short pause between clubs
    finally:
        conn.close()

    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()