#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER v2 – fixed roster URL.

Uses sub1_hns_catalog.json for the exact club URLs.
"""
import json
import logging
import os
import re
import sys
import time
from datetime import datetime, timedelta

import psycopg2
import requests
from dotenv import load_dotenv
from psycopg2.extras import execute_values

# ─── LOG ───────────────────────────────────────────
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger("hns_full")

# ─── DB ────────────────────────────────────────────
load_dotenv('/opt/rinet-gpu/.env.master')
# NOTE(review): the password is interpolated unquoted into a libpq DSN; a
# password containing spaces or quotes would break the connection string.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"


def get_conn():
    """Open a new psycopg2 connection using the module-level DSN."""
    return psycopg2.connect(DSN)


UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"
BASE_URL = "https://semafor.hns.family"
CATALOG_PATH = '/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json'
# A player whose seasons were scraped more recently than this is skipped.
RESCRAPE_AFTER = timedelta(days=7)

# Anchor tags that link to a player profile: group 1 = relative URL,
# group 2 = numeric player id, group 3 = inner HTML of the link.
_ROSTER_LINK_RE = re.compile(
    r'<a\b[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', re.DOTALL
)
# Embedded JSON-LD payload of a player profile page.
_JSON_LD_RE = re.compile(
    r'<script\b[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', re.DOTALL
)
_TAG_RE = re.compile(r'<[^>]+>')
_WS_RE = re.compile(r'\s+')


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Returns None when the server answers 404 or when every one of the
    *retries* attempts fails. Failed attempts are logged and followed by a
    linearly growing pause (1.5 s, 3 s, ...), except after the last attempt.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                # A missing page will not appear on retry.
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            log.warning("fetch failed (%d/%d) %s: %s", attempt, retries, url, exc)
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None


# ─── PARSING ───────────────────────────────────────
def parse_roster(html):
    """Return a list of (hns_igrac_id, ime, prezime, profil_url) tuples.

    Every link to ``/igraci/<id>/...`` in *html* is considered. The link text
    is split on the first space into first name and surname; links without a
    surname of at least two characters are ignored. Each player id is
    returned at most once (first usable link wins), because duplicate ids in
    one batch would break the ON CONFLICT upsert.
    """
    igraci = []
    seen_ids = set()
    for m in _ROSTER_LINK_RE.finditer(html):
        hns_id = int(m.group(2))
        if hns_id in seen_ids:
            continue
        raw_name = _TAG_RE.sub(' ', m.group(3)).strip()
        raw_name = _WS_RE.sub(' ', raw_name)
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        ime = ime.strip()
        prezime = prezime.strip()
        if len(prezime) < 2:
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, BASE_URL + m.group(1)))
    return igraci


def _load_json_ld(html):
    """Return the first JSON-LD object embedded in *html*, or None.

    None is returned when there is no JSON-LD script tag, when its content is
    not valid JSON, or when the decoded value is not a JSON object.
    """
    found = _JSON_LD_RE.search(html)
    if not found:
        return None
    try:
        data = json.loads(found.group(1))
    except ValueError as exc:
        log.warning("invalid JSON-LD payload: %s", exc)
        return None
    return data if isinstance(data, dict) else None


def _to_int(value):
    """Convert a counter to int, treating None and blank strings as 0."""
    if value is None or value == "":
        return 0
    return int(value)


def parse_seasons(html, hns_igrac_id):
    """Return a list of season dicts read from the page's JSON-LD.

    The ``playerSeason`` array of the JSON-LD object is mapped to dicts with
    the keys used by ``upsert_seasons``. An empty list is returned when the
    payload is missing or malformed.
    """
    data = _load_json_ld(html)
    if data is None:
        return []
    try:
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "sezona": s.get("season", ""),
                "klub_hns_id": str(s.get("clubId", "")),
                "klub_naziv": s.get("clubName", ""),
                "natjecanje": s.get("competition", ""),
                "nastupi": _to_int(s.get("apps", 0)),
                "golovi": _to_int(s.get("goals", 0)),
                "asistencije": _to_int(s.get("assists", 0)),
                "zuti": _to_int(s.get("yellow", 0)),
                "crveni": _to_int(s.get("red", 0)),
                "minute": _to_int(s.get("minutes", 0)),
            }
            for s in data.get('playerSeason') or []
        ]
    except (AttributeError, TypeError, ValueError) as exc:
        log.warning("cannot parse seasons of player %s: %s", hns_igrac_id, exc)
        return []


def parse_matches(html, hns_igrac_id):
    """Return a list of match dicts read from the page's JSON-LD.

    The ``playerMatch`` array of the JSON-LD object is mapped to dicts with
    the keys used by ``upsert_matches``. An empty list is returned when the
    payload is missing or malformed.
    """
    data = _load_json_ld(html)
    if data is None:
        return []
    try:
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "datum": m.get("date", ""),
                "domacin": m.get("homeTeam", ""),
                "gost": m.get("awayTeam", ""),
                "rezultat": m.get("result", ""),
            }
            for m in data.get('playerMatch') or []
        ]
    except (AttributeError, TypeError) as exc:
        log.warning("cannot parse matches of player %s: %s", hns_igrac_id, exc)
        return []


# ─── UPSERT ────────────────────────────────────────
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi``.

    *players* is a sequence of (hns_igrac_id, ime, prezime, source_url)
    tuples. Rows sharing an id are collapsed (last one wins) because
    PostgreSQL rejects an ON CONFLICT DO UPDATE statement that touches the
    same row twice.
    """
    if not players:
        return
    unique = list({p[0]: p for p in players}.values())
    sql = """INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)
             VALUES %s
             ON CONFLICT (hns_igrac_id) DO UPDATE SET
                 ime = EXCLUDED.ime,
                 prezime = EXCLUDED.prezime,
                 source_url = EXCLUDED.source_url"""
    with conn.cursor() as cur:
        execute_values(cur, sql, unique)


def upsert_seasons(conn, seasons):
    """Insert or update per-season statistics in ``pgz_sport.hns_player_seasons``.

    *seasons* is a list of dicts as produced by ``parse_seasons``. Entries
    sharing the conflict key (player, season, club, competition) are
    collapsed (last one wins) so a single statement never updates the same
    row twice.
    """
    if not seasons:
        return
    sql = """INSERT INTO pgz_sport.hns_player_seasons
                 (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
                  nastupi, golovi, asistencije, zuti, crveni, minute)
             VALUES %s
             ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
                 nastupi=EXCLUDED.nastupi,
                 golovi=EXCLUDED.golovi,
                 asistencije=EXCLUDED.asistencije,
                 zuti=EXCLUDED.zuti,
                 crveni=EXCLUDED.crveni,
                 minute=EXCLUDED.minute,
                 klub_naziv=EXCLUDED.klub_naziv"""
    by_key = {}
    for s in seasons:
        key = (s['hns_igrac_id'], s['sezona'], s['klub_hns_id'], s['natjecanje'])
        by_key[key] = (
            s['hns_igrac_id'], s['sezona'], s['natjecanje'], s['klub_hns_id'],
            s['klub_naziv'], s['nastupi'], s['golovi'], s['asistencije'],
            s['zuti'], s['crveni'], s['minute'],
        )
    with conn.cursor() as cur:
        execute_values(cur, sql, list(by_key.values()), page_size=50)


def upsert_matches(conn, matches):
    """Insert matches into ``pgz_sport.hns_player_matches``.

    *matches* is a list of dicts as produced by ``parse_matches``. Rows that
    already exist (same player, date, home and away team) are left untouched.
    """
    if not matches:
        return
    sql = """INSERT INTO pgz_sport.hns_player_matches
                 (hns_igrac_id, datum, domacin, gost, rezultat)
             VALUES %s
             ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"""
    vals = [
        (m['hns_igrac_id'], m['datum'], m['domacin'], m['gost'], m['rezultat'])
        for m in matches
    ]
    with conn.cursor() as cur:
        execute_values(cur, sql, vals, page_size=50)


# ─── MAIN ──────────────────────────────────────────
def main():
    """Scrape rosters, seasons and matches for every catalogued club.

    Steps: load the club catalog, read the clubs with ``savez_id = 10`` and a
    HNS id from the database, upsert each club's roster, then fetch every
    player profile whose seasons were not scraped within ``RESCRAPE_AFTER``.
    The database connection is always closed, even when the run fails.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    conn = get_conn()
    try:
        conn.autocommit = True

        # 1. Load the catalog that holds the club URLs.
        with open(CATALOG_PATH, 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        # Keys are normalised to str so an int/str mismatch between the
        # catalog and the database column cannot hide a club.
        klub_url_map = {
            str(item['id']): f"{BASE_URL}/klubovi/{item['id']}/{item['slug']}/"
            for item in catalog
        }
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")

        total_players = total_seasons = total_matches = 0
        with conn.cursor() as cur:
            # 2. Fetch the clubs that have hns_klub_id and savez_id=10.
            cur.execute(
                "SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi "
                "WHERE savez_id = 10 AND hns_klub_id IS NOT NULL"
            )
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, naziv, hns_id in klubovi:
                klub_url = klub_url_map.get(str(hns_id))
                if not klub_url:
                    log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                    continue

                log.info(f"🏟️ {naziv} → {klub_url}")
                html = fetch(klub_url)
                if not html:
                    log.warning(" ❌ Ne mogu dohvatiti stranicu kluba.")
                    continue

                players = parse_roster(html)
                if not players:
                    log.warning(" ⚠️ Nema igrača.")
                    continue

                # Upsert the roster.
                upsert_players(conn, players)
                log.info(f" 👥 {len(players)} igrača")

                # Download the details of every player.
                for hns_igrac_id, _ime, _prezime, profile_url in players:
                    # Skip players whose seasons were scraped recently.
                    cur.execute(
                        "SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons "
                        "WHERE hns_igrac_id = %s",
                        (hns_igrac_id,),
                    )
                    last = cur.fetchone()[0]
                    # now() takes the tzinfo of the stored value so naive and
                    # timezone-aware timestamps can both be compared.
                    if last and (datetime.now(last.tzinfo) - last) < RESCRAPE_AFTER:
                        continue

                    html = fetch(profile_url)
                    if not html:
                        continue

                    seasons = parse_seasons(html, hns_igrac_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)

                    matches = parse_matches(html, hns_igrac_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)

                    # Be polite to the remote server between profile requests.
                    time.sleep(0.3)

                total_players += len(players)
    finally:
        conn.close()

    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()