#!/usr/bin/env python3
"""HNS seasons scraper v3 -- primarily parses the page's __NEXT_DATA__ JSON.

For every member in ``pgz_sport.clanovi`` that has an HNS player id but no
rows yet in ``pgz_sport.hns_player_seasons``, the player's page is loaded in
headless Chromium, season-like objects are collected from the embedded
``__NEXT_DATA__`` JSON, flattened, and inserted into
``pgz_sport.hns_player_seasons``.
"""
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

import json
import os
import re
import sys
import time

import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright

DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# Recursion limit for walking the decoded JSON tree.
_MAX_DEPTH = 25

# Dict keys whose list values are scanned for per-season entries.
_SEASON_LIST_KEYS = frozenset(
    ('careers', 'career', 'seasons', 'sezone', 'statistics', 'stats')
)

# A list entry counts as a season if it carries at least one of these keys.
_SEASON_ITEM_KEYS = ('season', 'sezona', 'year', 'godina')

# Captures the JSON payload of the __NEXT_DATA__ script tag.
_NEXT_DATA_RE = re.compile(r'__NEXT_DATA__"\s*type="application/json">([^<]+)')

# Only URLs containing this fragment are scraped.
_PLAYER_URL_MARKER = 'semafor.hns.family/igraci/'

# NOTE(review): a player whose page yields no season rows never gets a row in
# hns_player_seasons, so it is selected again on every run and keeps occupying
# one of the 200 slots -- confirm this is acceptable.
_TARGETS_SQL = """
    SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
    FROM pgz_sport.clanovi c
    WHERE c.hns_igrac_id IS NOT NULL
      AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s
                      WHERE s.hns_igrac_id = c.hns_igrac_id)
    ORDER BY c.id
    LIMIT 200
"""

_INSERT_SQL = """
    INSERT INTO pgz_sport.hns_player_seasons
        (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje,
         nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT DO NOTHING
"""


def _remember(candidate, found, seen):
    """Append *candidate* to *found* unless that exact object is already there."""
    if id(candidate) not in seen:
        seen.add(id(candidate))
        found.append(candidate)


def _collect_seasons(obj, found, seen, depth):
    """Walk *obj* recursively, appending season-like dicts to *found*."""
    if depth > _MAX_DEPTH:
        return
    if isinstance(obj, dict):
        # A dict is season-like when 'season' holds a str/dict or it has 'sezona'.
        if isinstance(obj.get('season'), (str, dict)) or 'sezona' in obj:
            _remember(obj, found, seen)
        for key, value in obj.items():
            # A career-style object: a list of per-season entries under a known key.
            if (
                isinstance(key, str)
                and key.lower() in _SEASON_LIST_KEYS
                and isinstance(value, list)
            ):
                for item in value:
                    if isinstance(item, dict) and any(
                        k in item for k in _SEASON_ITEM_KEYS
                    ):
                        _remember(item, found, seen)
            _collect_seasons(value, found, seen, depth + 1)
    elif isinstance(obj, list):
        for item in obj:
            _collect_seasons(item, found, seen, depth + 1)


def find_seasons(obj, found=None, depth=0):
    """Collect season-like dicts from an arbitrarily nested JSON structure.

    Args:
        obj: Decoded JSON value (dict, list or scalar) to search.
        found: Optional list to append matches to; a new list is used if None.
        depth: Current nesting depth; the walk stops beyond ``_MAX_DEPTH``.

    Returns:
        The list of matching dicts in discovery order. Each dict object is
        listed once, even when it matches both as a list entry and as a
        season-like dict in its own right.
    """
    if found is None:
        found = []
    # Seed with the caller's entries so they are not appended a second time.
    seen = {id(item) for item in found}
    _collect_seasons(obj, found, seen, depth)
    return found


def _to_int(value):
    """Best-effort integer conversion; returns 0 when nothing numeric is found."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    # Fall back to the digits only, e.g. "90'" -> 90 or "1.234" -> 1234.
    digits = re.sub(r'\D', '', str(value))
    try:
        return int(digits) if digits else 0
    except ValueError:
        return 0


def normalize_season(s):
    """Convert a raw season dict into a flat row.

    Args:
        s: Season-like dict as returned by ``find_seasons``.

    Returns:
        Dict with keys ``sezona``, ``klub``, ``natjecanje``, ``nastupi``,
        ``startna``, ``zamjena``, ``golovi``, ``asistencije``, ``zuti``,
        ``crveni`` and ``minute``. Text fields are strings (``klub`` cut to
        200 chars, ``natjecanje`` to 100); counters are ints, 0 when absent.
    """
    sezona = s.get('season') or s.get('sezona') or s.get('year') or s.get('godina') or ''
    if isinstance(sezona, dict):
        sezona = sezona.get('name') or sezona.get('label') or str(sezona.get('year', ''))
    sezona = str(sezona)

    klub = s.get('club') or s.get('klub') or s.get('team') or ''
    if isinstance(klub, dict):
        klub = klub.get('name') or klub.get('naziv') or ''

    natj = s.get('competition') or s.get('natjecanje') or s.get('league') or ''
    if isinstance(natj, dict):
        natj = natj.get('name') or natj.get('naziv') or ''

    def num(*fragments):
        """Return the int value of the first key containing one of *fragments*."""
        # NOTE(review): matching is by case-insensitive substring, so a short
        # fragment such as 'red' or 'min' also matches unrelated keys that
        # merely contain it -- verify against the real payload.
        for fragment in fragments:
            needle = fragment.lower()
            for key, value in s.items():
                if not isinstance(key, str) or needle not in key.lower():
                    continue
                if isinstance(value, (dict, list)):
                    # A nested container is not a counter; stripping digits
                    # out of its repr would fabricate a number. Keep looking.
                    continue
                return _to_int(value)
        return 0

    return {
        'sezona': sezona,
        'klub': str(klub)[:200],
        'natjecanje': str(natj)[:100],
        'nastupi': num('matches', 'nastup', 'appearance'),
        'startna': num('start'),
        'zamjena': num('sub', 'zamjen'),
        'golovi': num('goal', 'gol'),
        'asistencije': num('assist', 'asist'),
        'zuti': num('yellow', 'žut', 'zut'),
        'crveni': num('red', 'crv'),
        'minute': num('minute', 'minut', 'min'),
    }


def _fetch_targets(conn):
    """Return up to 200 members that have an HNS id but no season rows yet."""
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(_TARGETS_SQL)
        return cur.fetchall()


def _extract_rows(html):
    """Parse __NEXT_DATA__ out of *html* and return normalized season rows.

    Returns an empty list when the page has no __NEXT_DATA__ payload.
    Raises ValueError (json.JSONDecodeError) when the payload is not valid JSON.
    """
    match = _NEXT_DATA_RE.search(html)
    if not match:
        return []
    data = json.loads(match.group(1))
    normalized = (normalize_season(raw) for raw in find_seasons(data))
    # Rows without a season label are dropped.
    return [row for row in normalized if row['sezona']]


def _insert_rows(conn, target, rows):
    """Insert *rows* for *target* and return how many were actually stored."""
    inserted = 0
    with conn.cursor() as cur:
        for row in rows:
            try:
                cur.execute(_INSERT_SQL, (
                    target['hns_igrac_id'], target['clan_id'],
                    row['sezona'], row['klub'], row['natjecanje'],
                    row['nastupi'], row['startna'], row['zamjena'],
                    row['golovi'], row['asistencije'],
                    row['zuti'], row['crveni'], row['minute'],
                ))
            except psycopg2.Error as exc:
                # Best effort: report the bad row and carry on with the rest.
                print(
                    f"   ! insert failed for hns_igrac_id={target['hns_igrac_id']} "
                    f"sezona={row['sezona']!r}: {exc}",
                    file=sys.stderr, flush=True,
                )
                continue
            # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row, so
            # only rows that were really written are counted.
            inserted += max(cur.rowcount, 0)
    return inserted


def main():
    """Scrape season data for pending players and store it in the database.

    Failures on a single player (navigation, parsing, insert) are reported on
    stderr and do not stop the run.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit: each INSERT stands alone, so one failed statement does
        # not abort the following ones.
        conn.autocommit = True
        targets = _fetch_targets(conn)
        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--ignore-certificate-errors"],
            )
            try:
                page = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0",
                ).new_page()

                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or _PLAYER_URL_MARKER not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        time.sleep(0.8)
                        rows = _extract_rows(page.content())
                        if rows:
                            inserted = _insert_rows(conn, t, rows)
                            seasons_added += inserted
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: +{inserted} sezone (total: {seasons_added})", flush=True)
                    except Exception as exc:
                        # Per-player boundary: log and move on to the next one.
                        print(
                            f" ✗ [{i}/{len(targets)}] {url}: {exc!r}",
                            file=sys.stderr, flush=True,
                        )
                    if i % 30 == 0 and i > 0:
                        print(f" [{i}/{len(targets)}] processed, total: {seasons_added}", flush=True)
            finally:
                browser.close()

        print(f"\n✅ Done. Total: {seasons_added}", flush=True)
    finally:
        conn.close()


if __name__ == '__main__':
    main()