#!/usr/bin/env python3
"""HNS seasons retry — simplified extraction.

Selects club members that have an HNS player id but no rows yet in
``pgz_sport.hns_player_seasons``, opens each member's player page in headless
Chromium, extracts season rows from the page, and inserts them.
"""
import json
import os
import re
import sys
import time
from datetime import datetime

import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright

# SECURITY NOTE(review): the fallback literal below embeds database credentials
# in the source file. Supply the DSN through the HNS_DSN environment variable
# instead, then rotate this password and remove the literal.
DSN = os.environ.get(
    "HNS_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# Only URLs containing this marker are visited.
_PLAYER_URL_MARKER = 'semafor.hns.family/igraci/'

# Captures the JSON payload that follows the __NEXT_DATA__ script tag opener.
_NEXT_DATA_RE = re.compile(r'__NEXT_DATA__"\s*type="application/json">([^<]+)')

# "<20YY/YY> <free text> <one or more integers>" on a single line of body text.
_SEASON_LINE_RE = re.compile(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$')

_TARGETS_SQL = """
    SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
    FROM pgz_sport.clanovi c
    WHERE c.hns_igrac_id IS NOT NULL
      AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s
                      WHERE s.hns_igrac_id = c.hns_igrac_id)
    ORDER BY c.id
    LIMIT 200
"""

_INSERT_SQL = """
    INSERT INTO pgz_sport.hns_player_seasons
        (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT DO NOTHING
"""


def find_seasons_in_obj(obj, found=None):
    """Recursively collect every dict that has a 'season' or 'sezona' key.

    Args:
        obj: Any decoded JSON value (dict, list, or scalar).
        found: Accumulator list; a new list is created when omitted.

    Returns:
        The accumulator list holding the matching dicts in traversal order.
        A matching dict is still descended into, so nested matches are
        collected as well.
    """
    if found is None:
        found = []
    if isinstance(obj, dict):
        if 'season' in obj or 'sezona' in obj:
            found.append(obj)
        for value in obj.values():
            find_seasons_in_obj(value, found)
    elif isinstance(obj, list):
        for item in obj:
            find_seasons_in_obj(item, found)
    return found


def _fetch_targets(conn):
    """Return up to 200 members that have an HNS id but no season rows yet."""
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(_TARGETS_SQL)
        return cur.fetchall()


def _rows_from_next_data(html):
    """Build season rows from the page's embedded __NEXT_DATA__ JSON, if any.

    Only the season label is taken from the JSON; club, competition,
    appearances and goals are filled with empty/zero placeholders.
    """
    payload = _NEXT_DATA_RE.search(html)
    if not payload:
        return []
    try:
        data = json.loads(payload.group(1))
        season_objs = find_seasons_in_obj(data)
    except (ValueError, RecursionError) as exc:
        # Best effort: a broken payload just means we fall back to body text.
        print(f" ! __NEXT_DATA__ parse failed: {exc}", flush=True)
        return []

    labels = []
    for entry in season_objs:
        # NOTE(review): a non-scalar 'season' value (nested object) is
        # stringified as-is, matching the previous behavior — confirm the
        # payload shape.
        sezona = entry.get('season') or entry.get('sezona')
        if sezona:
            labels.append(str(sezona))

    # Rows differ only by season label, so repeated labels would yield
    # byte-identical rows; keep the first occurrence of each, in order.
    return [
        {'sezona': label, 'klub': '', 'natjecanje': '', 'nastupi': 0, 'golovi': 0}
        for label in dict.fromkeys(labels)
    ]


def _rows_from_body_text(body):
    """Build season rows by regex-matching each line of the page's body text."""
    rows = []
    for line in body.split('\n'):
        match = _SEASON_LINE_RE.match(line.strip())
        if not match:
            continue
        # The pattern requires at least one integer, so nums is never empty.
        nums = [int(x) for x in match.group(3).split()]
        rows.append({
            'sezona': match.group(1),
            'klub': match.group(2)[:200],
            'natjecanje': '',
            'nastupi': nums[0],
            'golovi': nums[1] if len(nums) > 1 else 0,
        })
    return rows


def _extract_rows(page, url):
    """Load *url* in *page* and return the season rows found on it."""
    page.goto(url, wait_until="networkidle", timeout=20000)
    try:
        page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
    except PlaywrightTimeoutError:
        # The selector is only a readiness hint; extraction is attempted anyway.
        pass
    time.sleep(0.5)

    rows = _rows_from_next_data(page.content())
    if not rows:
        rows = _rows_from_body_text(page.locator('body').inner_text())
    return rows


def _insert_rows(conn, target, rows):
    """Insert *rows* for *target* and return how many were actually stored.

    Rows skipped by ``ON CONFLICT DO NOTHING`` are not counted. A failing
    statement is reported and skipped; the caller's connection is in
    autocommit mode, so one failure does not affect the other rows.
    """
    inserted = 0
    with conn.cursor() as cur:
        for row in rows:
            try:
                cur.execute(_INSERT_SQL, (
                    target['hns_igrac_id'], target['clan_id'], row['sezona'],
                    row['klub'], row['natjecanje'], row['nastupi'], row['golovi'],
                ))
            except psycopg2.Error as exc:
                print(f" ! insert failed ({target['hns_igrac_id']}, {row['sezona']}): {exc}",
                      flush=True)
                continue
            if cur.rowcount > 0:
                inserted += 1
    return inserted


def main():
    """Scrape season rows for every pending target and store them."""
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        targets = _fetch_targets(conn)
        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--ignore-certificate-errors"],
            )
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0",
                )
                page = ctx.new_page()
                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or _PLAYER_URL_MARKER not in url:
                        continue
                    try:
                        rows = _extract_rows(page, url)
                        if rows:
                            seasons_added += _insert_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)
                        if i % 20 == 0:
                            print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                    except Exception as e:
                        # Per-target boundary: report and move on to the next player.
                        print(f" ❌ {t['ime']}: {e}", flush=True)
            finally:
                browser.close()
        print(f"\nDone. Total sezone added: {seasons_added}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()