#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
hns_player_deep.py — SUB3 deep HNS player scraper
─────────────────────────────────────────────────
Author:  dradulic@outlook.com / damir@rinet.one
Date:    2026-05-05
Version: 1.0

Scrapes semafor.hns.family/igraci/{id}/{slug}/ for every clanovi.hns_igrac_id row,
extracting:
  • profil meta (datum_rodenja, mjesto_rodenja, broj_dresa, current klub)
  • per-season stats per natjecanje (UPSERT pgz_sport.hns_player_seasons)
  • last 30+ matches (UPSERT pgz_sport.hns_player_matches)

Server-rendered HTML — no Playwright needed → uses requests for 5–10× speedup.
Fallback to Playwright if --use-playwright is passed.

Resume-able: skips clanovi where last_scraped_at > now() - interval N days.

Usage:
    python3 hns_player_deep.py [--limit 200] [--days 7] [--player HNS_ID] [--use-playwright]
"""
import argparse
import json
import os
import re
import sys
import time
import traceback
from datetime import datetime, date
from urllib.parse import urljoin

import psycopg2
import requests
from dotenv import load_dotenv
from psycopg2.extras import RealDictCursor, execute_values

# Must run before any os.getenv()/os.environ lookup below so that values from
# the master env file are visible to the configuration constants.
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

# ── Configuration ─────────────────────────────────────────────────────────
# An explicit RINET_DSN wins. DB_PASSWORD is only consulted (and only
# required) when RINET_DSN is not set; a missing DB_PASSWORD in that case
# still raises KeyError at import time, as before.
DSN = os.environ.get("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# NOTE(security): a live bot token is hard-coded as the fallback value. It is
# kept here only to preserve current behaviour; rotate it and supply
# TG_BOT_TOKEN exclusively through the environment, then drop this default.
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
SLEEP = float(os.getenv("SLEEP", "0.8"))
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# One log file per run (minute resolution), opened in append mode and kept
# open for the lifetime of the process; every write is flushed immediately.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = f"{LOG_DIR}/sub3_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
LOG_FH = open(LOG_FILE, "a", encoding="utf-8")


def _write_log_line(line: str) -> None:
    """Write one already-formatted line to stdout and to the run log file."""
    print(line, flush=True)
    LOG_FH.write(line + "\n")
    LOG_FH.flush()


def log(msg: str, telegram: bool = False) -> None:
    """Log a timestamped message and optionally forward it to Telegram.

    Args:
        msg: Text to log. It is prefixed with a local ISO-8601 timestamp
            (seconds resolution) for stdout and the log file.
        telegram: When true, and both TG_TOKEN and TG_CHAT are non-empty,
            the raw message (truncated to 4000 characters) is also posted
            to the Telegram chat.

    Telegram delivery is best-effort: a failure never propagates to the
    caller, it is only noted in the local log.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    _write_log_line(f"[{stamp}] {msg}")
    if not (telegram and TG_TOKEN and TG_CHAT):
        return
    try:
        requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": msg[:4000]},
            timeout=8,
        )
    except Exception as exc:
        # Record only the exception class: the exception text produced by
        # requests contains the request URL, which embeds the bot token.
        _write_log_line(f"[{stamp}]  telegram notify failed: {type(exc).__name__}")


# ── HTTP session ──────────────────────────────────────────────────────────
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA, "Accept-Language": "hr,en;q=0.7"})


def fetch_html(url: str, timeout: int = 20) -> str | None:
    """Fetch a page through the shared session.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds handed to requests.

    Returns:
        The response body as text when the status code is exactly 200,
        otherwise None. Any exception raised while fetching is logged and
        also results in None, so callers only need to check for None.
    """
    try:
        r = SESSION.get(url, timeout=timeout)
        if r.status_code != 200:
            log(f" HTTP {r.status_code} {url}")
            return None
        return r.text
    except Exception as e:
        log(f" fetch fail {url}: {e}")
        return None


# ── Parsers ───────────────────────────────────────────────────────────────
def _strip_html(s: str) -> str:
    """Replace every tag with a space, collapse whitespace runs, and trim."""
    s = re.sub(r"<[^>]+>", " ", s)
    return re.sub(r"\s+", " ", s).strip()


def parse_profile(html: str) -> dict:
    """Extract player profile meta (HNS exposes only birth date / city / jersey / current club)."""
    out = {
        "broj_dresa": None,
        "datum_rodenja": None,
        "mjesto_rodenja": None,
        "klub_hns_id": None,
        "klub_naziv": None,
    }
    # playerHeader block (everything from header to first