#!/usr/bin/env python3
"""
HNS Master Harvester — Playwright-based scrape of semafor.hns.family
─────────────────────────────────────────────────────────────────
1. List PGŽ-funded football clubs
2. For each club: scrape the club roster
3. For each player: scrape the full profile (seasons, matches)
4. UPSERT into pgz_sport: hns_klub_roster, hns_player_seasons, hns_player_matches, clanovi
5. Audit log

Usage: python3 hns_master_harvester.py [--limit N] [--klub-id X] [--players-only]
"""
# The .env file is loaded before anything else so that every os.getenv /
# os.environ lookup below sees its values.
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

import os
import sys
import time
import json
import re
import argparse
import subprocess
import traceback
from datetime import datetime
from urllib.parse import urlparse

import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
from playwright.sync_api import sync_playwright

# Connection string. DB_PASSWORD is only required when RINET_DSN is not set:
# the lookup is deferred so that a fully specified RINET_DSN works on its own.
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# SECURITY NOTE(review): a bot token is hard-coded as the fallback value. It is
# kept here only so existing deployments keep working; it should be rotated and
# supplied exclusively through TG_BOT_TOKEN.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# One log file per start minute, opened in append mode. The directory is created
# if missing, and UTF-8 is forced because log lines contain non-ASCII text.
_LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(_LOG_DIR, exist_ok=True)
LOG = open(f"{_LOG_DIR}/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a", encoding="utf-8")

# Croatian diacritics → ASCII, applied to already lower-cased text.
_HNS_TRANSLIT = str.maketrans("čćžšđ", "cczsd")

# "<season> <club text> <numbers>" runs in the page's plain text,
# e.g. "2024/25 ... HNK Orijent ... 3.HNL ... 14 ... 2".
_SEASON_BLOCK_RE = re.compile(
    r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)'
)

# Capitalised word, used to split a name whose parts were joined ("FrankoAndrijašević").
_CAMEL_NAME_RE = re.compile(r'[A-ZČĆŠŽĐ][a-zčćšžđ\']+')


def log(msg, telegram=False):
    """Write a timestamped line to stdout and to LOG; optionally notify Telegram.

    Args:
        msg: Message to log. It is formatted into the line, so any object works.
        telegram: When true, also POST the first 2000 characters of the message
            to the Telegram Bot API through ``curl``.

    The Telegram call is best effort: a failure is recorded in the log and never
    propagates to the caller.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        subprocess.run(
            ["curl", "-s", "-X", "POST",
             f"https://api.telegram.org/bot{TG}/sendMessage",
             "-d", f"chat_id={TG_CHAT}",
             "--data-urlencode", f"text={str(msg)[:2000]}"],
            timeout=8, capture_output=True)
    except Exception as exc:
        # Deliberately broad (missing curl, timeout, bad argument), but unlike a
        # bare `except` it lets KeyboardInterrupt / SystemExit through.
        failure = f"[{datetime.now().isoformat(timespec='seconds')}] telegram notify failed: {exc}"
        print(failure, flush=True)
        LOG.write(failure + "\n")
        LOG.flush()


def db_conn():
    """Open a psycopg2 connection to DSN with autocommit enabled and return it."""
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    return conn


# ── Slug HNS = "Franko Andrijašević" → "franko-andrijasevic" ──
def slugify_hns(text):
    """Turn a display name into a lower-case, hyphen-separated ASCII slug.

    Args:
        text: Name to convert; a falsy value yields ``""``.

    Returns:
        The slug: lower-cased, Croatian diacritics transliterated, every
        character other than ``a-z``, ``0-9``, whitespace and ``-`` removed,
        whitespace runs replaced by ``-``, and leading/trailing ``-`` stripped.
    """
    if not text:
        return ""
    # Lower-casing first means only the lower-case diacritics need mapping.
    slug = text.lower().strip().translate(_HNS_TRANSLIT)
    slug = re.sub(r'[^a-z0-9\s-]', '', slug)
    return re.sub(r'\s+', '-', slug).strip('-')


def scrape_player(page, hns_id, slug):
    """Scrape a player profile page and collect season rows from it.

    Args:
        page: Browser page used for navigation and queries (presumably a
            Playwright sync ``Page`` — confirm against the caller).
        hns_id: Player id used in the profile URL.
        slug: Player slug used in the profile URL.

    Returns:
        ``None`` when navigation fails, otherwise a dict with the keys
        ``hns_id``, ``slug``, ``naziv`` (text of the first ``h1``), ``url``,
        ``current_klub`` (dict or ``None``), ``sezone_count``, ``seasons``,
        ``matches`` (always an empty list here) and ``body_text_len``.
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None

    headings = page.locator('h1')
    h1 = headings.first.inner_text() if headings.count() else ''

    # Current club: the first link on the page that points at /klubovi/.
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        href = klub_links[0].get_attribute('href') or ''
        m = re.search(r'/klubovi/(\d+)/([\w-]+)/', href)
        if m:
            current_klub = {
                'hns_id': m.group(1),
                'slug': m.group(2),
                'naziv': klub_links[0].inner_text().strip(),
            }

    seasons_data = []
    matches_data = []

    # The tables may be rendered dynamically: give them a chance to appear.
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except Exception:
        # Not every profile has such an element; carry on with whatever loaded.
        pass
    time.sleep(1)

    # Read the body text only after the wait so dynamic content is included.
    body_text = page.locator('body').inner_text()

    # Career section parsed from plain text:
    # "Season | Club | Competition | Appearances | Goals".
    for sezona, klub_text, stats_text in _SEASON_BLOCK_RE.findall(body_text):
        nums = re.findall(r'\d+', stats_text)
        if not nums:
            # The stats group can consist of dots/whitespace only.
            continue
        seasons_data.append({
            'sezona': sezona,
            'klub': klub_text.strip()[:200],
            'nastupi': int(nums[0]),
            'golovi': int(nums[1]) if len(nums) > 1 else 0,
        })

    # Every table row that has a cell starting with a season ("2024/25") is kept
    # as a header → cell mapping, plus the detected season under 'sezona'.
    for table in page.locator('table').all():
        rows = table.locator('tr').all()
        if len(rows) < 2:
            continue
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            row_dict = dict(zip(header, cells))
            sezona = next((v for v in row_dict.values() if re.match(r'\d{4}/\d{2}', v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }


def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Scrape a club page and list every distinct player linked from it.

    Args:
        page: Browser page used for navigation and queries (presumably a
            Playwright sync ``Page`` — confirm against the caller).
        klub_hns_id: Club id used in the club URL.
        klub_slug: Club slug used in the club URL.

    Returns:
        A list of dicts with ``hns_id``, ``slug``, ``naziv`` (link text) and
        ``url``, one per distinct player id in page order; an empty list when
        navigation fails.
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    players = []
    seen_ids = set()
    for a in page.locator('a[href*="/igraci/"]').all():
        href = a.get_attribute('href') or ''
        m = re.search(r'/igraci/(\d+)/([\w-]+)', href)
        if not m:
            continue
        hns_id = m.group(1)
        if hns_id in seen_ids:
            # The same player can be linked more than once; keep the first link.
            continue
        seen_ids.add(hns_id)
        players.append({
            'hns_id': hns_id,
            'slug': m.group(2),
            'naziv': a.inner_text().strip(),
            # Site-relative links are made absolute.
            'url': f"https://semafor.hns.family{href}" if href.startswith('/') else href,
        })
    return players


def _split_player_name(naziv):
    """Split a display name into ``(ime, prezime)`` (first name, surname)."""
    naziv = re.sub(r'\s+', ' ', naziv or '').strip()
    if ' ' in naziv:
        # Already separated: first token is the first name, the rest the surname.
        # Splitting on whitespace keeps letters, hyphens and casing intact.
        ime, prezime = naziv.split(' ', 1)
        return ime, prezime
    # No whitespace: the heading may have been joined ("FrankoAndrijašević").
    parts = _CAMEL_NAME_RE.findall(naziv)
    if len(parts) >= 2:
        return parts[0], ' '.join(parts[1:])
    return naziv, ''


def upsert_clan(conn, klub_id, player_data):
    """Insert or update a member row in ``pgz_sport.clanovi`` from HNS profile data.

    Args:
        conn: Open database connection; a cursor is taken from it.
        klub_id: Club id stored on the row (on update only if the row has none).
        player_data: Dict with ``hns_id`` and ``url`` (required) and ``naziv``
            (optional display name).

    Returns:
        The ``id`` of the existing row that was updated, or of the new row.

    An existing row is looked up by ``hns_igrac_id``. On update, non-empty
    ``ime`` / ``prezime`` and non-null ``klub_id`` / ``sport`` are preserved.
    """
    ime, prezime = _split_player_name(player_data.get('naziv'))
    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        # Look for an existing member with this HNS player id.
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        row = cur.fetchone()
        if row:
            clan_id = row[0]
            cur.execute("""
                UPDATE pgz_sport.clanovi SET
                    ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    hns_igrac_id = %s,
                    source = 'hns_semafor',
                    source_url = %s,
                    last_updated = now(),
                    last_scraped_at = now(),
                    sport = COALESCE(sport, 'nogomet')
                WHERE id = %s
            """, (ime, prezime, klub_id, hns_id, url, clan_id))
        else:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_url, hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            clan_id = cur.fetchone()[0]
    return clan_id


def upsert_seasons(conn, hns_id, clan_id, seasons):
    if not seasons:
        return 0
    rows = []
    skipped = 0
    # Reject rows where klub_naziv is obviously a misparsed HTML stat-block
    # (the parser at scrape_player_full() can produce these when a