#!/usr/bin/env python3
"""HNS Master Harvester — Playwright-based scraper for semafor.hns.family.

Pipeline:
  1. List the PGŽ-financed football clubs.
  2. For each club: scrape the club roster.
  3. For each player: scrape the profile page (seasons; matches are not
     parsed yet, the ``matches`` list is always empty).
  4. UPSERT into pgz_sport: hns_klub_roster, hns_player_seasons, clanovi.
  5. Log every step to stdout and a log file (start/summary also go to
     Telegram).

Usage:
    python3 hns_master_harvester.py [--limit N] [--klub-id X]
                                    [--single-player HNS_ID] [--max-players N]
"""
import argparse
import json
import os
import re
import subprocess
import sys
import time
import traceback
from datetime import datetime
from urllib.parse import urlparse

# Third-party imports are guarded so the pure parsing helpers in this module
# stay importable (e.g. for unit tests) on machines without the DB driver or
# the browser automation package. db_conn() / main() raise ImportError with a
# clear message when the package they need is missing.
try:
    import psycopg2
    from psycopg2.extras import RealDictCursor, execute_values
except ImportError:
    psycopg2 = None
    RealDictCursor = None
    execute_values = None

try:
    from playwright.sync_api import sync_playwright
except ImportError:
    sync_playwright = None

# Explicit DSN override. When unset, the DSN is built lazily in _resolve_dsn()
# so that DB_PASSWORD is only required when it is actually used.
DSN = os.getenv("RINET_DSN")

# SECURITY: a bot token is committed here as a fallback default. It is kept
# only so existing deployments keep sending notifications; rotate it and
# provide TG_BOT_TOKEN via the environment instead.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

LOG_DIR = "/var/log/pgz-sport-debug"
LOG = None  # log file handle, opened on first log() call
_log_open_failed = False

# Safety cap on how many roster players are scraped per club.
MAX_PLAYERS_PER_KLUB = 30

_SLUG_TRANSLATION = str.maketrans({'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd'})
_CAMEL_BOUNDARY_RE = re.compile(r"(?<=[a-zčćšžđ])(?=[A-ZČĆŠŽĐ])")
_KLUB_HREF_RE = re.compile(r'/klubovi/(\d+)/([\w-]+)/')
_KLUB_SOURCE_RE = re.compile(r'/klubovi/(\d+)/([^/]*)')
_PLAYER_HREF_RE = re.compile(r'/igraci/(\d+)/([\w-]+)')
_SEASON_CELL_RE = re.compile(r'\d{4}/\d{2}')
# Career section in plain text: "2024/25 ... HNK Orijent ... 14 ... 2"
_SEASON_BLOCK_RE = re.compile(
    r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)'
)

# Labels of the statistics widget; a "club name" starting with one of these is
# a mis-parsed stat block, not a club.
_BAD_PREFIXES = ('STATISTIKA', 'NASTUPI', 'ZAPOČEO', 'ZAMJENA', 'POGOTCI',
                 'ŽUTI', 'CRVENI', 'UKUPNO', 'SUPERSPORT')


def _open_log_file():
    """Return the log file handle, opening it on first use.

    Returns None (after one warning on stderr) when the file cannot be
    opened, so logging degrades to stdout instead of aborting the harvest.
    """
    global LOG, _log_open_failed
    if LOG is None and not _log_open_failed:
        path = os.path.join(
            LOG_DIR,
            f"hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log",
        )
        try:
            # Explicit UTF-8: messages contain emoji and Croatian letters.
            LOG = open(path, "a", encoding="utf-8")
        except OSError as exc:
            _log_open_failed = True
            print(f"[hns_master_harvester] cannot open log file {path}: {exc}",
                  file=sys.stderr, flush=True)
    return LOG


def _send_telegram(msg):
    """Best-effort Telegram notification via curl; failures are ignored."""
    try:
        subprocess.run(
            ["curl", "-s", "-X", "POST",
             f"https://api.telegram.org/bot{TG}/sendMessage",
             "-d", f"chat_id={TG_CHAT}",
             "--data-urlencode", f"text={str(msg)[:2000]}"],
            timeout=8, capture_output=True)
    except (OSError, subprocess.SubprocessError):
        # curl missing or timed out: notifications must never stop a harvest.
        pass


def log(msg, telegram=False):
    """Write a timestamped line to stdout and the log file.

    Args:
        msg: Message text.
        telegram: When true, also send the message to the Telegram chat.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    handle = _open_log_file()
    if handle is not None:
        handle.write(line + "\n")
        handle.flush()
    if telegram:
        _send_telegram(msg)


def _resolve_dsn():
    """Return the DSN: RINET_DSN if set, else one built from DB_PASSWORD.

    Raises:
        KeyError: If neither RINET_DSN nor DB_PASSWORD is set.
    """
    if DSN is not None:
        return DSN
    return ("host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
            f"password={os.environ['DB_PASSWORD']}")


def db_conn():
    """Open an autocommit psycopg2 connection to the target database."""
    if psycopg2 is None:
        raise ImportError("psycopg2 is required for database access")
    c = psycopg2.connect(_resolve_dsn())
    c.autocommit = True
    return c


def slugify_hns(text):
    """Build an HNS-style URL slug.

    "Franko Andrijašević" -> "franko-andrijasevic". Returns "" for empty or
    None input.
    """
    if not text:
        return ""
    t = text.lower().strip().translate(_SLUG_TRANSLATION)
    t = re.sub(r'[^a-z0-9\s-]', '', t)
    return re.sub(r'\s+', '-', t).strip('-')


def split_name(naziv):
    """Split a display name into ``(ime, prezime)``.

    The first whitespace-separated token is the first name and the rest is
    the surname, so particles, hyphens and foreign letters survive
    ("Ivan de Souza" -> ("Ivan", "de Souza")). A name without any whitespace
    is first split on lower->upper case boundaries, which handles headings
    rendered as "FrankoAndrijašević".
    """
    clean = " ".join((naziv or "").split())
    if " " not in clean:
        clean = _CAMEL_BOUNDARY_RE.sub(" ", clean)
    ime, _, prezime = clean.partition(" ")
    return ime, prezime


def scrape_player(page, hns_id, slug):
    """Scrape one player profile page.

    Args:
        page: Playwright page used for navigation.
        hns_id: HNS player id (used in the URL).
        slug: URL slug of the player.

    Returns:
        A dict with keys hns_id, slug, naziv, url, current_klub,
        sezone_count, seasons, matches, body_text_len; or None when the
        page could not be loaded.
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None

    h1 = page.locator('h1').first.inner_text() if page.locator('h1').count() else ''

    # Current club: the first link pointing to /klubovi/.
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        href = klub_links[0].get_attribute('href') or ''
        m = _KLUB_HREF_RE.search(href)
        if m:
            current_klub = {'hns_id': m.group(1), 'slug': m.group(2),
                            'naziv': klub_links[0].inner_text().strip()}

    seasons_data = []
    matches_data = []  # match parsing is not implemented yet

    # Tables may be rendered dynamically; give them a chance to appear.
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except Exception:
        # Timeout just means there is no career table; parse what is there.
        pass
    time.sleep(1)

    body_text = page.locator('body').inner_text()

    # Plain-text career section: season, club text, then the stat numbers.
    for sezona, klub_text, stats_text in _SEASON_BLOCK_RE.findall(body_text):
        nums = re.findall(r'\d+', stats_text)
        if nums:
            seasons_data.append({
                'sezona': sezona,
                'klub': klub_text.strip()[:200],
                'nastupi': int(nums[0]),
                'golovi': int(nums[1]) if len(nums) > 1 else 0,
            })

    # HTML tables: keep every row that has a cell looking like "2024/25".
    for table in page.locator('table').all():
        rows = table.locator('tr').all()
        if len(rows) < 2:
            continue
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            row_dict = dict(zip(header, cells))
            sezona = next((v for v in row_dict.values() if _SEASON_CELL_RE.match(v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }


def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Scrape a club page and return its players.

    Returns:
        A list of dicts (hns_id, slug, naziv, url), one per distinct player
        id in page order; empty when the page could not be loaded.
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    by_id = {}  # insertion-ordered: hns_id -> player dict
    for a in page.locator('a[href*="/igraci/"]').all():
        href = a.get_attribute('href') or ''
        m = _PLAYER_HREF_RE.search(href)
        if not m:
            continue
        hns_id = m.group(1)
        naziv = a.inner_text().strip()
        known = by_id.get(hns_id)
        if known is not None:
            # The first link may be an image link without text; take the
            # name from a later link to the same player.
            if not known['naziv'] and naziv:
                known['naziv'] = naziv
            continue
        by_id[hns_id] = {
            'hns_id': hns_id,
            'slug': m.group(2),
            'naziv': naziv,
            'url': f"https://semafor.hns.family{href}" if href.startswith('/') else href,
        }
    return list(by_id.values())


def upsert_clan(conn, klub_id, player_data):
    """Insert or update a member row from scraped HNS profile data.

    An existing row is matched on hns_igrac_id; its non-empty ime, prezime,
    klub_id and sport are preserved. Returns the member id.
    """
    ime, prezime = split_name(player_data.get('naziv', ''))
    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        row = cur.fetchone()
        if row:
            clan_id = row[0]
            cur.execute("""
                UPDATE pgz_sport.clanovi
                SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    hns_igrac_id = %s,
                    source = 'hns_semafor',
                    source_url = %s,
                    last_updated = now(),
                    last_scraped_at = now(),
                    sport = COALESCE(sport, 'nogomet')
                WHERE id = %s
            """, (ime, prezime, klub_id, hns_id, url, clan_id))
        else:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_url,
                     hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            clan_id = cur.fetchone()[0]
    return clan_id


def looks_like_garbage(klub_text):
    """Return True when a "club name" is obviously a mis-parsed value.

    scrape_player() can produce these when a table row has fewer cells than
    the header: dict(zip(...)) silently shifts values, leaving whole-block
    dumps or bare numbers in the club column.
    """
    if not klub_text:
        return True
    t = klub_text.strip()
    if not t:
        return True                      # whitespace only
    if re.match(r'^\d+$', t):
        return True                      # bare number (year, jersey #)
    if t.count('\n') >= 2:
        return True                      # multi-line label dump
    return t.upper().startswith(_BAD_PREFIXES)


def _season_stat(season, fragment):
    """Return the int value of the first key containing *fragment*, else 0.

    Values may be strings (table cells) or ints (regex-parsed seasons).
    """
    for key, value in season.items():
        if fragment in key.lower():
            digits = re.sub(r'\D', '', str(value))
            return int(digits) if digits else 0
    return 0


def build_season_rows(hns_id, clan_id, seasons):
    """Convert scraped season dicts into hns_player_seasons row tuples.

    Returns:
        ``(rows, skipped)``: the row tuples, and how many seasons were
        rejected because their club name looked like garbage. Entries
        without a ``sezona`` value are dropped without being counted.
    """
    rows = []
    skipped = 0
    for s in seasons:
        sezona = s.get('sezona', '')
        if not sezona:
            continue
        klub = next((v for k, v in s.items() if 'lub' in k.lower()), '')
        natjecanje = next((v for k, v in s.items()
                           if 'atjec' in k.lower() or 'liga' in k.lower()), '')
        if looks_like_garbage(klub):
            skipped += 1
            continue
        rows.append((
            hns_id, clan_id, sezona, None, klub, natjecanje,
            _season_stat(s, 'nastup'), _season_stat(s, 'start'),
            _season_stat(s, 'zamj'), _season_stat(s, 'gol'),
            _season_stat(s, 'asist'), _season_stat(s, 'žut'),
            _season_stat(s, 'crv'), _season_stat(s, 'minut'),
        ))
    return rows, skipped


def upsert_seasons(conn, hns_id, clan_id, seasons):
    """Upsert a player's seasons; return the number of rows written."""
    if not seasons:
        return 0
    rows, skipped = build_season_rows(hns_id, clan_id, seasons)
    if skipped:
        log(f'[hns_master_harvester] upsert_seasons: skipped {skipped} garbage rows for hns_id={hns_id}')
    if not rows:
        return 0
    # NOTE(review): klub_hns_id is always NULL here. With a plain unique
    # constraint NULLs never conflict, so the ON CONFLICT branch presumably
    # never fires and re-runs insert duplicates — confirm against the schema.
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
                (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
                 nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
                nastupi = EXCLUDED.nastupi,
                startna = EXCLUDED.startna,
                zamjena = EXCLUDED.zamjena,
                golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije,
                zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni,
                minute = EXCLUDED.minute,
                scraped_at = now()
        """, rows)
    return len(rows)


def upsert_klub_roster(conn, klub_id, klub_hns_id, players):
    """Upsert the scraped roster of one club; return the number of rows."""
    if not players:
        return 0
    rows = []
    for p in players:
        ime, prezime = split_name(p.get('naziv', ''))
        rows.append((klub_id, klub_hns_id, p['hns_id'], ime, prezime,
                     p.get('pozicija', ''), p.get('url', '')))
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_klub_roster
                (klub_id, klub_hns_id, hns_igrac_id, ime, prezime, pozicija, source_url)
            VALUES %s
            ON CONFLICT (klub_hns_id, hns_igrac_id) DO UPDATE SET
                klub_id = EXCLUDED.klub_id,
                scraped_at = now()
        """, rows)
    return len(rows)


def _load_target_klubs(conn, args):
    """Return the club rows to harvest for the given CLI arguments."""
    if args.single_player:
        return []
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        if args.klub_id:
            cur.execute("SELECT * FROM pgz_sport.klubovi WHERE id = %s", (args.klub_id,))
        else:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_financirani_klubovi
                WHERE sport = 'nogomet' AND source_url LIKE %s
                ORDER BY id LIMIT %s
            """, ('%semafor.hns.family/klubovi%', args.limit))
        return cur.fetchall()


def _harvest_klub(conn, page, klub, stats, max_players):
    """Scrape one club's roster and players, updating *stats* in place."""
    src = klub.get('source_url', '') or ''
    m = _KLUB_SOURCE_RE.search(src)
    if not m:
        log(f" ⏭ Klub {klub['id']} {klub['naziv']} — no HNS URL")
        return
    khns, kslug = m.group(1), m.group(2) or 'klub'
    log(f"\n🏟 Klub {klub['id']} {klub['naziv']} → HNS {khns}/{kslug}")

    roster = scrape_klub_roster(page, khns, kslug)
    log(f" Roster: {len(roster)} igrača")
    if roster:
        upsert_klub_roster(conn, klub['id'], khns, roster)

    for p in roster[:max_players]:
        try:
            time.sleep(0.5)  # be polite to the source site
            pdata = scrape_player(page, p['hns_id'], p['slug'])
            if pdata:
                # A profile without <h1> would otherwise create a nameless
                # member; fall back to the name seen on the roster page.
                if not pdata.get('naziv') and p.get('naziv'):
                    pdata['naziv'] = p['naziv']
                clan_id = upsert_clan(conn, klub['id'], pdata)
                n_seas = upsert_seasons(conn, pdata['hns_id'], clan_id, pdata.get('seasons', []))
                stats['players_scraped'] += 1
                stats['seasons_upserted'] += n_seas
                log(f" ✓ {pdata['naziv']} (clan_id={clan_id}, seasons={n_seas})")
        except Exception as e:
            stats['errors'] += 1
            log(f" ❌ Player {p['hns_id']}: {e}")
    stats['klubova'] += 1


def main():
    """CLI entry point: harvest clubs, or one player in --single-player mode."""
    ap = argparse.ArgumentParser()
    ap.add_argument('--limit', type=int, default=999)
    ap.add_argument('--klub-id', type=int, default=None)
    ap.add_argument('--single-player', help='HNS ID of single player to scrape')
    ap.add_argument('--max-players', type=int, default=MAX_PLAYERS_PER_KLUB,
                    help='Max players scraped per klub (safety cap)')
    args = ap.parse_args()

    if sync_playwright is None:
        raise ImportError("playwright is required to run the harvester")

    conn = db_conn()
    try:
        klubovi = _load_target_klubs(conn, args)
        log(f"🚀 HNS Harvester starting. Target klubova: {len(klubovi)}", telegram=True)
        stats = {'klubova': 0, 'players_scraped': 0, 'seasons_upserted': 0, 'errors': 0}

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True,
                                         args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                if args.single_player:
                    # Test mode: scrape and print, no database writes.
                    log(f"🔬 Single player mode: {args.single_player}")
                    data = scrape_player(page, args.single_player, 'unknown')
                    log(f" Data: {json.dumps(data, default=str, ensure_ascii=False)[:500]}")
                    return

                for klub in klubovi:
                    try:
                        _harvest_klub(conn, page, klub, stats, args.max_players)
                    except Exception as e:
                        stats['errors'] += 1
                        log(f" ❌ Klub {klub['id']}: {e}\n{traceback.format_exc()[:500]}")
            finally:
                browser.close()

        summary = f"✅ HNS Harvester done. Klubova: {stats['klubova']}, Players: {stats['players_scraped']}, Seasons: {stats['seasons_upserted']}, Errors: {stats['errors']}"
        log(summary, telegram=True)
    finally:
        conn.close()


if __name__ == '__main__':
    main()