#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# File: hks_playwright.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/hks_playwright.py
# Purpose: Cloudflare bypass for hks.hr (basketball) via headless Playwright
# ═══════════════════════════════════════════════════════════════════
"""HKS scraper — Playwright with stealth tricks.

Crawls the seed URLs listed in ``SAVEZI`` with a headless Chromium, keeps
pages whose visible text mentions one of ``PGZ_TOKENS``, splits that text
into chunks and inserts the chunks into ``dabi.knowledge``.
"""
from dotenv import load_dotenv

# Kept ahead of the remaining imports and of DSN: DSN reads DB_PASSWORD from
# the environment at import time, so the .env file has to be loaded first.
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

import hashlib
import json
import os
import re
import sys
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import psycopg2
from psycopg2.extras import execute_batch
from playwright.sync_api import sync_playwright

# Raises KeyError at import time when DB_PASSWORD is not set.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# Case-sensitive substrings; a page is kept when its text contains any of them.
PGZ_TOKENS = ["Rijeka", "PGŽ", "Primorsko-goransk", "Kvarner", "Opatija",
              "Crikvenica", "Krk", "Cres", "Lošinj", "Rab", "Bakar", "Kostrena",
              "Kantrida", "Trsat", "Mlaka", "Viškovo", "Kastav"]

# Preferred cut points for chunk(), tried in this order.
_CHUNK_SEPARATORS = (". ", "! ", "? ", "\n")

# Crawl limits (per entry in SAVEZI).
MAX_PAGES_PER_SOURCE = 40
MAX_QUEUE_LEN = 100
RATE_LIMIT_MS = 800

# Injected into every page before its own scripts run, to mask the most
# common headless-automation fingerprints.
_STEALTH_INIT_SCRIPT = """
    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
    Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
    Object.defineProperty(navigator, 'languages', {get: () => ['hr-HR', 'hr', 'en']});
"""

# Returns the resolved href of every anchor on the page, dropping empty ones.
_LINKS_JS = """
    () => Array.from(document.querySelectorAll('a'))
        .map(a => a.href).filter(h => h)
"""


def is_pgz_relevant(text: str) -> bool:
    """Return True when *text* contains at least one of ``PGZ_TOKENS``.

    The match is a plain, case-sensitive substring test.
    """
    return any(token in text for token in PGZ_TOKENS)


def chunk(text: str, max_len: int = 800) -> list:
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned as a single-element list (or an empty
    list for empty text). Longer text is cut preferably right after a
    sentence separator found in the second half of the current window;
    otherwise it is cut hard at *max_len*. When splitting, pieces of 80
    characters or fewer (after stripping) are discarded.

    Raises:
        ValueError: if *max_len* is not positive (the window could never
            advance).
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")
    if len(text) <= max_len:
        return [text] if text else []

    pieces = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        if end < len(text):
            for sep in _CHUNK_SEPARATORS:
                cut = text.rfind(sep, start, end)
                # Only accept a separator in the second half of the window so
                # that chunks do not become needlessly short.
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]


def upsert(conn, facts, source) -> int:
    """Insert *facts* into ``dabi.knowledge`` and return how many were new.

    Args:
        conn: open psycopg2 connection.
        facts: iterable of dicts with a ``"fact"`` key and optional
            ``"confidence"`` (default 0.85) and ``"url"`` keys.
        source: value stored in the ``source`` column.

    Returns:
        Number of rows actually inserted. Rows whose ``data_hash`` (MD5 of the
        fact text) already exists are skipped by ``ON CONFLICT DO NOTHING``
        and are not counted. Database errors are reported on stderr and do
        not propagate; the count inserted before the error is returned.
    """
    if not facts:
        return 0

    rows = []
    for fact in facts:
        # MD5 is used only as a de-duplication key, not for security.
        digest = hashlib.md5(fact["fact"].encode()).hexdigest()
        refs = json.dumps({"url": fact.get("url", ""), "scraped_via": "playwright"})
        rows.append((fact["fact"], source, "pgz_sport_savez_pw",
                     fact.get("confidence", 0.85), digest, refs))

    sql = """INSERT INTO dabi.knowledge
                 (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb)
             ON CONFLICT (data_hash) DO NOTHING"""

    inserted = 0
    try:
        with conn.cursor() as cur:
            # Executed row by row on purpose: after execute_batch() the
            # cursor's rowcount does not hold a total, so the number of
            # inserted rows could not be reported correctly.
            for row in rows:
                cur.execute(sql, row)
                inserted += max(cur.rowcount, 0)
    except Exception as exc:
        # Best-effort boundary: a failed insert must not abort the crawl.
        print(f" upsert failed ({source}): {str(exc)[:200]}", file=sys.stderr)
    return inserted


# Seed URLs per source. The key is passed to upsert() as the ``source`` value.
SAVEZI = {
    "savezi_hks_kosarka_pw": ["https://www.hks.hr/",
                              "https://www.hks.hr/klubovi/",
                              "https://www.hks.hr/natjecanja/"],
    "savezi_hos_odbojka_pw": ["https://hos.hr/", "https://hos.hr/klubovi/"],
    "savezi_hbs_bocanje_pw": ["https://hbs.hr/", "https://hbs.hr/klubovi/"],
}


def _enqueue_links(page, url, queue, seen, limit):
    """Queue up to *limit* same-host links of the current page.

    Only the first *limit* anchors are considered. Fragments are stripped
    before de-duplication, URLs already in *seen* are skipped, and nothing
    is added once the queue holds ``MAX_QUEUE_LEN`` entries.
    """
    try:
        hrefs = page.evaluate(_LINKS_JS)
    except Exception as exc:
        # Link discovery is best-effort; the page's facts are already stored.
        print(f" links failed {url[:50]}: {str(exc)[:100]}")
        return

    base_host = urlparse(url).hostname
    for href in (hrefs or [])[:limit]:
        if len(queue) >= MAX_QUEUE_LEN:
            break
        if not isinstance(href, str):
            # e.g. anchors whose href does not serialise to a plain string
            continue
        target = href.split("#")[0]
        if not target or target in seen:
            continue
        try:
            same_host = urlparse(target).hostname == base_host
        except ValueError:
            # Malformed URL: skip this one link instead of the whole page.
            continue
        if same_host:
            seen.add(target)
            queue.append(target)


def _scrape_loaded_page(page, conn, source, url, visit_no, queue, seen):
    """Extract, filter and store the text of an already navigated page.

    Returns the number of facts inserted for this page (0 for empty or
    non-relevant pages). Links are queued for relevant and non-relevant
    pages alike, but not for pages treated as empty.
    """
    try:
        # Gives a Cloudflare interstitial (if any) time to resolve itself.
        page.wait_for_load_state("networkidle", timeout=8000)
    except Exception:
        # Deliberate: pages that never go idle are still scraped as they are.
        pass

    title = page.title() or ""
    if "Just a moment" in title or "Attention Required" in title:
        print(f" CF challenge: {url[:60]}")
        page.wait_for_timeout(8000)  # extra time for the challenge to pass

    text = page.evaluate("() => document.body.innerText || ''") or ""
    if len(text) < 200:
        print(f" empty: {url[:60]}")
        return 0

    if not is_pgz_relevant(text):
        # Not stored, but its links may still lead to relevant pages.
        _enqueue_links(page, url, queue, seen, limit=50)
        return 0

    facts = [{"fact": piece, "url": url, "confidence": 0.85}
             for piece in chunk(text, 800) if len(piece) > 100]
    inserted = upsert(conn, facts, source)
    print(f" {url[:50]:50} → {inserted} facts (visit {visit_no})")
    _enqueue_links(page, url, queue, seen, limit=80)
    return inserted


def _crawl_source(ctx, conn, source, urls):
    """Breadth-first crawl of one source, starting from *urls*.

    Visits at most ``MAX_PAGES_PER_SOURCE`` pages and returns
    ``{"visited": <pages fetched>, "facts": <rows inserted>}``.
    """
    queue = deque()
    seen = set()  # every URL ever queued, so nothing is fetched twice
    for seed in urls:
        if seed not in seen:
            seen.add(seed)
            queue.append(seed)

    visited = 0
    facts_total = 0
    page = ctx.new_page()
    try:
        while queue and visited < MAX_PAGES_PER_SOURCE:
            url = queue.popleft()
            visited += 1
            try:
                response = page.goto(url, timeout=30000, wait_until="domcontentloaded")
                if response:
                    facts_total += _scrape_loaded_page(
                        page, conn, source, url, visited, queue, seen)
                # Rate limit after every fetch, not only after stored pages.
                page.wait_for_timeout(RATE_LIMIT_MS)
            except Exception as exc:
                # Per-URL boundary: one failing page must not stop the crawl.
                print(f" err {url[:50]}: {str(exc)[:100]}")
    finally:
        page.close()
    return {"visited": visited, "facts": facts_total}


def crawl():
    """Crawl every source in ``SAVEZI`` and store relevant text chunks.

    Opens one database connection (autocommit) and one headless Chromium
    context shared by all sources; both are released even when the crawl
    fails part-way.

    Returns:
        dict mapping each source name to ``{"visited": int, "facts": int}``.
        The same mapping is also printed as JSON.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = {}
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox",
                      "--disable-blink-features=AutomationControlled",
                      "--disable-dev-shm-usage",
                      "--disable-gpu"],
            )
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 800},
                    locale="hr-HR",
                    extra_http_headers={"Accept-Language": "hr-HR,hr;q=0.9,en;q=0.8"},
                )
                try:
                    # Stealth: hide webdriver
                    ctx.add_init_script(_STEALTH_INIT_SCRIPT)
                    for source, urls in SAVEZI.items():
                        print(f"\n=== {source} ===")
                        total[source] = _crawl_source(ctx, conn, source, urls)
                finally:
                    ctx.close()
            finally:
                browser.close()
    finally:
        conn.close()

    print("\n=== TOTAL ===")
    print(json.dumps(total, default=str))
    return total


if __name__ == "__main__":
    # crawl() already prints the totals; this second print is kept so the
    # script's stdout stays the same as before.
    print(json.dumps(crawl(), default=str))