pgz-sport/scrapers/hks_hos_hbs_pw.py

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# File: hks_playwright.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/hks_playwright.py
# Purpose: Cloudflare bypass for hks.hr (basketball), hos.hr (volleyball) and hbs.hr (bocce) via headless Playwright
# ═══════════════════════════════════════════════════════════════════
"""HKS scraper — Playwright with stealth tricks."""
import os, re, time, json, hashlib, sys
from urllib.parse import urljoin, urlparse
import psycopg2
from psycopg2.extras import execute_batch
from playwright.sync_api import sync_playwright

# Connection string passed to psycopg2.connect() in crawl().
# DB_PASSWORD is read while this module is being imported, so a missing
# variable raises KeyError before any scraping starts.
# NOTE(review): the password is interpolated unquoted — a value containing
# spaces or quotes would presumably break DSN parsing; confirm it never does.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# Substrings (region and place names) whose presence marks a page as
# PGŽ-relevant. Matching is case-sensitive and purely substring-based.
PGZ_TOKENS = [
    "Rijeka",
    "PGŽ",
    "Primorsko-goransk",
    "Kvarner",
    "Opatija",
    "Crikvenica",
    "Krk",
    "Cres",
    "Lošinj",
    "Rab",
    "Bakar",
    "Kostrena",
    "Kantrida",
    "Trsat",
    "Mlaka",
    "Viškovo",
    "Kastav",
]


def is_pgz_relevant(text):
    """Return True when *text* contains at least one entry of PGZ_TOKENS."""
    for token in PGZ_TOKENS:
        if token in text:
            return True
    return False


def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned whole as a one-element list, without
    any length filtering. Longer text is cut greedily: each cut is pulled back
    to just after a sentence or line separator when one lies in the second
    half of the current window. Pieces are stripped, and pieces of 80
    characters or fewer (typically a short tail) are discarded.

    Args:
        text: The text to split.
        max_len: Maximum length of a piece; must be positive.

    Returns:
        List of text pieces, in order. Empty text yields an empty list.

    Raises:
        ValueError: If *text* is non-empty and *max_len* is not positive;
            the split window could never advance in that case.
    """
    if not text:
        return []
    if max_len <= 0:
        # Without this guard the loop below never advances and appends forever.
        raise ValueError(f"max_len must be positive, got {max_len!r}")
    if len(text) <= max_len:
        return [text]

    pieces = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        if end < len(text):
            # Separators are tried in priority order; the first one found past
            # the midpoint of the window wins, so pieces never get too short.
            for sep in (". ", "! ", "? ", "\n"):
                pos = text.rfind(sep, start, end)
                if pos > start + max_len // 2:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]


def upsert(conn, facts, source):
    """Insert scraped facts into dabi.knowledge, skipping ones already stored.

    Each fact is keyed by the MD5 hex digest of its text (``data_hash``); a
    row whose hash already exists is left untouched (ON CONFLICT DO NOTHING).
    This function never commits; the caller controls transaction handling.

    Args:
        conn: Open database connection providing ``cursor()``.
        facts: Iterable of dicts with a required ``"fact"`` string and
            optional ``"confidence"`` (default 0.85) and ``"url"`` (default "").
        source: Value stored in the ``source`` column for every row.

    Returns:
        Number of rows actually inserted. Returns 0 when *facts* is empty or
        when a database error occurs (the error is reported on stderr).
    """
    if not facts:
        return 0

    rows = []
    for f in facts:
        digest = hashlib.md5(f["fact"].encode()).hexdigest()
        refs = json.dumps({"url": f.get("url", ""), "scraped_via": "playwright"})
        rows.append((f["fact"], source, "pgz_sport_savez_pw",
                     f.get("confidence", 0.85), digest, refs))

    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""

    inserted = 0
    try:
        # Execute row by row and sum rowcount: after a batched execute the
        # cursor's rowcount only reflects the last statement, so it cannot be
        # used as an insert total.
        with conn.cursor() as cur:
            for row in rows:
                cur.execute(sql, row)
                inserted += max(cur.rowcount, 0)
    except Exception as exc:
        # Best effort: a failed write must not abort the crawl, but it must
        # not vanish silently either.
        print(f"  upsert failed for {source}: {str(exc)[:200]}", file=sys.stderr)
        return 0
    return inserted


# Seed URLs per source label. crawl() passes the label to upsert() as the
# `source` of every fact scraped from that site.
SAVEZI = {
    "savezi_hks_kosarka_pw": [
        "https://www.hks.hr/",
        "https://www.hks.hr/klubovi/",
        "https://www.hks.hr/natjecanja/",
    ],
    "savezi_hos_odbojka_pw": [
        "https://hos.hr/",
        "https://hos.hr/klubovi/",
    ],
    "savezi_hbs_bocanje_pw": [
        "https://hbs.hr/",
        "https://hbs.hr/klubovi/",
    ],
}


def _load_page_text(page, url):
    """Navigate *page* to *url* and return its visible text, or None if unusable.

    Returns None when navigation yields no response or when the page body has
    fewer than 200 characters of text.
    """
    response = page.goto(url, timeout=30000, wait_until="domcontentloaded")
    if not response:
        return None

    # Give a possible Cloudflare challenge time to resolve. Not reaching
    # "networkidle" within the timeout is not fatal; use what has loaded.
    try:
        page.wait_for_load_state("networkidle", timeout=8000)
    except Exception:
        pass

    title = page.title() or ""
    if "Just a moment" in title or "Attention Required" in title:
        print(f"  CF challenge: {url[:60]}")
        page.wait_for_timeout(8000)  # extra time for the challenge to clear

    text = page.evaluate("() => document.body.innerText || ''") or ""
    if len(text) < 200:
        print(f"  empty: {url[:60]}")
        return None
    return text


def _enqueue_same_host_links(page, url, seen, pending, limit, max_pending=100):
    """Queue same-host links from the currently loaded page (best effort).

    Args:
        page: Playwright page currently showing *url*.
        url: Address of the loaded page; only links on its hostname are kept.
        seen: Set of every URL queued so far; updated in place so a URL is
            never queued twice.
        pending: List of URLs waiting to be visited; extended in place.
        limit: Only the first *limit* anchors on the page are considered.
        max_pending: No URL is added once *pending* holds this many entries.
    """
    try:
        base_host = urlparse(url).hostname
        hrefs = page.evaluate("""
            () => Array.from(document.querySelectorAll('a'))
                .map(a => a.href).filter(h => h)
        """)
    except Exception as exc:
        # Link discovery is optional; report the failure and keep crawling.
        print(f"  link discovery failed {url[:50]}: {str(exc)[:100]}", file=sys.stderr)
        return

    for href in hrefs[:limit]:
        if len(pending) >= max_pending:
            break
        if not isinstance(href, str):
            continue  # e.g. anchors whose href is not a plain string
        # Strip the fragment BEFORE the duplicate check, so "page#a" and
        # "page#b" count as the same URL.
        target = href.split("#", 1)[0]
        if target in seen:
            continue
        try:
            same_host = urlparse(target).hostname == base_host
        except ValueError:
            continue  # malformed href; skip just this one
        if same_host:
            seen.add(target)
            pending.append(target)


def _crawl_source(ctx, conn, source, urls, max_pages=40):
    """Crawl one site breadth-first from its seed URLs and store relevant text.

    Args:
        ctx: Playwright browser context used to open the page.
        conn: Database connection handed to upsert().
        source: Source label stored with every fact.
        urls: Seed URLs to start from.
        max_pages: Maximum number of URLs to visit.

    Returns:
        Dict with "visited" (number of URLs attempted) and "facts" (sum of
        the values returned by upsert()).
    """
    visited = set()
    seen = set(urls)
    pending = list(urls)
    facts_total = 0

    page = ctx.new_page()
    try:
        while pending and len(visited) < max_pages:
            url = pending.pop(0)
            if url in visited:
                continue
            visited.add(url)

            try:
                text = _load_page_text(page, url)
                if text is None:
                    continue

                if is_pgz_relevant(text):
                    facts = [{"fact": c, "url": url, "confidence": 0.85}
                             for c in chunk(text, 800) if len(c) > 100]
                    n = upsert(conn, facts, source)
                    facts_total += n
                    print(f"  {url[:50]:50} → {n} facts (visit {len(visited)})")
                    link_limit = 80
                else:
                    # Irrelevant pages store nothing but may still link to
                    # relevant ones; fewer of their anchors are followed.
                    link_limit = 50

                _enqueue_same_host_links(page, url, seen, pending, link_limit)

                # Rate limit: pause after every loaded page, relevant or not.
                page.wait_for_timeout(800)

            except Exception as e:
                # One failing page must not stop the crawl of the whole site.
                print(f"  err {url[:50]}: {str(e)[:100]}")
    finally:
        page.close()

    return {"visited": len(visited), "facts": facts_total}


def crawl():
    """Crawl every site in SAVEZI with headless Chromium and store relevant text.

    Opens one database connection (autocommit) and one browser context with
    a few anti-automation tweaks, then crawls each source in turn. Progress
    and the final summary are printed to stdout.

    Returns:
        Dict mapping each source label to {"visited": int, "facts": int}.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    total = {}

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled",
                      "--disable-dev-shm-usage", "--disable-gpu"]
            )
            ctx = browser.new_context(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800},
                locale="hr-HR",
                extra_http_headers={"Accept-Language": "hr-HR,hr;q=0.9,en;q=0.8"},
            )
            # Stealth: hide webdriver
            ctx.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['hr-HR', 'hr', 'en']});
        """)

            for source, urls in SAVEZI.items():
                print(f"\n=== {source} ===")
                total[source] = _crawl_source(ctx, conn, source, urls)

            ctx.close()
            browser.close()
    finally:
        # Release the DB connection even if the browser fails to start.
        conn.close()

    print("\n=== TOTAL ===")
    print(json.dumps(total, default=str))
    return total


if __name__ == "__main__":
    # Script entry point: run the crawl and print its summary as JSON.
    summary = crawl()
    print(json.dumps(summary, default=str))