pgz-sport/enrichment/playwright_scraper.py

#!/usr/bin/env python3
"""
playwright_scraper.py — JS-aware fallback scraper for enrichment v3
Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
Date:   2026-05-04
Version: 1.0.0

Most Croatian sport-federation websites render fine with plain HTTP, but a
handful (single-page apps, lazy-loaded content) need a real browser. This
module wraps Playwright so the enrichment pipeline can fall back to a JS
render when the cheap urllib path returns a thin/empty page.

Public surface
--------------
fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict | None
    Returns {url, title, text, html_len, fetched_at}. Returns None when
    Playwright is unavailable, the URL is not http(s), or the browser
    fails (caller must treat as missing source).

scrape_sport_pgz_klub(naziv) -> dict | None
    Convenience wrapper for sport-pgz.hr — runs the search query, follows
    the first article hit, and returns the rendered text + URL.

scrape_federation(homepage, naziv) -> dict | None
    Generic federation site scraper: opens the homepage and looks for the
    first link whose text contains the entity name. If such a link exists
    it is followed and that rendered page is returned; otherwise the
    rendered homepage itself is returned.

The module is import-safe even when playwright is missing — every public
function returns None instead of crashing.
"""
from __future__ import annotations
import re
import time
import urllib.parse
from typing import Optional

try:
    from playwright.sync_api import sync_playwright
    HAS_PLAYWRIGHT = True
except Exception:
    HAS_PLAYWRIGHT = False


UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright"


def _strip_html(s: str) -> str:
    s = re.sub(r"<script[^>]*>.*?</script>", " ", s or "", flags=re.S | re.I)
    s = re.sub(r"<style[^>]*>.*?</style>",   " ", s, flags=re.S | re.I)
    s = re.sub(r"<[^>]+>", " ", s)
    return re.sub(r"\s+", " ", s).strip()


def fetch_rendered(url: str, *, timeout_ms: int = 12000,
                   wait_until: str = "networkidle") -> Optional[dict]:
    """Render `url` with headless Chromium and return its title and text.

    Caller should treat None as 'JS render unavailable, fall back to plain HTTP'.

    Args:
        url: Absolute ``http://`` or ``https://`` URL to load.
        timeout_ms: Default timeout (milliseconds) for page operations,
            also used as the navigation timeout.
        wait_until: Load state handed to ``page.goto``.

    Returns:
        A dict with the keys ``url`` (as passed in), ``title`` (at most 300
        characters), ``text`` (tag-stripped HTML, at most 12000 characters),
        ``html_len`` (length of the rendered HTML) and ``fetched_at`` (Unix
        timestamp in seconds).

        None when the URL is empty or not http(s), when playwright is not
        installed, when navigation fails for any reason other than a
        timeout, or when the browser itself errors out.
    """
    # Cheap input validation first; the scheme check is case-insensitive and
    # requires the "://" so strings like "httpfoo" are rejected up front.
    if not url or not url.lower().startswith(("http://", "https://")):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    # Imported lazily: only reachable when playwright is installed.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(timeout_ms)
                try:
                    page.goto(url, wait_until=wait_until, timeout=timeout_ms)
                except PlaywrightTimeoutError:
                    # Best effort: keep whatever was rendered before the
                    # timeout. Any other navigation error (DNS failure,
                    # refused connection, ...) propagates to the outer
                    # handler so that an error/blank page is never reported
                    # as a successful fetch.
                    pass
                title = page.title() or ""
                html = page.content()
            finally:
                # Closing the browser also disposes of its contexts/pages,
                # and runs on every exit path.
                browser.close()
        text = _strip_html(html)[:12000]
        return {
            "url": url,
            "title": title[:300],
            "text": text,
            "html_len": len(html or ""),
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Module contract: browser problems surface as None, never as a crash.
        return None


def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]:
    """Search sport-pgz.hr for `naziv` and return the rendered first hit.

    The search page ``https://sport-pgz.hr/?s=<naziv>`` is opened, the first
    ``article a[rel='bookmark']`` link is followed and the target page is
    reduced to text.

    Args:
        naziv: Entity (club) name to search for.

    Returns:
        A dict with the keys ``source`` (always ``"sport-pgz.hr"``), ``url``
        (absolute URL of the followed hit), ``title`` (at most 300
        characters), ``extract`` (first 600 characters of the text),
        ``raw_text`` (at most 8000 characters) and ``fetched_at`` (Unix
        timestamp in seconds).

        None when `naziv` is empty/blank, playwright is not installed, the
        search yields no hit, or any browser/navigation error occurs.
    """
    if not naziv or not naziv.strip():
        return None
    if not HAS_PLAYWRIGHT:
        return None
    # Imported lazily: only reachable when playwright is installed.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    def _open(page, url):
        """Navigate to `url`; DOM load is required, network idle is best effort."""
        page.goto(url, wait_until="domcontentloaded")
        try:
            page.wait_for_load_state("networkidle")
        except PlaywrightTimeoutError:
            # Pages that keep connections open never go idle; the DOM is
            # already loaded at this point, so carry on with what we have.
            pass

    q = urllib.parse.quote(naziv.strip())
    search_url = f"https://sport-pgz.hr/?s={q}"
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(12000)
                _open(page, search_url)
                link = page.locator("article a[rel='bookmark']").first
                try:
                    href = link.get_attribute("href", timeout=4000)
                except Exception:
                    # No matching link appeared in time: treat as "no hit".
                    href = None
                if not href:
                    return None
                # Resolve relative links against the page they were found on.
                href = urllib.parse.urljoin(page.url or search_url, href)
                _open(page, href)
                title = page.title() or ""
                html = page.content()
            finally:
                # Runs on every exit path, including the early "no hit" return.
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": "sport-pgz.hr",
            "url": href,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Module contract: browser problems surface as None, never as a crash.
        return None


def scrape_federation(homepage: str, naziv: str) -> Optional[dict]:
    """Open `homepage` and follow the first link whose text contains `naziv`.

    Only the first 30 characters of `naziv` are used for the link search.
    When no usable link is found, the rendered homepage itself is returned.

    Args:
        homepage: Absolute http(s) URL of the federation site.
        naziv: Entity name to look for in link texts.

    Returns:
        A dict with the keys ``source`` (hostname of the returned page, or
        the URL itself when it has no hostname), ``url`` (the page that was
        rendered), ``title`` (at most 300 characters), ``extract`` (first
        600 characters of the text), ``raw_text`` (at most 8000 characters)
        and ``fetched_at`` (Unix timestamp in seconds).

        None when an argument is empty, `homepage` is not http(s),
        playwright is not installed, or any browser/navigation error occurs.
    """
    if not (homepage and naziv):
        return None
    # A non-http(s) homepage cannot be navigated to; bail out before
    # paying for a browser launch.
    if not homepage.lower().startswith(("http://", "https://")):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    # Imported lazily: only reachable when playwright is installed.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    def _open(page, url):
        """Navigate to `url`; DOM load is required, network idle is best effort."""
        page.goto(url, wait_until="domcontentloaded")
        try:
            page.wait_for_load_state("networkidle")
        except PlaywrightTimeoutError:
            # Pages that keep connections open never go idle; the DOM is
            # already loaded at this point, so carry on with what we have.
            pass

    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(12000)
                _open(page, homepage)
                # Pass the name through the `has_text` option instead of
                # splicing it into the selector string: names containing
                # quotes would otherwise produce an invalid selector.
                anchors = page.locator("a", has_text=naziv[:30])
                try:
                    href = (anchors.first.get_attribute("href", timeout=2000)
                            if anchors.count() > 0 else None)
                except Exception:
                    # Link lookup is best effort; fall back to the homepage.
                    href = None
                target = homepage
                if href:
                    # Resolve against the URL actually loaded (after any
                    # redirects) and ignore pure in-page fragments.
                    current = urllib.parse.urldefrag(page.url or homepage).url
                    candidate = urllib.parse.urldefrag(
                        urllib.parse.urljoin(current, href)
                    ).url
                    # Skip mailto:/javascript:/tel: style links and links
                    # that point back at the page already loaded.
                    if (candidate.lower().startswith(("http://", "https://"))
                            and candidate != current):
                        target = candidate
                        _open(page, target)
                title = page.title() or ""
                html = page.content()
            finally:
                # Runs on every exit path.
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": urllib.parse.urlparse(target).hostname or target,
            "url": target,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Module contract: browser problems surface as None, never as a crash.
        return None


# ─── self-test ───────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: look up one club on sport-pgz.hr and dump the result.
    import json
    import sys

    # The club name comes from the first CLI argument, else a sample club.
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else "Košarkaški klub Kvarner 2010"
    print("playwright available:", HAS_PLAYWRIGHT)

    outcome = scrape_sport_pgz_klub(target)
    if not outcome:
        # Keep the output valid JSON even when nothing was scraped.
        outcome = {"none": True}
    print(json.dumps(outcome, ensure_ascii=False, indent=2))