#!/usr/bin/env python3 """ playwright_scraper.py — JS-aware fallback scraper for enrichment v3 Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com Date: 2026-05-04 Version: 1.0.0 Most Croatian sport-federation websites render fine with plain HTTP, but a handful (single-page apps, lazy-loaded content) need a real browser. This module wraps Playwright so the enrichment pipeline can fall back to a JS render when the cheap urllib path returns a thin/empty page. Public surface -------------- fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict Returns {url, status, title, text, html, fetched_at}. Returns None on fatal browser/launch errors (caller must treat as missing source). scrape_sport_pgz_klub(naziv) -> dict | None Convenience wrapper for sport-pgz.hr — runs the search query, follows the first article hit, and returns the rendered text + URL. scrape_federation(homepage, naziv) -> dict | None Generic federation site scraper: opens the homepage, performs a naive in-page text search for the entity name, returns the rendered first page (or follows the first link whose text contains the name). The module is import-safe even when playwright is missing — every public function returns None instead of crashing. """ from __future__ import annotations import re import time import urllib.parse from typing import Optional try: from playwright.sync_api import sync_playwright HAS_PLAYWRIGHT = True except Exception: HAS_PLAYWRIGHT = False UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright" def _strip_html(s: str) -> str: s = re.sub(r"]*>.*?", " ", s or "", flags=re.S | re.I) s = re.sub(r"]*>.*?", " ", s, flags=re.S | re.I) s = re.sub(r"<[^>]+>", " ", s) return re.sub(r"\s+", " ", s).strip() def fetch_rendered(url: str, *, timeout_ms: int = 12000, wait_until: str = "networkidle") -> Optional[dict]: """Render `url` with Chromium, return text + html. Caller should treat None as 'JS render unavailable, fall back to plain HTTP'. """ if not HAS_PLAYWRIGHT or not url or not url.startswith("http"): return None try: with sync_playwright() as pw: browser = pw.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context(user_agent=UA, locale="hr-HR") page = ctx.new_page() page.set_default_timeout(timeout_ms) try: page.goto(url, wait_until=wait_until, timeout=timeout_ms) except Exception: pass # use whatever was rendered before timeout title = page.title() or "" html = page.content() text = _strip_html(html)[:12000] ctx.close(); browser.close() return { "url": url, "title": title[:300], "text": text, "html_len": len(html or ""), "fetched_at": int(time.time()), } except Exception: return None def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]: """Search sport-pgz.hr for `naziv` and return the rendered first hit.""" if not naziv: return None if not HAS_PLAYWRIGHT: return None q = urllib.parse.quote(naziv) search_url = f"https://sport-pgz.hr/?s={q}" try: with sync_playwright() as pw: browser = pw.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context(user_agent=UA, locale="hr-HR") page = ctx.new_page() page.set_default_timeout(12000) page.goto(search_url, wait_until="networkidle") link = page.locator("article a[rel='bookmark']").first try: href = link.get_attribute("href", timeout=4000) except Exception: href = None if not href: ctx.close(); browser.close() return None page.goto(href, wait_until="networkidle") title = page.title() or "" html = page.content() text = _strip_html(html)[:8000] ctx.close(); browser.close() return { "source": "sport-pgz.hr", "url": href, "title": title[:300], "extract": text[:600], "raw_text": text, "fetched_at": int(time.time()), } except Exception: return None def scrape_federation(homepage: str, naziv: str) -> Optional[dict]: """Open `homepage`, follow the first link whose text contains `naziv`.""" if not (homepage and naziv): return None if not HAS_PLAYWRIGHT: return None try: with sync_playwright() as pw: browser = pw.chromium.launch(headless=True, args=["--no-sandbox"]) ctx = browser.new_context(user_agent=UA, locale="hr-HR") page = ctx.new_page() page.set_default_timeout(12000) page.goto(homepage, wait_until="networkidle") anchors = page.locator( f"a:has-text(\"{naziv[:30]}\")" ) href = None try: if anchors.count() > 0: href = anchors.first.get_attribute("href", timeout=2000) except Exception: href = None if href and not href.startswith("http"): href = urllib.parse.urljoin(homepage, href) target = href or homepage page.goto(target, wait_until="networkidle") title = page.title() or "" html = page.content() text = _strip_html(html)[:8000] ctx.close(); browser.close() return { "source": urllib.parse.urlparse(target).hostname or target, "url": target, "title": title[:300], "extract": text[:600], "raw_text": text, "fetched_at": int(time.time()), } except Exception: return None # ─── self-test ─────────────────────────────────────────────────────────── if __name__ == "__main__": import json, sys target = sys.argv[1] if len(sys.argv) > 1 else "Košarkaški klub Kvarner 2010" print("playwright available:", HAS_PLAYWRIGHT) print(json.dumps(scrape_sport_pgz_klub(target) or {"none": True}, ensure_ascii=False, indent=2))