#!/usr/bin/env python3
"""
playwright_scraper.py — JS-aware fallback scraper for enrichment v3
Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
Date: 2026-05-04
Version: 1.0.0
Most Croatian sport-federation websites render fine with plain HTTP, but a
handful (single-page apps, lazy-loaded content) need a real browser. This
module wraps Playwright so the enrichment pipeline can fall back to a JS
render when the cheap urllib path returns a thin/empty page.
Public surface
--------------
fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict | None
Returns {url, title, text, html_len, fetched_at}; the raw HTML is not
returned, only its length. Returns None on fatal browser/launch errors
(caller must treat as missing source).
scrape_sport_pgz_klub(naziv) -> dict | None
Convenience wrapper for sport-pgz.hr — runs the search query, follows
the first article hit, and returns the rendered text + URL.
scrape_federation(homepage, naziv) -> dict | None
Generic federation site scraper: opens the homepage, performs a naive
in-page text search for the entity name, returns the rendered first
page (or follows the first link whose text contains the name).
The module is import-safe even when playwright is missing — every public
function returns None instead of crashing.
"""
from __future__ import annotations
import re
import time
import urllib.parse
from typing import Optional
try:
from playwright.sync_api import sync_playwright
HAS_PLAYWRIGHT = True
except Exception:
HAS_PLAYWRIGHT = False
# User-Agent string used by this module's scrapers: each one passes it as
# `user_agent=` to `browser.new_context(...)`.
UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright"
def _strip_html(s: str) -> str:
s = re.sub(r"", " ", s or "", flags=re.S | re.I)
s = re.sub(r"", " ", s, flags=re.S | re.I)
s = re.sub(r"<[^>]+>", " ", s)
return re.sub(r"\s+", " ", s).strip()
def fetch_rendered(url: str, *, timeout_ms: int = 12000,
                   wait_until: str = "networkidle") -> Optional[dict]:
    """Render `url` with headless Chromium and return its title and text.

    Caller should treat None as 'JS render unavailable, fall back to plain HTTP'.

    Args:
        url: Absolute ``http://`` or ``https://`` URL to render.
        timeout_ms: Used both as the page's default timeout and as the
            navigation timeout, in milliseconds.
        wait_until: Load state passed to ``page.goto``.

    Returns:
        A dict with keys ``url``, ``title`` (at most 300 chars), ``text``
        (tag-stripped HTML, at most 12000 chars), ``html_len`` and
        ``fetched_at`` (Unix time in seconds). None when `url` is not an
        http(s) URL, Playwright is not importable, or the browser fails.
    """
    # Validate the input before touching the browser stack. A bare
    # startswith("http") would also accept schemes such as "httpx://".
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(timeout_ms)
                try:
                    page.goto(url, wait_until=wait_until, timeout=timeout_ms)
                except Exception:
                    # Deliberate best effort: use whatever was rendered
                    # before the timeout/navigation error.
                    pass
                title = page.title() or ""
                html = page.content()
            finally:
                # Close the browser on every path, including errors; this
                # also closes the pages opened from it.
                browser.close()
        text = _strip_html(html)[:12000]
        return {
            "url": url,
            "title": title[:300],
            "text": text,
            "html_len": len(html or ""),
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Launch/render failure: signal "no JS render available".
        return None
def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]:
    """Search sport-pgz.hr for `naziv` and return the rendered first hit.

    Opens the site's ``?s=`` search page, takes the first
    ``article a[rel='bookmark']`` link and renders the page it points to.

    Args:
        naziv: Name to search for. Blank or non-string values yield None.

    Returns:
        A dict with keys ``source``, ``url``, ``title`` (at most 300 chars),
        ``extract`` (first 600 chars of the text), ``raw_text`` (at most
        8000 chars) and ``fetched_at`` (Unix time in seconds). None when the
        name is blank, Playwright is not importable, the search has no hit,
        or the browser fails.
    """
    # A whitespace-only name would run an empty search and return an
    # unrelated first article, so treat it like an empty name.
    if not isinstance(naziv, str) or not naziv.strip():
        return None
    if not HAS_PLAYWRIGHT:
        return None
    q = urllib.parse.quote(naziv.strip())
    search_url = f"https://sport-pgz.hr/?s={q}"
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(12000)
                page.goto(search_url, wait_until="networkidle")
                try:
                    link = page.locator("article a[rel='bookmark']").first
                    href = link.get_attribute("href", timeout=4000)
                except Exception:
                    # No matching link within the timeout: no search hit.
                    href = None
                if not href:
                    return None
                # Resolve relative links against the page they were found on;
                # absolute hrefs pass through unchanged.
                href = urllib.parse.urljoin(page.url, href)
                page.goto(href, wait_until="networkidle")
                title = page.title() or ""
                html = page.content()
            finally:
                # Close the browser on every path (no hit, error, success).
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": "sport-pgz.hr",
            "url": href,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Launch/navigation failure: caller treats None as a missing source.
        return None
def scrape_federation(homepage: str, naziv: str) -> Optional[dict]:
    """Open `homepage` and follow the first link whose text contains `naziv`.

    Only the first 30 characters of the (stripped) name are used for the
    link-text match. If no usable http(s) link is found, the rendered
    homepage itself is returned.

    Args:
        homepage: URL of the federation site to open.
        naziv: Entity name to look for in link texts.

    Returns:
        A dict with keys ``source`` (host name of the page, or the URL if it
        has none), ``url``, ``title`` (at most 300 chars), ``extract`` (first
        600 chars of the text), ``raw_text`` (at most 8000 chars) and
        ``fetched_at`` (Unix time in seconds). None when an argument is
        blank, Playwright is not importable, or the browser fails.
    """
    if not (isinstance(homepage, str) and isinstance(naziv, str)):
        return None
    needle = naziv.strip()[:30]
    if not homepage or not needle:
        return None
    if not HAS_PLAYWRIGHT:
        return None
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(12000)
                page.goto(homepage, wait_until="networkidle")
                try:
                    # Pass the name as `has_text` instead of interpolating it
                    # into the selector: a name containing a double quote
                    # would otherwise produce a malformed selector.
                    anchors = page.locator("a", has_text=needle)
                    href = None
                    if anchors.count() > 0:
                        href = anchors.first.get_attribute("href", timeout=2000)
                except Exception:
                    href = None
                target = homepage
                if href:
                    candidate = urllib.parse.urljoin(page.url, href)
                    # Follow only web links; mailto:/javascript:/tel: hrefs
                    # fall back to the homepage rather than failing the scrape.
                    if urllib.parse.urlparse(candidate).scheme in ("http", "https"):
                        target = candidate
                if target != homepage:
                    # The homepage is already rendered, so navigate again
                    # only when there is a different page to follow.
                    page.goto(target, wait_until="networkidle")
                title = page.title() or ""
                html = page.content()
            finally:
                # Close the browser on every path, including errors.
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": urllib.parse.urlparse(target).hostname or target,
            "url": target,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Launch/navigation failure: caller treats None as a missing source.
        return None
# ─── self-test ───────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: look up the club named on the command line (or a
    # built-in sample name) and pretty-print the scrape result as JSON.
    import json
    import sys

    cli_args = sys.argv[1:]
    club_name = cli_args[0] if cli_args else "Košarkaški klub Kvarner 2010"
    print("playwright available:", HAS_PLAYWRIGHT)
    scraped = scrape_sport_pgz_klub(club_name)
    payload = scraped or {"none": True}
    print(json.dumps(payload, ensure_ascii=False, indent=2))