84f1c41008
- enrichment/playwright_scraper.py: fetch_rendered(), scrape_sport_pgz_klub(), scrape_federation(). Headless Chromium, 12s timeout, returns rendered text. Import-safe when playwright is missing. - enrich_router._sport_pgz_search() now falls back to the JS path when the cheap urllib fetch returns empty or unparseable HTML. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
171 lines
6.4 KiB
Python
171 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
playwright_scraper.py — JS-aware fallback scraper for enrichment v3
|
|
Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
|
|
Date: 2026-05-04
|
|
Version: 1.0.0
|
|
|
|
Most Croatian sport-federation websites render fine with plain HTTP, but a
|
|
handful (single-page apps, lazy-loaded content) need a real browser. This
|
|
module wraps Playwright so the enrichment pipeline can fall back to a JS
|
|
render when the cheap urllib path returns a thin/empty page.
|
|
|
|
Public surface
|
|
--------------
|
|
fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict
|
|
Returns {url, title, text, html_len, fetched_at}. Returns None on
|
|
fatal browser/launch errors (caller must treat as missing source).
|
|
|
|
scrape_sport_pgz_klub(naziv) -> dict | None
|
|
Convenience wrapper for sport-pgz.hr — runs the search query, follows
|
|
the first article hit, and returns the rendered text + URL.
|
|
|
|
scrape_federation(homepage, naziv) -> dict | None
|
|
Generic federation site scraper: opens the homepage, performs a naive
|
|
in-page text search for the entity name, returns the rendered first
|
|
page (or follows the first link whose text contains the name).
|
|
|
|
The module is import-safe even when playwright is missing — every public
|
|
function returns None instead of crashing.
|
|
"""
|
|
from __future__ import annotations
|
|
import re
|
|
import time
|
|
import urllib.parse
|
|
from typing import Optional
|
|
|
|
try:
|
|
from playwright.sync_api import sync_playwright
|
|
HAS_PLAYWRIGHT = True
|
|
except Exception:
|
|
HAS_PLAYWRIGHT = False
|
|
|
|
|
|
UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright"
|
|
|
|
|
|
def _strip_html(s: str) -> str:
|
|
s = re.sub(r"<script[^>]*>.*?</script>", " ", s or "", flags=re.S | re.I)
|
|
s = re.sub(r"<style[^>]*>.*?</style>", " ", s, flags=re.S | re.I)
|
|
s = re.sub(r"<[^>]+>", " ", s)
|
|
return re.sub(r"\s+", " ", s).strip()
|
|
|
|
|
|
def fetch_rendered(url: str, *, timeout_ms: int = 12000,
                   wait_until: str = "networkidle") -> Optional[dict]:
    """Render `url` with headless Chromium and return its title and text.

    Caller should treat None as 'JS render unavailable, fall back to plain HTTP'.

    Args:
        url: Absolute ``http://`` or ``https://`` URL to load.
        timeout_ms: Playwright default timeout for the page, in milliseconds;
            also used as the navigation timeout.
        wait_until: Load state that ``page.goto`` waits for.

    Returns:
        A dict with keys ``url``, ``title`` (at most 300 chars), ``text``
        (tag-stripped page text, at most 12000 chars), ``html_len`` (length
        of the rendered HTML) and ``fetched_at`` (Unix time in seconds).
        ``None`` when `url` is not an http(s) URL string, Playwright is not
        installed, or launching the browser / reading the page fails.
    """
    # Validate the URL before touching Playwright so bad input is rejected
    # cheaply and never reaches the browser.
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                try:
                    page = ctx.new_page()
                    page.set_default_timeout(timeout_ms)
                    try:
                        page.goto(url, wait_until=wait_until, timeout=timeout_ms)
                    except Exception:
                        # Deliberate best effort: use whatever was rendered
                        # before the timeout / navigation error.
                        pass
                    title = page.title() or ""
                    html = page.content()
                finally:
                    # Close even when title()/content() raise, so a failed
                    # read does not leave the context open.
                    ctx.close()
            finally:
                browser.close()
        text = _strip_html(html)[:12000]
        return {
            "url": url,
            "title": title[:300],
            "text": text,
            "html_len": len(html or ""),
            "fetched_at": int(time.time()),
        }
    except Exception:
        return None
|
|
|
|
|
|
def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]:
    """Search sport-pgz.hr for `naziv` and return the rendered first hit.

    Args:
        naziv: Club name used as the site-search query.

    Returns:
        A dict with keys ``source`` (always ``"sport-pgz.hr"``), ``url`` (the
        article link that was followed), ``title`` (at most 300 chars),
        ``extract`` (first 600 chars of the page text), ``raw_text``
        (tag-stripped text, at most 8000 chars) and ``fetched_at`` (Unix time
        in seconds). ``None`` when `naziv` is empty, Playwright is not
        installed, the search page exposes no article link, or any
        browser/navigation step fails.
    """
    if not naziv:
        return None
    if not HAS_PLAYWRIGHT:
        return None
    search_url = f"https://sport-pgz.hr/?s={urllib.parse.quote(naziv)}"
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                try:
                    page = ctx.new_page()
                    page.set_default_timeout(12000)
                    page.goto(search_url, wait_until="networkidle")
                    link = page.locator("article a[rel='bookmark']").first
                    try:
                        href = link.get_attribute("href", timeout=4000)
                    except Exception:
                        # No matching article link within 4 s: treat as no hit.
                        href = None
                    if not href:
                        return None
                    if not href.startswith(("http://", "https://")):
                        # page.goto needs an absolute URL; resolve relative
                        # links against the search page.
                        href = urllib.parse.urljoin(search_url, href)
                    page.goto(href, wait_until="networkidle")
                    title = page.title() or ""
                    html = page.content()
                finally:
                    # Runs on the early "no hit" return and on errors alike.
                    ctx.close()
            finally:
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": "sport-pgz.hr",
            "url": href,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        return None
|
|
|
|
|
|
def scrape_federation(homepage: str, naziv: str) -> Optional[dict]:
    """Open `homepage`, follow the first link whose text contains `naziv`.

    Only the first 30 characters of `naziv` are used for the link-text match.
    When no usable http(s) link is found, the rendered homepage itself is
    returned.

    Args:
        homepage: URL of the federation site to open.
        naziv: Entity name to look for in anchor text.

    Returns:
        A dict with keys ``source`` (hostname of the page that was read, or
        the URL itself when it has no hostname), ``url``, ``title`` (at most
        300 chars), ``extract`` (first 600 chars of the page text),
        ``raw_text`` (tag-stripped text, at most 8000 chars) and
        ``fetched_at`` (Unix time in seconds). ``None`` when either argument
        is empty, Playwright is not installed, or any browser/navigation step
        fails.
    """
    if not (homepage and naziv):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                try:
                    page = ctx.new_page()
                    page.set_default_timeout(12000)
                    page.goto(homepage, wait_until="networkidle")
                    # Pass the name as data via has_text instead of splicing
                    # it into a :has-text("...") selector: a name containing
                    # a double quote would otherwise break the selector and
                    # the link would silently never be found.
                    anchors = page.locator("a", has_text=naziv[:30])
                    href = None
                    try:
                        if anchors.count() > 0:
                            href = anchors.first.get_attribute("href", timeout=2000)
                    except Exception:
                        href = None
                    target = homepage
                    if href:
                        if not href.startswith(("http://", "https://")):
                            href = urllib.parse.urljoin(homepage, href)
                        # Ignore mailto:/javascript:/tel: style links, which
                        # cannot be navigated to; stay on the homepage.
                        if href.startswith(("http://", "https://")):
                            target = href
                    if target != homepage:
                        # The homepage is already rendered; navigate only
                        # when a different page has to be loaded.
                        page.goto(target, wait_until="networkidle")
                    title = page.title() or ""
                    html = page.content()
                finally:
                    ctx.close()
            finally:
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": urllib.parse.urlparse(target).hostname or target,
            "url": target,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        return None
|
|
|
|
|
|
# ─── self-test ───────────────────────────────────────────────────────────
|
|
if __name__ == "__main__":
    import json
    import sys

    # The first command-line argument is the club name to look up; without
    # one, a sample club name is used.
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else "Košarkaški klub Kvarner 2010"
    print("playwright available:", HAS_PLAYWRIGHT)
    # A missing result is printed as {"none": true} so the output is always
    # valid JSON.
    result = scrape_sport_pgz_klub(target)
    print(json.dumps(result or {"none": True}, ensure_ascii=False, indent=2))
|