M12.3: Playwright fallback scraper for JS-heavy federation sites
- enrichment/playwright_scraper.py: fetch_rendered(), scrape_sport_pgz_klub(), scrape_federation(). Headless Chromium, 12s timeout, returns rendered text. Import-safe when playwright is missing. - enrich_router._sport_pgz_search() now falls back to the JS path when the cheap urllib fetch returns empty or unparseable HTML. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,170 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
playwright_scraper.py — JS-aware fallback scraper for enrichment v3
|
||||||
|
Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
|
||||||
|
Date: 2026-05-04
|
||||||
|
Version: 1.0.0
|
||||||
|
|
||||||
|
Most Croatian sport-federation websites render fine with plain HTTP, but a
|
||||||
|
handful (single-page apps, lazy-loaded content) need a real browser. This
|
||||||
|
module wraps Playwright so the enrichment pipeline can fall back to a JS
|
||||||
|
render when the cheap urllib path returns a thin/empty page.
|
||||||
|
|
||||||
|
Public surface
|
||||||
|
--------------
|
||||||
|
fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict
|
||||||
|
Returns {url, title, text, html_len, fetched_at}. Returns None on
|
||||||
|
fatal browser/launch errors (caller must treat as missing source).
|
||||||
|
|
||||||
|
scrape_sport_pgz_klub(naziv) -> dict | None
|
||||||
|
Convenience wrapper for sport-pgz.hr — runs the search query, follows
|
||||||
|
the first article hit, and returns the rendered text + URL.
|
||||||
|
|
||||||
|
scrape_federation(homepage, naziv) -> dict | None
|
||||||
|
Generic federation site scraper: opens the homepage, performs a naive
|
||||||
|
in-page text search for the entity name, returns the rendered first
|
||||||
|
page (or follows the first link whose text contains the name).
|
||||||
|
|
||||||
|
The module is import-safe even when playwright is missing — every public
|
||||||
|
function returns None instead of crashing.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
HAS_PLAYWRIGHT = True
|
||||||
|
except Exception:
|
||||||
|
HAS_PLAYWRIGHT = False
|
||||||
|
|
||||||
|
|
||||||
|
UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright"
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_html(s: str) -> str:
|
||||||
|
s = re.sub(r"<script[^>]*>.*?</script>", " ", s or "", flags=re.S | re.I)
|
||||||
|
s = re.sub(r"<style[^>]*>.*?</style>", " ", s, flags=re.S | re.I)
|
||||||
|
s = re.sub(r"<[^>]+>", " ", s)
|
||||||
|
return re.sub(r"\s+", " ", s).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_rendered(url: str, *, timeout_ms: int = 12000,
                   wait_until: str = "networkidle") -> Optional[dict]:
    """Render `url` with headless Chromium and return its text.

    Args:
        url: Absolute http(s) URL to load.
        timeout_ms: Default page timeout and navigation timeout, in
            milliseconds.
        wait_until: Load state passed to ``page.goto``.

    Returns:
        A dict with the keys ``url``, ``title`` (max 300 chars), ``text``
        (tag-stripped, max 12000 chars), ``html_len`` and ``fetched_at``
        (Unix seconds), or None when the URL is not http(s), Playwright is
        not importable, or the browser fails.

    Caller should treat None as 'JS render unavailable, fall back to plain HTTP'.
    """
    # Validate the URL before consulting Playwright; a bare "http" prefix
    # test would also accept bogus schemes such as "httpx://".
    if not url or not url.lower().startswith(("http://", "https://")):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(timeout_ms)
                try:
                    page.goto(url, wait_until=wait_until, timeout=timeout_ms)
                except Exception:
                    pass  # use whatever was rendered before timeout
                title = page.title() or ""
                html = page.content()
            finally:
                # Always release the browser (and with it the context),
                # even when title()/content() raise.
                browser.close()
        text = _strip_html(html)[:12000]
        return {
            "url": url,
            "title": title[:300],
            "text": text,
            "html_len": len(html or ""),
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Fail-soft: any launch/render error means "no JS render available".
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]:
    """Search sport-pgz.hr for `naziv` and return the rendered first hit.

    Args:
        naziv: Name to search for. Empty or whitespace-only names are
            rejected, because an empty search would return an unrelated
            first article.

    Returns:
        A dict with the keys ``source``, ``url``, ``title`` (max 300 chars),
        ``extract`` (first 600 chars of the text), ``raw_text`` (max 8000
        chars) and ``fetched_at`` (Unix seconds); or None when the name is
        blank, Playwright is not importable, the search page has no
        ``article a[rel='bookmark']`` link, or any browser step fails.
    """
    if not naziv or not naziv.strip():
        return None
    if not HAS_PLAYWRIGHT:
        return None
    q = urllib.parse.quote(naziv.strip())
    search_url = f"https://sport-pgz.hr/?s={q}"
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(12000)
                page.goto(search_url, wait_until="networkidle")
                link = page.locator("article a[rel='bookmark']").first
                try:
                    # Short timeout: a missing link simply means "no hit".
                    href = link.get_attribute("href", timeout=4000)
                except Exception:
                    href = None
                if not href:
                    return None
                # Absolute hrefs pass through unchanged; relative ones are
                # resolved against the search URL.
                href = urllib.parse.urljoin(search_url, href)
                page.goto(href, wait_until="networkidle")
                title = page.title() or ""
                html = page.content()
            finally:
                # Runs on every exit path, including the early return above.
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": "sport-pgz.hr",
            "url": href,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Fail-soft: the caller treats None as "source missing".
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_federation(homepage: str, naziv: str) -> Optional[dict]:
    """Open `homepage`, follow the first link whose text contains `naziv`.

    Args:
        homepage: Absolute http(s) URL of the federation site.
        naziv: Entity name; its first 30 characters are matched against
            anchor text.

    Returns:
        A dict with the keys ``source`` (hostname of the page finally
        rendered), ``url``, ``title`` (max 300 chars), ``extract`` (first 600
        chars), ``raw_text`` (max 8000 chars) and ``fetched_at`` (Unix
        seconds). When no usable matching link exists, the homepage itself is
        returned. None when an argument is blank, the homepage is not
        http(s), Playwright is not importable, or a browser step fails.
    """
    if not (homepage and naziv):
        return None
    needle = naziv.strip()[:30]
    if not needle:
        return None
    # Refuse non-web schemes (e.g. file://) before launching a browser.
    if not homepage.lower().startswith(("http://", "https://")):
        return None
    if not HAS_PLAYWRIGHT:
        return None
    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.set_default_timeout(12000)
                page.goto(homepage, wait_until="networkidle")
                # Pass the name as data rather than splicing it into the
                # selector string: names containing a double quote would
                # otherwise produce an invalid selector.
                anchors = page.locator("a", has_text=needle)
                href = None
                try:
                    if anchors.count() > 0:
                        href = anchors.first.get_attribute("href", timeout=2000)
                except Exception:
                    href = None
                target = homepage
                if href:
                    candidate = urllib.parse.urljoin(homepage, href)
                    # Only follow real web links; mailto:, javascript: and
                    # similar fall back to the homepage instead of failing.
                    if urllib.parse.urlparse(candidate).scheme in ("http", "https"):
                        target = candidate
                if target != homepage:
                    page.goto(target, wait_until="networkidle")
                title = page.title() or ""
                html = page.content()
            finally:
                # Always release the browser, even when a step above raises.
                browser.close()
        text = _strip_html(html)[:8000]
        return {
            "source": urllib.parse.urlparse(target).hostname or target,
            "url": target,
            "title": title[:300],
            "extract": text[:600],
            "raw_text": text,
            "fetched_at": int(time.time()),
        }
    except Exception:
        # Fail-soft: the caller treats None as "source missing".
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── self-test ───────────────────────────────────────────────────────────
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: look up one club name (first CLI argument, or a
    # built-in sample) and pretty-print whatever the scraper returns.
    import json, sys

    cli_args = sys.argv[1:]
    if cli_args:
        target = cli_args[0]
    else:
        target = "Košarkaški klub Kvarner 2010"

    print("playwright available:", HAS_PLAYWRIGHT)

    # A falsy result (None) is shown as a placeholder object.
    outcome = scrape_sport_pgz_klub(target) or {"none": True}
    print(json.dumps(outcome, ensure_ascii=False, indent=2))
|
||||||
@@ -48,6 +48,16 @@ DB = dict(host=_pgh, port=_pgp,
|
|||||||
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
|
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
|
||||||
TIMEOUT = 6 # seconds — fail-soft
|
TIMEOUT = 6 # seconds — fail-soft
|
||||||
|
|
||||||
|
# Optional JS-aware fallback (Playwright). Lazy-loaded, never required.
|
||||||
|
import sys as _sys
|
||||||
|
_sys.path.insert(0, '/opt/pgz-sport')
|
||||||
|
try:
|
||||||
|
from enrichment import playwright_scraper as _pw_scraper
|
||||||
|
_HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
|
||||||
|
except Exception:
|
||||||
|
_pw_scraper = None
|
||||||
|
_HAS_PW = False
|
||||||
|
|
||||||
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
|
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
|
||||||
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
|
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
|
||||||
'https://api.deepseek.com/v1/chat/completions')
|
'https://api.deepseek.com/v1/chat/completions')
|
||||||
@@ -184,12 +194,20 @@ def _wiki_summary(query: str) -> Optional[dict]:
|
|||||||
def _sport_pgz_search(query: str) -> Optional[dict]:
|
def _sport_pgz_search(query: str) -> Optional[dict]:
|
||||||
if not query: return None
|
if not query: return None
|
||||||
page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
|
page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
|
||||||
if not page: return None
|
if not page:
|
||||||
|
# Plain HTTP failed → try JS-rendered fallback if available.
|
||||||
|
if _HAS_PW and _pw_scraper is not None:
|
||||||
|
return _pw_scraper.scrape_sport_pgz_klub(query)
|
||||||
|
return None
|
||||||
m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
|
m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
|
||||||
page, re.S | re.I)
|
page, re.S | re.I)
|
||||||
if not m:
|
if not m:
|
||||||
m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
|
m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
|
||||||
if not m: return None
|
if not m:
|
||||||
|
# Search page rendered but yielded nothing parseable — try JS fallback.
|
||||||
|
if _HAS_PW and _pw_scraper is not None:
|
||||||
|
return _pw_scraper.scrape_sport_pgz_klub(query)
|
||||||
|
return None
|
||||||
hit = m.group(1)
|
hit = m.group(1)
|
||||||
body = _http_get(hit, timeout=6)
|
body = _http_get(hit, timeout=6)
|
||||||
if not body:
|
if not body:
|
||||||
|
|||||||
Reference in New Issue
Block a user