From 84f1c41008e62ef198b4e97a493910811dbd9c88 Mon Sep 17 00:00:00 2001 From: CC6 Worker Date: Tue, 5 May 2026 00:23:00 +0200 Subject: [PATCH] M12.3: Playwright fallback scraper for JS-heavy federation sites - enrichment/playwright_scraper.py: fetch_rendered(), scrape_sport_pgz_klub(), scrape_federation(). Headless Chromium, 12s timeout, returns rendered text. Import-safe when playwright is missing. - enrich_router._sport_pgz_search() now falls back to the JS path when the cheap urllib fetch returns empty or unparseable HTML. Co-Authored-By: Claude Opus 4.7 (1M context) --- enrichment/__init__.py | 0 enrichment/playwright_scraper.py | 170 +++++++++++++++++++++++++++++++ routers/enrich_router.py | 22 +++- 3 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 enrichment/__init__.py create mode 100644 enrichment/playwright_scraper.py diff --git a/enrichment/__init__.py b/enrichment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/enrichment/playwright_scraper.py b/enrichment/playwright_scraper.py new file mode 100644 index 0000000..f47d6b7 --- /dev/null +++ b/enrichment/playwright_scraper.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +playwright_scraper.py — JS-aware fallback scraper for enrichment v3 +Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com +Date: 2026-05-04 +Version: 1.0.0 + +Most Croatian sport-federation websites render fine with plain HTTP, but a +handful (single-page apps, lazy-loaded content) need a real browser. This +module wraps Playwright so the enrichment pipeline can fall back to a JS +render when the cheap urllib path returns a thin/empty page. + +Public surface +-------------- +fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict + Returns {url, status, title, text, html, fetched_at}. Returns None on + fatal browser/launch errors (caller must treat as missing source). + +scrape_sport_pgz_klub(naziv) -> dict | None + Convenience wrapper for sport-pgz.hr — runs the search query, follows + the first article hit, and returns the rendered text + URL. + +scrape_federation(homepage, naziv) -> dict | None + Generic federation site scraper: opens the homepage, performs a naive + in-page text search for the entity name, returns the rendered first + page (or follows the first link whose text contains the name). + +The module is import-safe even when playwright is missing — every public +function returns None instead of crashing. +""" +from __future__ import annotations +import re +import time +import urllib.parse +from typing import Optional + +try: + from playwright.sync_api import sync_playwright + HAS_PLAYWRIGHT = True +except Exception: + HAS_PLAYWRIGHT = False + + +UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright" + + +def _strip_html(s: str) -> str: + s = re.sub(r"]*>.*?", " ", s or "", flags=re.S | re.I) + s = re.sub(r"]*>.*?", " ", s, flags=re.S | re.I) + s = re.sub(r"<[^>]+>", " ", s) + return re.sub(r"\s+", " ", s).strip() + + +def fetch_rendered(url: str, *, timeout_ms: int = 12000, + wait_until: str = "networkidle") -> Optional[dict]: + """Render `url` with Chromium, return text + html. + + Caller should treat None as 'JS render unavailable, fall back to plain HTTP'. + """ + if not HAS_PLAYWRIGHT or not url or not url.startswith("http"): + return None + try: + with sync_playwright() as pw: + browser = pw.chromium.launch(headless=True, args=["--no-sandbox"]) + ctx = browser.new_context(user_agent=UA, locale="hr-HR") + page = ctx.new_page() + page.set_default_timeout(timeout_ms) + try: + page.goto(url, wait_until=wait_until, timeout=timeout_ms) + except Exception: + pass # use whatever was rendered before timeout + title = page.title() or "" + html = page.content() + text = _strip_html(html)[:12000] + ctx.close(); browser.close() + return { + "url": url, + "title": title[:300], + "text": text, + "html_len": len(html or ""), + "fetched_at": int(time.time()), + } + except Exception: + return None + + +def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]: + """Search sport-pgz.hr for `naziv` and return the rendered first hit.""" + if not naziv: return None + if not HAS_PLAYWRIGHT: return None + q = urllib.parse.quote(naziv) + search_url = f"https://sport-pgz.hr/?s={q}" + try: + with sync_playwright() as pw: + browser = pw.chromium.launch(headless=True, args=["--no-sandbox"]) + ctx = browser.new_context(user_agent=UA, locale="hr-HR") + page = ctx.new_page() + page.set_default_timeout(12000) + page.goto(search_url, wait_until="networkidle") + link = page.locator("article a[rel='bookmark']").first + try: + href = link.get_attribute("href", timeout=4000) + except Exception: + href = None + if not href: + ctx.close(); browser.close() + return None + page.goto(href, wait_until="networkidle") + title = page.title() or "" + html = page.content() + text = _strip_html(html)[:8000] + ctx.close(); browser.close() + return { + "source": "sport-pgz.hr", + "url": href, + "title": title[:300], + "extract": text[:600], + "raw_text": text, + "fetched_at": int(time.time()), + } + except Exception: + return None + + +def scrape_federation(homepage: str, naziv: str) -> Optional[dict]: + """Open `homepage`, follow the first link whose text contains `naziv`.""" + if not (homepage and naziv): return None + if not HAS_PLAYWRIGHT: return None + try: + with sync_playwright() as pw: + browser = pw.chromium.launch(headless=True, args=["--no-sandbox"]) + ctx = browser.new_context(user_agent=UA, locale="hr-HR") + page = ctx.new_page() + page.set_default_timeout(12000) + page.goto(homepage, wait_until="networkidle") + anchors = page.locator( + f"a:has-text(\"{naziv[:30]}\")" + ) + href = None + try: + if anchors.count() > 0: + href = anchors.first.get_attribute("href", timeout=2000) + except Exception: + href = None + if href and not href.startswith("http"): + href = urllib.parse.urljoin(homepage, href) + target = href or homepage + page.goto(target, wait_until="networkidle") + title = page.title() or "" + html = page.content() + text = _strip_html(html)[:8000] + ctx.close(); browser.close() + return { + "source": urllib.parse.urlparse(target).hostname or target, + "url": target, + "title": title[:300], + "extract": text[:600], + "raw_text": text, + "fetched_at": int(time.time()), + } + except Exception: + return None + + +# ─── self-test ─────────────────────────────────────────────────────────── +if __name__ == "__main__": + import json, sys + target = sys.argv[1] if len(sys.argv) > 1 else "Košarkaški klub Kvarner 2010" + print("playwright available:", HAS_PLAYWRIGHT) + print(json.dumps(scrape_sport_pgz_klub(target) or {"none": True}, ensure_ascii=False, indent=2)) diff --git a/routers/enrich_router.py b/routers/enrich_router.py index 6e69f7b..9df44f4 100644 --- a/routers/enrich_router.py +++ b/routers/enrich_router.py @@ -48,6 +48,16 @@ DB = dict(host=_pgh, port=_pgp, UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)' TIMEOUT = 6 # seconds — fail-soft +# Optional JS-aware fallback (Playwright). Lazy-loaded, never required. +import sys as _sys +_sys.path.insert(0, '/opt/pgz-sport') +try: + from enrichment import playwright_scraper as _pw_scraper + _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT +except Exception: + _pw_scraper = None + _HAS_PW = False + DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip() DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL', 'https://api.deepseek.com/v1/chat/completions') @@ -184,12 +194,20 @@ def _wiki_summary(query: str) -> Optional[dict]: def _sport_pgz_search(query: str) -> Optional[dict]: if not query: return None page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6) - if not page: return None + if not page: + # Plain HTTP failed → try JS-rendered fallback if available. + if _HAS_PW and _pw_scraper is not None: + return _pw_scraper.scrape_sport_pgz_klub(query) + return None m = re.search(r']*>.*?]*rel=["\']bookmark["\'][^>]*>([^<]+)', page, re.S | re.I) if not m: m = re.search(r']*>([^<]{6,180})', page, re.I) - if not m: return None + if not m: + # Search page rendered but yielded nothing parseable — try JS fallback. + if _HAS_PW and _pw_scraper is not None: + return _pw_scraper.scrape_sport_pgz_klub(query) + return None hit = m.group(1) body = _http_get(hit, timeout=6) if not body: