From 84f1c41008e62ef198b4e97a493910811dbd9c88 Mon Sep 17 00:00:00 2001
From: CC6 Worker <cc6@rinet.one>
Date: Tue, 5 May 2026 00:23:00 +0200
Subject: [PATCH] M12.3: Playwright fallback scraper for JS-heavy federation
 sites

- enrichment/playwright_scraper.py: fetch_rendered(), scrape_sport_pgz_klub(),
  scrape_federation(). Headless Chromium, 12s timeout, returns rendered text.
  Import-safe when playwright is missing.
- enrich_router._sport_pgz_search() now falls back to the JS path when the
  cheap urllib fetch returns empty or unparseable HTML.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 enrichment/__init__.py           |   0
 enrichment/playwright_scraper.py | 170 +++++++++++++++++++++++++++++++
 routers/enrich_router.py         |  22 +++-
 3 files changed, 190 insertions(+), 2 deletions(-)
 create mode 100644 enrichment/__init__.py
 create mode 100644 enrichment/playwright_scraper.py
diff --git a/enrichment/__init__.py b/enrichment/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/enrichment/playwright_scraper.py b/enrichment/playwright_scraper.py
new file mode 100644
index 0000000..f47d6b7
--- /dev/null
+++ b/enrichment/playwright_scraper.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""
+playwright_scraper.py — JS-aware fallback scraper for enrichment v3
+Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
+Date:   2026-05-04
+Version: 1.0.0
+
+Most Croatian sport-federation websites render fine with plain HTTP, but a
+handful (single-page apps, lazy-loaded content) need a real browser. This
+module wraps Playwright so the enrichment pipeline can fall back to a JS
+render when the cheap urllib path returns a thin/empty page.
+
+Public surface
+--------------
+fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict | None
+    Returns {url, title, text, html_len, fetched_at}. Returns None on
+    fatal browser/launch errors (caller must treat as missing source).
+
+scrape_sport_pgz_klub(naziv) -> dict | None
+    Convenience wrapper for sport-pgz.hr — runs the search query, follows
+    the first article hit, and returns the rendered text + URL.
+
+scrape_federation(homepage, naziv) -> dict | None
+    Generic federation site scraper: opens the homepage, performs a naive
+    in-page text search for the entity name, returns the rendered first
+    page (or follows the first link whose text contains the name).
+
+The module is import-safe even when playwright is missing — every public
+function returns None instead of crashing.
+"""
+from __future__ import annotations
+import re
+import time
+import urllib.parse
+from typing import Optional
+
+try:
+    from playwright.sync_api import sync_playwright
+    HAS_PLAYWRIGHT = True
+except Exception:
+    HAS_PLAYWRIGHT = False
+
+
+UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright"
+
+
+def _strip_html(s: str) -> str:
+    """Drop <script>/<style> blocks and all tags, then collapse whitespace."""
+    for pat in (r"<script[^>]*>.*?</script>", r"<style[^>]*>.*?</style>", r"<[^>]+>"):
+        s = re.sub(pat, " ", s or "", flags=re.S | re.I)
+    return re.sub(r"\s+", " ", s).strip()
+
+
+def fetch_rendered(url: str, *, timeout_ms: int = 12000,
+                   wait_until: str = "networkidle") -> Optional[dict]:
+    """Load `url` in headless Chromium and return the rendered page.
+
+    Result keys: url, title, text, html_len, fetched_at. A navigation error
+    or timeout is tolerated (whatever rendered so far is used); None means
+    Playwright is missing, `url` does not start with "http", or the browser failed.
+    """
+    usable = HAS_PLAYWRIGHT and bool(url) and url.startswith("http")
+    if not usable:
+        return None
+    try:
+        with sync_playwright() as pw:
+            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
+            context = browser.new_context(user_agent=UA, locale="hr-HR")
+            tab = context.new_page()
+            tab.set_default_timeout(timeout_ms)
+            try:
+                tab.goto(url, wait_until=wait_until, timeout=timeout_ms)
+            except Exception:
+                pass  # keep going with whatever rendered before the failure
+            page_title = tab.title() or ""
+            markup = tab.content()
+            context.close()
+            browser.close()
+        return {"url": url, "title": page_title[:300],
+                "text": _strip_html(markup)[:12000],
+                "html_len": len(markup or ""),
+                "fetched_at": int(time.time())}
+    except Exception:
+        return None
+
+
+def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]:
+    """Search sport-pgz.hr for `naziv` and return the rendered first hit.
+
+    Returns a dict with keys source, url, title, extract, raw_text and
+    fetched_at, or None when `naziv` is empty, Playwright is unavailable,
+    the search yields no bookmark link, or any browser step fails.
+    """
+    if not naziv or not HAS_PLAYWRIGHT:
+        return None
+    search_url = f"https://sport-pgz.hr/?s={urllib.parse.quote(naziv)}"
+    try:
+        with sync_playwright() as pw:
+            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
+            ctx = browser.new_context(user_agent=UA, locale="hr-HR")
+            page = ctx.new_page()
+            page.set_default_timeout(12000)
+            page.goto(search_url, wait_until="networkidle")
+            first_hit = page.locator("article a[rel='bookmark']").first
+            try:
+                href = first_hit.get_attribute("href", timeout=4000)
+            except Exception:
+                href = None  # no bookmark link showed up within 4 s
+            if not href:
+                ctx.close(); browser.close()
+                return None
+            # Resolve a relative permalink against the search page before navigating.
+            href = urllib.parse.urljoin(search_url, href)
+            page.goto(href, wait_until="networkidle")
+            title = page.title() or ""
+            text = _strip_html(page.content())[:8000]
+            ctx.close(); browser.close()
+        return {"source": "sport-pgz.hr", "url": href, "title": title[:300],
+                "extract": text[:600], "raw_text": text,
+                "fetched_at": int(time.time())}
+    except Exception:
+        return None
+
+
+def scrape_federation(homepage: str, naziv: str) -> Optional[dict]:
+    """Open `homepage` and follow the first link whose text contains `naziv`.
+
+    Only the first 30 characters of `naziv` are matched. If no usable link is
+    found, the rendered homepage itself is returned. Result keys: source,
+    url, title, extract, raw_text, fetched_at; None on any browser failure.
+    """
+    if not (homepage and naziv) or not HAS_PLAYWRIGHT:
+        return None
+    try:
+        with sync_playwright() as pw:
+            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
+            ctx = browser.new_context(user_agent=UA, locale="hr-HR")
+            page = ctx.new_page()
+            page.set_default_timeout(12000)
+            page.goto(homepage, wait_until="networkidle")
+            # has_text takes the name as data, so quotes in it cannot break
+            # the selector the way an interpolated :has-text("...") would.
+            anchors = page.locator("a", has_text=naziv[:30])
+            href = None
+            try:
+                if anchors.count() > 0:
+                    href = anchors.first.get_attribute("href", timeout=2000)
+            except Exception:
+                href = None
+            if href and not href.startswith("http"):
+                href = urllib.parse.urljoin(homepage, href)
+            target = href if href and href.startswith("http") else homepage
+            if target != homepage:  # homepage is already loaded; skip reload
+                page.goto(target, wait_until="networkidle")
+            title = page.title() or ""
+            text = _strip_html(page.content())[:8000]
+            ctx.close(); browser.close()
+        return {"source": urllib.parse.urlparse(target).hostname or target,
+                "url": target, "title": title[:300], "extract": text[:600],
+                "raw_text": text, "fetched_at": int(time.time())}
+    except Exception:
+        return None
+
+
+# ─── self-test ───────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    import json, sys
+    klub = (sys.argv[1:2] or ["Košarkaški klub Kvarner 2010"])[0]  # CLI arg or demo club
+    print("playwright available:", HAS_PLAYWRIGHT)
+    print(json.dumps(scrape_sport_pgz_klub(klub) or {"none": True}, ensure_ascii=False, indent=2))
diff --git a/routers/enrich_router.py b/routers/enrich_router.py
index 6e69f7b..9df44f4 100644
--- a/routers/enrich_router.py
+++ b/routers/enrich_router.py
@@ -48,6 +48,16 @@ DB = dict(host=_pgh, port=_pgp,
 UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
 TIMEOUT = 6  # seconds — fail-soft
 
+# Optional JS-aware fallback (Playwright). Imported at module load, never required.
+import sys as _sys
+_sys.path.insert(0, '/opt/pgz-sport')
+try:
+    from enrichment import playwright_scraper as _pw_scraper
+    _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
+except Exception:
+    _pw_scraper = None
+    _HAS_PW = False
+
 DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
 DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
                               'https://api.deepseek.com/v1/chat/completions')
@@ -184,12 +194,20 @@ def _wiki_summary(query: str) -> Optional[dict]:
 def _sport_pgz_search(query: str) -> Optional[dict]:
     if not query: return None
     page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
-    if not page: return None
+    if not page:
+        # Plain HTTP failed → try JS-rendered fallback if available.
+        if _HAS_PW and _pw_scraper is not None:
+            return _pw_scraper.scrape_sport_pgz_klub(query)
+        return None
     m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
                   page, re.S | re.I)
     if not m:
         m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
-    if not m: return None
+    if not m:
+        # Search page rendered but yielded nothing parseable — try JS fallback.
+        if _HAS_PW and _pw_scraper is not None:
+            return _pw_scraper.scrape_sport_pgz_klub(query)
+        return None
     hit = m.group(1)
     body = _http_get(hit, timeout=6)
     if not body: