diff --git a/enrichment/__init__.py b/enrichment/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/enrichment/playwright_scraper.py b/enrichment/playwright_scraper.py
new file mode 100644
index 0000000..f47d6b7
--- /dev/null
+++ b/enrichment/playwright_scraper.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""
+playwright_scraper.py — JS-aware fallback scraper for enrichment v3
+Author: Damir Radulić (damir@rinet.one) / dradulic@outlook.com
+Date: 2026-05-04
+Version: 1.0.0
+
+Most Croatian sport-federation websites render fine with plain HTTP, but a
+handful (single-page apps, lazy-loaded content) need a real browser. This
+module wraps Playwright so the enrichment pipeline can fall back to a JS
+render when the cheap urllib path returns a thin/empty page.
+
+Public surface
+--------------
+fetch_rendered(url, *, timeout_ms=12000, wait_until="networkidle") -> dict | None
+    Returns {url, title, text, html_len, fetched_at}, or None when the URL is
+    not http(s) or the browser fails (caller must treat as missing source).
+
+scrape_sport_pgz_klub(naziv) -> dict | None
+ Convenience wrapper for sport-pgz.hr — runs the search query, follows
+ the first article hit, and returns the rendered text + URL.
+
+scrape_federation(homepage, naziv) -> dict | None
+    Generic federation site scraper: opens the homepage and follows the
+    first link whose text contains the entity name, returning that rendered
+    page (or the rendered homepage itself when no link matches).
+
+The module is import-safe even when playwright is missing — every public
+function returns None instead of crashing.
+"""
+from __future__ import annotations
+import re
+import time
+import urllib.parse
+from typing import Optional
+
+# Playwright is an optional dependency: any failure while importing it only
+# clears HAS_PLAYWRIGHT, which every public function in this module checks
+# before touching the browser (they return None when it is False).
+try:
+ from playwright.sync_api import sync_playwright
+ HAS_PLAYWRIGHT = True
+except Exception:
+ HAS_PLAYWRIGHT = False
+
+
+# User-Agent string passed to every browser context created in this module.
+UA = "Mozilla/5.0 (X11; Linux x86_64) pgz-sport-enrich/3.0 Playwright"
+
+
+def _strip_html(s: str) -> str:
+    """Reduce an HTML document to whitespace-normalised visible text.
+
+    Args:
+        s: Raw HTML markup; None or "" is treated as an empty document.
+
+    Returns:
+        The text with <script>/<style> elements (including their contents)
+        and all remaining tags removed, runs of whitespace collapsed to a
+        single space, and leading/trailing whitespace stripped.
+    """
+    # Script and style bodies are not visible text, so they must go together
+    # with their tags *before* the generic tag stripper runs. An empty
+    # pattern here would instead insert a space between every character.
+    s = re.sub(r"<script\b[^>]*>.*?</script\s*>", " ", s or "", flags=re.S | re.I)
+    s = re.sub(r"<style\b[^>]*>.*?</style\s*>", " ", s, flags=re.S | re.I)
+    # Replace every remaining tag with a space so adjacent words do not fuse.
+    s = re.sub(r"<[^>]+>", " ", s)
+    return re.sub(r"\s+", " ", s).strip()
+
+
+def fetch_rendered(url: str, *, timeout_ms: int = 12000,
+                   wait_until: str = "networkidle") -> Optional[dict]:
+    """Render `url` in headless Chromium and return its title and text.
+
+    Args:
+        url: Absolute http:// or https:// URL to load.
+        timeout_ms: Default Playwright timeout for the page, also used as
+            the navigation timeout, in milliseconds.
+        wait_until: Load state that the navigation waits for.
+
+    Returns:
+        A dict with keys ``url``, ``title`` (at most 300 chars), ``text``
+        (tag-stripped, at most 12000 chars), ``html_len`` and ``fetched_at``
+        (Unix seconds). None when the URL is empty or not http(s), when
+        Playwright is not installed, or when the browser fails. Caller
+        should treat None as 'JS render unavailable, fall back to plain
+        HTTP'.
+    """
+    # Validate the URL first so bad input is rejected without any browser work.
+    if not url or not url.startswith(("http://", "https://")) or not HAS_PLAYWRIGHT:
+        return None
+    try:
+        with sync_playwright() as pw:
+            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
+            try:
+                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
+                page = ctx.new_page()
+                page.set_default_timeout(timeout_ms)
+                try:
+                    page.goto(url, wait_until=wait_until, timeout=timeout_ms)
+                except Exception:
+                    pass  # use whatever was rendered before timeout
+                title = page.title() or ""
+                html = page.content()
+            finally:
+                # Closing the browser also closes its contexts and pages, and
+                # now happens even when reading the page raised.
+                browser.close()
+        return {
+            "url": url,
+            "title": title[:300],
+            "text": _strip_html(html)[:12000],
+            "html_len": len(html or ""),
+            "fetched_at": int(time.time()),
+        }
+    except Exception:
+        # Launch/driver failures are deliberately fail-soft for the pipeline.
+        return None
+
+
+def scrape_sport_pgz_klub(naziv: str) -> Optional[dict]:
+    """Search sport-pgz.hr for `naziv` and return the rendered first hit.
+
+    Args:
+        naziv: Club/entity name used as the site search query.
+
+    Returns:
+        A dict with keys ``source``, ``url``, ``title`` (at most 300 chars),
+        ``extract`` (first 600 chars of the text), ``raw_text`` (tag-stripped,
+        at most 8000 chars) and ``fetched_at`` (Unix seconds). None when
+        `naziv` is empty, Playwright is not installed, the search shows no
+        article link, or the browser fails.
+    """
+    if not naziv or not HAS_PLAYWRIGHT:
+        return None
+    search_url = f"https://sport-pgz.hr/?s={urllib.parse.quote(naziv)}"
+    try:
+        with sync_playwright() as pw:
+            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
+            try:
+                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
+                page = ctx.new_page()
+                page.set_default_timeout(12000)
+                page.goto(search_url, wait_until="networkidle")
+                link = page.locator("article a[rel='bookmark']").first
+                try:
+                    href = link.get_attribute("href", timeout=4000)
+                except Exception:
+                    href = None  # no article link appeared within 4 s
+                if not href:
+                    return None
+                # Resolve relative links against the search page, so that a
+                # site-relative href is still navigable.
+                href = urllib.parse.urljoin(search_url, href)
+                page.goto(href, wait_until="networkidle")
+                title = page.title() or ""
+                html = page.content()
+            finally:
+                # Single cleanup point for every exit path, including errors;
+                # closing the browser also closes its contexts and pages.
+                browser.close()
+        text = _strip_html(html)[:8000]
+        return {
+            "source": "sport-pgz.hr",
+            "url": href,
+            "title": title[:300],
+            "extract": text[:600],
+            "raw_text": text,
+            "fetched_at": int(time.time()),
+        }
+    except Exception:
+        # Browser/navigation failures are deliberately fail-soft.
+        return None
+
+
+def scrape_federation(homepage: str, naziv: str) -> Optional[dict]:
+    """Open `homepage` and follow the first link whose text contains `naziv`.
+
+    Only the first 30 characters of `naziv` are used for matching. When no
+    link matches, or the matching link is not an http(s) URL, the rendered
+    homepage itself is returned.
+
+    Args:
+        homepage: URL of the federation site's start page.
+        naziv: Entity name to look for in link texts.
+
+    Returns:
+        A dict with keys ``source`` (hostname of the final page), ``url``,
+        ``title`` (at most 300 chars), ``extract`` (first 600 chars of the
+        text), ``raw_text`` (tag-stripped, at most 8000 chars) and
+        ``fetched_at`` (Unix seconds). None when an argument is empty or
+        blank, Playwright is not installed, or the browser fails.
+    """
+    if not (homepage and naziv):
+        return None
+    needle = naziv[:30].strip()
+    if not needle or not HAS_PLAYWRIGHT:
+        return None
+    try:
+        with sync_playwright() as pw:
+            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
+            try:
+                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
+                page = ctx.new_page()
+                page.set_default_timeout(12000)
+                page.goto(homepage, wait_until="networkidle")
+                href = None
+                try:
+                    # has_text takes the name as data, so quotes or other
+                    # selector syntax inside `naziv` cannot break the query.
+                    anchors = page.locator("a", has_text=needle)
+                    if anchors.count() > 0:
+                        href = anchors.first.get_attribute("href", timeout=2000)
+                except Exception:
+                    href = None
+                target = homepage
+                if href:
+                    candidate = urllib.parse.urljoin(homepage, href)
+                    # Skip javascript:/mailto:/tel: links; navigating to them
+                    # would fail and discard the already rendered homepage.
+                    if urllib.parse.urlparse(candidate).scheme in ("http", "https"):
+                        target = candidate
+                if target != homepage:
+                    # The homepage is already loaded; navigate only when a
+                    # different page was actually found.
+                    page.goto(target, wait_until="networkidle")
+                title = page.title() or ""
+                html = page.content()
+            finally:
+                # Closing the browser also closes its contexts and pages, and
+                # runs on every exit path.
+                browser.close()
+        text = _strip_html(html)[:8000]
+        return {
+            "source": urllib.parse.urlparse(target).hostname or target,
+            "url": target,
+            "title": title[:300],
+            "extract": text[:600],
+            "raw_text": text,
+            "fetched_at": int(time.time()),
+        }
+    except Exception:
+        # Browser/navigation failures are deliberately fail-soft.
+        return None
+
+
+# ─── self-test ───────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    import json
+    import sys
+
+    # The first CLI argument, when given, replaces the built-in sample name.
+    cli_args = sys.argv[1:]
+    query_name = cli_args[0] if cli_args else "Košarkaški klub Kvarner 2010"
+    print("playwright available:", HAS_PLAYWRIGHT)
+    # Print the scrape result as JSON, or a {"none": true} marker on a miss.
+    outcome = scrape_sport_pgz_klub(query_name) or {"none": True}
+    print(json.dumps(outcome, ensure_ascii=False, indent=2))
diff --git a/routers/enrich_router.py b/routers/enrich_router.py
index 6e69f7b..9df44f4 100644
--- a/routers/enrich_router.py
+++ b/routers/enrich_router.py
@@ -48,6 +48,16 @@ DB = dict(host=_pgh, port=_pgp,
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
TIMEOUT = 6 # seconds — fail-soft
+# Optional JS-aware fallback (Playwright). Imported at module load, fail-soft, never required.
+import sys as _sys
+_sys.path.insert(0, '/opt/pgz-sport')
+try:
+ from enrichment import playwright_scraper as _pw_scraper
+ _HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
+except Exception:
+ _pw_scraper = None
+ _HAS_PW = False
+
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
'https://api.deepseek.com/v1/chat/completions')
@@ -184,12 +194,20 @@ def _wiki_summary(query: str) -> Optional[dict]:
def _sport_pgz_search(query: str) -> Optional[dict]:
if not query: return None
page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
- if not page: return None
+ if not page:
+ # Plain HTTP failed → try JS-rendered fallback if available.
+ if _HAS_PW and _pw_scraper is not None:
+ return _pw_scraper.scrape_sport_pgz_klub(query)
+ return None
m = re.search(r']*>.*?]*rel=["\']bookmark["\'][^>]*>([^<]+)',
page, re.S | re.I)
if not m:
m = re.search(r']*>([^<]{6,180})', page, re.I)
- if not m: return None
+ if not m:
+ # Search page rendered but yielded nothing parseable — try JS fallback.
+ if _HAS_PW and _pw_scraper is not None:
+ return _pw_scraper.scrape_sport_pgz_klub(query)
+ return None
hit = m.group(1)
body = _http_get(hit, timeout=6)
if not body: