M12.3: Playwright fallback scraper for JS-heavy federation sites

- enrichment/playwright_scraper.py: fetch_rendered(), scrape_sport_pgz_klub(),
  scrape_federation(). Headless Chromium, 12s timeout, returns rendered text.
  Import-safe when playwright is missing.
- enrich_router._sport_pgz_search() now falls back to the JS path when the
  cheap urllib fetch returns empty or unparseable HTML.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
CC6 Worker
2026-05-05 00:23:00 +02:00
parent c8be132e0f
commit 84f1c41008
3 changed files with 190 additions and 2 deletions
+20 -2
View File
@@ -48,6 +48,16 @@ DB = dict(host=_pgh, port=_pgp,
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
TIMEOUT = 6 # seconds — fail-soft
# Optional JS-aware fallback (Playwright). Imported at module load inside a
# try/except, so a failed import only disables the fallback; never required.
import sys as _sys
_sys.path.insert(0, '/opt/pgz-sport')
try:
from enrichment import playwright_scraper as _pw_scraper
_HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
except Exception:
_pw_scraper = None
_HAS_PW = False
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
'https://api.deepseek.com/v1/chat/completions')
@@ -184,12 +194,20 @@ def _wiki_summary(query: str) -> Optional[dict]:
def _sport_pgz_search(query: str) -> Optional[dict]:
if not query: return None
page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
if not page: return None
if not page:
# Plain HTTP failed → try JS-rendered fallback if available.
if _HAS_PW and _pw_scraper is not None:
return _pw_scraper.scrape_sport_pgz_klub(query)
return None
m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
page, re.S | re.I)
if not m:
m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
if not m: return None
if not m:
# Search page fetched over plain HTTP but yielded nothing parseable — try JS fallback.
if _HAS_PW and _pw_scraper is not None:
return _pw_scraper.scrape_sport_pgz_klub(query)
return None
hit = m.group(1)
body = _http_get(hit, timeout=6)
if not body: