M12.3: Playwright fallback scraper for JS-heavy federation sites
- enrichment/playwright_scraper.py: fetch_rendered(), scrape_sport_pgz_klub(), scrape_federation(). Headless Chromium, 12s timeout, returns rendered text. Import-safe when playwright is missing.
- enrich_router._sport_pgz_search() now falls back to the JS path when the cheap urllib fetch returns empty or unparseable HTML.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -48,6 +48,16 @@ DB = dict(host=_pgh, port=_pgp,
|
||||
UA = 'pgz-sport-enrich/3.0 (+https://sport.rinet.one)'
|
||||
TIMEOUT = 6 # seconds — fail-soft
|
||||
|
||||
# Optional JS-aware fallback (Playwright). Imported best-effort at module load; never required.
|
||||
import sys as _sys
|
||||
_sys.path.insert(0, '/opt/pgz-sport')
|
||||
try:
|
||||
from enrichment import playwright_scraper as _pw_scraper
|
||||
_HAS_PW = _pw_scraper.HAS_PLAYWRIGHT
|
||||
except Exception:
|
||||
_pw_scraper = None
|
||||
_HAS_PW = False
|
||||
|
||||
DEEPSEEK_KEY = os.environ.get('DEEPSEEK_API_KEY', '').strip()
|
||||
DEEPSEEK_URL = os.environ.get('DEEPSEEK_URL',
|
||||
'https://api.deepseek.com/v1/chat/completions')
|
||||
@@ -184,12 +194,20 @@ def _wiki_summary(query: str) -> Optional[dict]:
|
||||
def _sport_pgz_search(query: str) -> Optional[dict]:
|
||||
if not query: return None
|
||||
page = _http_get('https://sport-pgz.hr/?s=' + urllib.parse.quote(query), timeout=6)
|
||||
if not page: return None
|
||||
if not page:
|
||||
# Plain HTTP failed → try JS-rendered fallback if available.
|
||||
if _HAS_PW and _pw_scraper is not None:
|
||||
return _pw_scraper.scrape_sport_pgz_klub(query)
|
||||
return None
|
||||
m = re.search(r'<article[^>]*>.*?<a\s+href=["\']([^"\']+)["\'][^>]*rel=["\']bookmark["\'][^>]*>([^<]+)</a>',
|
||||
page, re.S | re.I)
|
||||
if not m:
|
||||
m = re.search(r'<a\s+href=["\'](https?://sport-pgz\.hr/[^"\']+)["\'][^>]*>([^<]{6,180})</a>', page, re.I)
|
||||
if not m: return None
|
||||
if not m:
|
||||
# Search page rendered but yielded nothing parseable — try JS fallback.
|
||||
if _HAS_PW and _pw_scraper is not None:
|
||||
return _pw_scraper.scrape_sport_pgz_klub(query)
|
||||
return None
|
||||
hit = m.group(1)
|
||||
body = _http_get(hit, timeout=6)
|
||||
if not body:
|
||||
|
||||
Reference in New Issue
Block a user