Files
pgz-sport/scripts/hns_season_retry.py

116 lines
5.0 KiB
Python
Executable File

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""HNS sezone retry — pojednostavljen extract."""
import os, time, re, json, sys
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
# libpq connection string for the rinet_v3 database.
# Built with make_dsn() rather than an f-string so that a DB_PASSWORD
# containing spaces, quotes or backslashes is escaped instead of corrupting
# the key=value string. Raises KeyError at import time if DB_PASSWORD is unset.
DSN = psycopg2.extensions.make_dsn(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ["DB_PASSWORD"],
)
def find_seasons_in_obj(obj, found=None):
    """Collect every dict that has a 'season' or 'sezona' key, at any depth.

    Nested dicts and lists are walked depth-first in document order. A
    matching dict is recorded before its own values are searched, so nested
    matches follow their parent in the result. Values that are neither dict
    nor list are ignored.

    Args:
        obj: The value to search (typically decoded JSON).
        found: Optional list to append matches to. A new list is created
            when omitted.

    Returns:
        The list of matching dicts (the original objects, not copies). When
        ``found`` was supplied, that same list is returned.
    """
    if found is None:
        found = []
    # Explicit stack instead of recursion: arbitrarily deep nesting cannot
    # raise RecursionError. Children are pushed in reverse so they are popped
    # (and therefore visited) in their original order.
    pending = [obj]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if 'season' in node or 'sezona' in node:
                found.append(node)
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return found
# One line of a season table in the page's visible text:
# "<20YY/YY> <club text> <one or more integers>".
_SEASON_LINE_RE = re.compile(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$')


def _rows_from_next_data(html):
    """Build season rows from the page's embedded __NEXT_DATA__ JSON, if any.

    Only the season label is taken from the JSON; club and competition are
    left empty and the counters are set to 0. Returns an empty list when the
    script tag is absent or its payload cannot be parsed.
    """
    m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', html)
    if not m:
        return []
    try:
        data = json.loads(m.group(1))
        matches = find_seasons_in_obj(data)
    except (ValueError, RecursionError) as e:
        # Best-effort: report and let the caller fall back to text parsing.
        print(f"  __NEXT_DATA__ parse failed: {e}", flush=True)
        return []
    rows = []
    for s in matches:
        sezona = s.get('season') or s.get('sezona')
        if sezona:
            rows.append({'sezona': str(sezona), 'klub': '', 'natjecanje': '',
                         'nastupi': 0, 'golovi': 0})
    return rows


def _rows_from_body_text(body):
    """Build season rows by matching season-table lines in the visible page text.

    The first trailing integer is stored as 'nastupi' and the second (when
    present) as 'golovi'; the text between season and numbers is stored as
    'klub', truncated to 200 characters.
    """
    rows = []
    for line in body.split('\n'):
        match = _SEASON_LINE_RE.match(line.strip())
        if not match:
            continue
        nums = [int(x) for x in match.group(3).split()]
        rows.append({
            'sezona': match.group(1),
            'klub': match.group(2)[:200],
            'natjecanje': '',
            'nastupi': nums[0] if nums else 0,
            'golovi': nums[1] if len(nums) > 1 else 0,
        })
    return rows


def _insert_rows(conn, target, rows):
    """Insert season rows for one player and return how many were actually stored.

    Rows skipped by ON CONFLICT DO NOTHING are not counted. A failing row is
    reported and skipped; the remaining rows are still attempted.
    """
    inserted = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                    (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (target['hns_igrac_id'], target['clan_id'], r['sezona'], r['klub'],
                      r['natjecanje'], r['nastupi'], r['golovi']))
            except psycopg2.Error as e:
                print(f"  insert failed for hns_igrac_id={target['hns_igrac_id']} "
                      f"sezona={r['sezona']}: {e}", flush=True)
                continue
            # rowcount is 0 when the row already existed, so only real
            # inserts contribute to the total.
            inserted += max(cur.rowcount, 0)
    return inserted


def main():
    """Retry season extraction for players that have no stored seasons yet.

    Selects up to 200 rows from pgz_sport.clanovi that have an hns_igrac_id
    but no matching row in pgz_sport.hns_player_seasons, opens each player's
    source_url in headless Chromium, extracts seasons (embedded __NEXT_DATA__
    JSON first, visible-text regex as fallback) and inserts them. Progress is
    printed to stdout. A failure on one player is reported and does not stop
    the run.
    """
    # Local import: the module-level import only brings in sync_playwright.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    conn = psycopg2.connect(DSN)
    # Autocommit: every INSERT is committed on its own, so one failing row
    # does not abort the statements that follow it.
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                  AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()
        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(ignore_https_errors=True,
                                          user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0")
                page = ctx.new_page()
                for i, t in enumerate(targets):
                    url = t['source_url']
                    # Only player pages on semafor.hns.family are scraped.
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        try:
                            page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
                        except PlaywrightTimeoutError:
                            # The selector is only a readiness hint; carry on
                            # and try to extract from whatever has loaded.
                            pass
                        time.sleep(0.5)

                        rows = _rows_from_next_data(page.content())
                        if not rows:
                            rows = _rows_from_body_text(page.locator('body').inner_text())

                        if rows:
                            seasons_added += _insert_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)
                        if i % 20 == 0:
                            print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                    except Exception as e:
                        # Per-player boundary: report the failure and move on.
                        print(f"{t['ime']}: {e}", flush=True)
            finally:
                browser.close()
        print(f"\nDone. Total sezone added: {seasons_added}")
    finally:
        conn.close()
# Script entry point: run the retry job only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()