Files
pgz-sport/scripts/hns_season_v3.py_prije_env_deepseek

142 lines
5.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""HNS seasons v3 — primarily uses the __NEXT_DATA__ JSON parser."""
import os, time, re, json, sys
import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
# Keyword/value connection string passed to psycopg2.connect() in main().
# The password is read from the DB_PASSWORD environment variable while this
# module is being loaded, so an unset variable raises KeyError before main()
# ever runs.
# NOTE(review): the password is interpolated unquoted; a value containing
# spaces, single quotes or backslashes would presumably need libpq-style
# quoting — confirm against the deployed secret.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# Keys whose list value is treated as a collection of per-season records.
_SEASON_LIST_KEYS = ('careers', 'career', 'seasons', 'sezone', 'statistics', 'stats')
# Keys that mark an element of such a list as a season record.
_SEASON_ITEM_KEYS = ('season', 'sezona', 'year', 'godina')
# Nodes nested deeper than this are ignored.
_MAX_SEASON_DEPTH = 25


def find_seasons(obj, found=None, depth=0):
    """Recursively collect season-like dicts from a decoded JSON tree.

    A dict is collected when either of these heuristics matches:

    * it has a ``'season'`` key whose value is a ``str`` or ``dict``, or it
      has a ``'sezona'`` key;
    * it is an element of a list stored under one of ``_SEASON_LIST_KEYS``
      (compared case-insensitively) and contains one of ``_SEASON_ITEM_KEYS``.

    Both heuristics frequently match the *same* dict (an element of a
    ``seasons`` list that itself has a ``season`` key), so every dict object
    is collected at most once, in first-seen order.

    Args:
        obj: Any decoded JSON value (dict, list or scalar).
        found: Optional list to append matches to; a new list is created
            when omitted. Dicts already in it are not added again.
        depth: Nesting level of ``obj``; nodes deeper than
            ``_MAX_SEASON_DEPTH`` are ignored.

    Returns:
        The list of collected dicts (the ``found`` list when one was given).
    """
    if found is None:
        found = []
    # Identity-based bookkeeping: dicts are unhashable, and two distinct but
    # equal records must both be kept.
    seen = {id(entry) for entry in found}

    def collect(candidate):
        if id(candidate) not in seen:
            seen.add(id(candidate))
            found.append(candidate)

    def walk(node, level):
        if level > _MAX_SEASON_DEPTH:
            return
        if isinstance(node, dict):
            # Season-like dict.
            if isinstance(node.get('season'), (str, dict)) or 'sezona' in node:
                collect(node)
            for key, value in node.items():
                # Career-style object holding an array of seasons.
                if (
                    isinstance(key, str)
                    and key.lower() in _SEASON_LIST_KEYS
                    and isinstance(value, list)
                ):
                    for item in value:
                        if isinstance(item, dict) and any(
                            marker in item for marker in _SEASON_ITEM_KEYS
                        ):
                            collect(item)
                walk(value, level + 1)
        elif isinstance(node, list):
            for item in node:
                walk(item, level + 1)

    walk(obj, depth)
    return found
def _to_int(value):
    """Best-effort conversion of a scraped scalar to ``int``.

    Tries ``int(value)`` first; when that fails, every non-digit character is
    stripped from ``str(value)`` and the remaining digits are parsed (so
    ``"1.080"`` and ``"90'"`` become 1080 and 90). Returns 0 when no digits
    remain.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers float infinity, which json.loads can produce.
        pass
    digits = re.sub(r'\D', '', str(value))
    try:
        return int(digits) if digits else 0
    except ValueError:
        return 0


def normalize_season(s):
    """Convert a season dict to a flat row.

    Args:
        s: A season-like dict as collected from the page JSON.

    Returns:
        A dict with the keys ``sezona``, ``klub``, ``natjecanje``,
        ``nastupi``, ``startna``, ``zamjena``, ``golovi``, ``asistencije``,
        ``zuti``, ``crveni`` and ``minute``. ``sezona`` is an empty string
        when no season label could be found; ``klub`` is truncated to 200
        characters and ``natjecanje`` to 100; all counters are ints
        (0 when missing or unparsable).
    """
    sezona = s.get('season') or s.get('sezona') or s.get('year') or s.get('godina') or ''
    if isinstance(sezona, dict):
        year = sezona.get('year')
        # A null year must not turn into the truthy label 'None'.
        sezona = sezona.get('name') or sezona.get('label') or ('' if year is None else str(year))
    sezona = str(sezona)

    klub = s.get('club') or s.get('klub') or s.get('team') or ''
    if isinstance(klub, dict):
        klub = klub.get('name') or klub.get('naziv') or ''

    natj = s.get('competition') or s.get('natjecanje') or s.get('league') or ''
    if isinstance(natj, dict):
        natj = natj.get('name') or natj.get('naziv') or ''

    def num(*keys):
        """Return the int value of the first key of ``s`` containing any of ``keys``.

        Patterns are tried in order and matched as case-insensitive
        substrings of the dict's keys.
        NOTE(review): substring matching is loose (e.g. 'red' also matches a
        key such as 'scored') — verify against the real payload.
        """
        for wanted in keys:
            needle = wanted.lower()
            for key, value in s.items():
                if not isinstance(key, str) or needle not in key.lower():
                    continue
                if isinstance(value, (dict, list)):
                    # Nested structures are not counters; parsing the digits
                    # of their repr would fabricate a number.
                    continue
                return _to_int(value)
        return 0

    return {
        'sezona': sezona, 'klub': str(klub)[:200], 'natjecanje': str(natj)[:100],
        'nastupi': num('matches', 'nastup', 'appearance'),
        'startna': num('start'),
        'zamjena': num('sub', 'zamjen'),
        'golovi': num('goal', 'gol'),
        'asistencije': num('assist', 'asist'),
        'zuti': num('yellow', 'žut', 'zut'),
        'crveni': num('red', 'crv'),
        'minute': num('minute', 'minut', 'min'),
    }
def main():
    """Scrape per-season statistics for players that have none stored yet.

    Selects up to 200 rows from ``pgz_sport.clanovi`` that have an
    ``hns_igrac_id`` but no row in ``pgz_sport.hns_player_seasons``, opens
    each player's ``source_url`` in headless Chromium, extracts the
    ``__NEXT_DATA__`` JSON from the rendered HTML, and inserts one row per
    detected season.

    The run is best-effort: a failure for one player or one row is reported
    on stderr and processing continues with the next one.

    NOTE(review): players whose page yields no season rows stay selected by
    the query on every subsequent run.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()
        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        next_data_re = re.compile(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>')

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                page = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0",
                ).new_page()

                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        time.sleep(0.8)
                        html = page.content()

                        # Extract __NEXT_DATA__
                        rows = []
                        m = next_data_re.search(html)
                        if m:
                            try:
                                data = json.loads(m.group(1))
                                for s in find_seasons(data):
                                    n = normalize_season(s)
                                    if n['sezona']:
                                        rows.append(n)
                            except Exception as e:
                                # Keep whatever rows were parsed before the failure.
                                print(f"  ! [{i}/{len(targets)}] {url}: parse failed: {type(e).__name__}: {e}",
                                      file=sys.stderr, flush=True)

                        # Insert
                        if rows:
                            inserted = 0
                            with conn.cursor() as cur:
                                for r in rows:
                                    try:
                                        cur.execute("""
                                            INSERT INTO pgz_sport.hns_player_seasons
                                            (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje,
                                            nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
                                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                            ON CONFLICT DO NOTHING
                                        """, (t['hns_igrac_id'], t['clan_id'], r['sezona'], r['klub'], r['natjecanje'],
                                              r['nastupi'], r['startna'], r['zamjena'], r['golovi'],
                                              r['asistencije'], r['zuti'], r['crveni'], r['minute']))
                                    except psycopg2.Error as e:
                                        # autocommit is on, so one failed row does not
                                        # affect the following statements.
                                        print(f"  ! [{i}/{len(targets)}] insert failed ({r['sezona']}): {e}",
                                              file=sys.stderr, flush=True)
                                    else:
                                        # ON CONFLICT DO NOTHING reports 0 affected rows
                                        # when the row already existed.
                                        if cur.rowcount > 0:
                                            inserted += 1
                            seasons_added += inserted
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: +{inserted} sezone (total: {seasons_added})", flush=True)

                        if i % 30 == 0 and i > 0:
                            print(f" [{i}/{len(targets)}] processed, total: {seasons_added}", flush=True)
                    except Exception as e:
                        # Per-player boundary: report and move on to the next player.
                        print(f"  ! [{i}/{len(targets)}] {url}: {type(e).__name__}: {e}",
                              file=sys.stderr, flush=True)
            finally:
                browser.close()

        print(f"\n✅ Done. Total: {seasons_added}", flush=True)
    finally:
        conn.close()
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()