145 lines
6.0 KiB
Python
Executable File
145 lines
6.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
from dotenv import load_dotenv
|
|
load_dotenv('/opt/rinet-gpu/.env.master')
|
|
# auto-added by patch_scrapers_with_dotenv.sh
|
|
"""HNS sezone v3 — koristi __NEXT_DATA__ JSON parser primarily."""
|
|
import os, time, re, json, sys
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# libpq connection string used by main(). It is built at import time, so a
# missing DB_PASSWORD environment variable raises KeyError before main() runs.
# NOTE(review): the password is interpolated unquoted — a value containing
# spaces or quotes would presumably break DSN parsing; confirm with the
# libpq connection-string rules.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
|
|
|
def _collect_seasons(obj, found, seen, depth):
    """Recursive worker for find_seasons(); appends matches to *found* in place.

    *seen* holds the id() of every dict already in *found* so that the same
    object is never appended twice.
    """
    # Hard recursion cap so a pathological / very deep payload cannot blow
    # the interpreter stack.
    if depth > 25:
        return

    def remember(candidate):
        marker = id(candidate)
        if marker not in seen:
            seen.add(marker)
            found.append(candidate)

    if isinstance(obj, dict):
        # Detect a season-like dict: a 'season' entry that is a string or a
        # dict, or any 'sezona' entry.
        if isinstance(obj.get('season'), (str, dict)) or 'sezona' in obj:
            remember(obj)
        # Detect a career-like container whose value is a list of seasons.
        for key, value in obj.items():
            if (key.lower() in ('careers', 'career', 'seasons', 'sezone',
                                'statistics', 'stats')
                    and isinstance(value, list)):
                for item in value:
                    if isinstance(item, dict) and any(
                            field in item
                            for field in ('season', 'sezona', 'year', 'godina')):
                        remember(item)
            # Always descend, whatever the key was called.
            _collect_seasons(value, found, seen, depth + 1)
    elif isinstance(obj, list):
        for item in obj:
            _collect_seasons(item, found, seen, depth + 1)


def find_seasons(obj, found=None, depth=0):
    """Walk a decoded JSON structure and collect dicts that look like seasons.

    A dict is collected when it has a 'season' entry that is a str or dict,
    when it has a 'sezona' entry, or when it sits inside a list stored under a
    key named careers/career/seasons/sezone/statistics/stats (case-insensitive)
    and has one of the keys season/sezona/year/godina.

    Each matching dict object is returned once, in first-seen order. Earlier
    versions returned the same object twice when it matched both the list
    rule and the 'season'/'sezona' rule.

    Args:
        obj: Any JSON-decoded value (dict, list or scalar).
        found: Optional list to append to; a new list is created when None.
        depth: Current nesting depth; traversal stops beyond depth 25.

    Returns:
        The list of collected dicts (the same list object as *found* when one
        was supplied).
    """
    if found is None:
        found = []
    seen = {id(entry) for entry in found}
    _collect_seasons(obj, found, seen, depth)
    return found
|
|
|
|
def normalize_season(s):
    """Convert a raw season dict into a flat row of strings and integers.

    Args:
        s: A dict as collected by the season finder. Label fields
            (season/club/competition) may be plain values or nested dicts.

    Returns:
        A dict with keys 'sezona', 'klub', 'natjecanje' (strings; klub is cut
        to 200 characters, natjecanje to 100) and the integer counters
        'nastupi', 'startna', 'zamjena', 'golovi', 'asistencije', 'zuti',
        'crveni', 'minute'. Counters default to 0 when nothing usable is found.
    """
    sezona = s.get('season') or s.get('sezona') or s.get('year') or s.get('godina') or ''
    if isinstance(sezona, dict):
        sezona = sezona.get('name') or sezona.get('label') or str(sezona.get('year', ''))
    sezona = str(sezona)

    klub = s.get('club') or s.get('klub') or s.get('team') or ''
    if isinstance(klub, dict):
        klub = klub.get('name') or klub.get('naziv') or ''

    natj = s.get('competition') or s.get('natjecanje') or s.get('league') or ''
    if isinstance(natj, dict):
        natj = natj.get('name') or natj.get('naziv') or ''

    # Lower-case every key once instead of on every counter lookup.
    lowered = [(str(key).lower(), value) for key, value in s.items()]

    def num(*keys):
        """Return the int value of the first entry whose key contains a fragment."""
        for fragment in keys:
            needle = fragment.lower()
            for name, value in lowered:
                if needle not in name:
                    continue
                # A nested list/dict is not a counter; concatenating the
                # digits of its repr would produce a meaningless number, so
                # keep looking for a scalar match.
                if isinstance(value, (dict, list)):
                    continue
                try:
                    return int(value)
                except (TypeError, ValueError, OverflowError):
                    # Fall back to the digits embedded in the text form
                    # (e.g. "12 min" -> 12, None -> 0).
                    digits = re.sub(r'\D', '', str(value))
                    try:
                        return int(digits or 0)
                    except ValueError:
                        return 0
        return 0

    return {
        'sezona': sezona, 'klub': str(klub)[:200], 'natjecanje': str(natj)[:100],
        'nastupi': num('matches', 'nastup', 'appearance'),
        'startna': num('start'),
        'zamjena': num('sub', 'zamjen'),
        'golovi': num('goal', 'gol'),
        'asistencije': num('assist', 'asist'),
        'zuti': num('yellow', 'žut', 'zut'),
        'crveni': num('red', 'crv'),
        'minute': num('minute', 'minut', 'min'),
    }
|
|
|
|
# Compiled once: pulls the JSON payload out of the Next.js data <script> tag.
_NEXT_DATA_RE = re.compile(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>')


def _extract_season_rows(html):
    """Return normalized season rows found in the page's __NEXT_DATA__ JSON."""
    rows = []
    match = _NEXT_DATA_RE.search(html)
    if not match:
        return rows
    try:
        data = json.loads(match.group(1))
        for raw in find_seasons(data):
            row = normalize_season(raw)
            if row['sezona']:
                rows.append(row)
    except Exception as exc:
        # Best-effort: keep whatever was parsed so far, but say what failed.
        print(f"  ! __NEXT_DATA__ parse failed: {exc!r}", file=sys.stderr, flush=True)
    return rows


def _insert_season_rows(conn, target, rows):
    """Insert *rows* for one player; return how many rows were actually stored."""
    inserted = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                    (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje,
                    nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (target['hns_igrac_id'], target['clan_id'], r['sezona'], r['klub'], r['natjecanje'],
                      r['nastupi'], r['startna'], r['zamjena'], r['golovi'],
                      r['asistencije'], r['zuti'], r['crveni'], r['minute']))
                # ON CONFLICT DO NOTHING reports rowcount 0 for skipped rows,
                # so only rows that were really written are counted.
                inserted += max(cur.rowcount, 0)
            except psycopg2.Error as exc:
                # The connection is in autocommit mode, so one failed INSERT
                # does not affect the following ones.
                print(f"  ! insert failed for {r['sezona']!r}: {exc!r}",
                      file=sys.stderr, flush=True)
    return inserted


def main():
    """Scrape season statistics for players that have none stored yet.

    Selects up to 200 members with an HNS player id and no rows in
    pgz_sport.hns_player_seasons, opens each member's source_url in headless
    Chromium, parses the page's __NEXT_DATA__ JSON and inserts the resulting
    season rows. Failures on a single player are reported on stderr and do
    not stop the run.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    seasons_added = 0

    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}", flush=True)

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                page = browser.new_context(ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0").new_page()

                for i, t in enumerate(targets):
                    url = t['source_url']
                    # Only player pages on the expected site are scraped.
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        # Short extra pause after network idle
                        # (presumably to let client-side rendering settle).
                        time.sleep(0.8)

                        rows = _extract_season_rows(page.content())

                        if rows:
                            seasons_added += _insert_season_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: +{len(rows)} sezone (total: {seasons_added})", flush=True)

                        if i % 30 == 0 and i > 0:
                            print(f" [{i}/{len(targets)}] processed, total: {seasons_added}", flush=True)
                    except Exception as exc:
                        # Per-player boundary: report and move on to the next.
                        print(f"  ! [{i}/{len(targets)}] {url}: {exc!r}",
                              file=sys.stderr, flush=True)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"\n✅ Done. Total: {seasons_added}", flush=True)
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|