116 lines
5.0 KiB
Python
Executable File
116 lines
5.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
from dotenv import load_dotenv
|
|
load_dotenv('/opt/rinet-gpu/.env.master')
|
|
# auto-added by patch_scrapers_with_dotenv.sh
|
|
"""HNS sezone retry — pojednostavljen extract."""
|
|
import os, time, re, json, sys
|
|
from datetime import datetime
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Connection string handed to psycopg2.connect() in main().
# The password is read from the DB_PASSWORD environment variable while this
# module is being loaded, so a missing variable raises KeyError at import time
# (presumably the variable comes from the .env file loaded earlier in this
# file -- verify against the deployment).
# NOTE(review): the password is interpolated without quoting; a value that
# contains spaces, quotes or backslashes would likely need escaping -- confirm.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
|
|
|
def find_seasons_in_obj(obj, found=None):
    """Collect every dict inside a nested structure that carries a season key.

    Walks dicts and lists depth-first (pre-order). A dict is recorded when it
    has a 'season' or 'sezona' key, and its values are still searched
    afterwards. Anything that is neither a dict nor a list is ignored.

    Args:
        obj: The structure to search (typically parsed JSON).
        found: Optional list to append matches to; a new list is created
            when omitted.

    Returns:
        The list of matching dicts (the same list object as ``found`` when
        one was supplied), in the order they were encountered.
    """
    matches = [] if found is None else found

    if isinstance(obj, dict):
        if any(key in obj for key in ('season', 'sezona')):
            matches.append(obj)
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        # Scalars (str, int, None, ...) have nothing to descend into.
        children = ()

    for child in children:
        find_seasons_in_obj(child, matches)
    return matches
|
|
|
|
# Compiled once; both patterns are applied for every scraped page / text line.
_NEXT_DATA_RE = re.compile(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>')
_SEASON_LINE_RE = re.compile(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$')


def _rows_from_next_data(html):
    """Return season rows found in the page's embedded __NEXT_DATA__ JSON.

    Only the season label is taken from the JSON; club, competition and the
    counters are left empty/zero. Returns an empty list when the script tag is
    missing or its payload cannot be parsed, so the caller can fall back.
    """
    found = _NEXT_DATA_RE.search(html)
    if not found:
        return []
    try:
        entries = find_seasons_in_obj(json.loads(found.group(1)))
    except (ValueError, RecursionError):
        # Malformed or pathologically nested JSON: best-effort, use the fallback.
        return []

    rows = []
    for entry in entries:
        sezona = entry.get('season') or entry.get('sezona')
        if sezona:
            rows.append({'sezona': str(sezona), 'klub': '', 'natjecanje': '',
                         'nastupi': 0, 'golovi': 0})
    return rows


def _rows_from_body_text(body):
    """Return season rows parsed from visible page text, one row per matching line.

    A matching line looks like ``20YY/YY <text> <int> [<int> ...]``; the first
    integer is stored as appearances and the second (if present) as goals.
    """
    rows = []
    for line in body.split('\n'):
        match = _SEASON_LINE_RE.match(line.strip())
        if not match:
            continue
        nums = [int(x) for x in match.group(3).split()]
        rows.append({
            'sezona': match.group(1), 'klub': match.group(2)[:200], 'natjecanje': '',
            'nastupi': nums[0] if nums else 0,
            'golovi': nums[1] if len(nums) > 1 else 0,
        })
    return rows


def _insert_rows(conn, target, rows):
    """Insert season rows for one player and return how many were actually stored.

    Rows skipped by ``ON CONFLICT DO NOTHING`` are not counted. A failing
    insert is reported and skipped so the remaining rows are still attempted.
    """
    inserted = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                        (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (target['hns_igrac_id'], target['clan_id'], r['sezona'], r['klub'],
                      r['natjecanje'], r['nastupi'], r['golovi']))
                # rowcount is 0 when the conflict clause skipped the row.
                inserted += max(cur.rowcount, 0)
            except psycopg2.Error as exc:
                print(f" ! insert failed for {target['hns_igrac_id']} {r['sezona']}: {exc}", flush=True)
    return inserted


def main():
    """Retry season scraping for players that have no stored seasons yet.

    Selects up to 200 members from ``pgz_sport.clanovi`` that have an
    ``hns_igrac_id`` but no row in ``pgz_sport.hns_player_seasons``, opens each
    member's ``source_url`` in headless Chromium, extracts seasons (embedded
    __NEXT_DATA__ JSON first, visible-text regex as fallback) and inserts them.
    Progress is printed to stdout. Errors for a single player are printed and
    the run continues with the next one.

    The database connection and the browser are always closed, even when an
    unexpected error aborts the run.
    """
    conn = psycopg2.connect(DSN)
    # Autocommit keeps one failed INSERT from aborting the statements after it.
    conn.autocommit = True
    seasons_added = 0

    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                  AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}", flush=True)

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(ignore_https_errors=True,
                                          user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0")
                page = ctx.new_page()

                for i, t in enumerate(targets):
                    url = t['source_url']
                    # Only player profile pages on the expected host are scraped.
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        try:
                            page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
                        except Exception:
                            # Best-effort wait: the page may simply lack these
                            # elements, extraction is attempted regardless.
                            pass
                        time.sleep(0.5)

                        rows = _rows_from_next_data(page.content())
                        if not rows:
                            rows = _rows_from_body_text(page.locator('body').inner_text())

                        if rows:
                            seasons_added += _insert_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)

                        if i % 20 == 0:
                            print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                    except Exception as e:
                        # One broken page must not stop the whole batch.
                        print(f" ❌ {t['ime']}: {e}", flush=True)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"\nDone. Total sezone added: {seasons_added}")
|
|
|
|
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|