Files

153 lines
7.3 KiB
Python
Executable File

"""
Multi-sport scraper base class.

Usage: subclass SportHarvester, set SPORT and SOURCE, and implement
scrape_klub() (plus scrape_player() where the subclass needs one; the base
class itself only calls scrape_klub()).
"""
# The docstring above must stay the first statement of the module, otherwise
# Python does not treat it as the module docstring.
#
# Load variables from the env file into the process environment before the
# module-level DSN below reads DB_PASSWORD / RINET_DSN from os.environ.
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
import os, time, json, re, sys
from datetime import datetime
from playwright.sync_api import sync_playwright
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
# libpq connection string used by SportHarvester.
# RINET_DSN, when present in the environment, is used verbatim. Otherwise the
# default DSN is assembled and DB_PASSWORD is required (a missing DB_PASSWORD
# raises KeyError at import time, so a misconfigured host fails fast).
DSN = os.getenv("RINET_DSN")
if DSN is None:
    # Built only on this branch: as a getenv() default argument the f-string
    # was evaluated unconditionally, so a missing DB_PASSWORD crashed the
    # import even when RINET_DSN had been supplied.
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
class SportHarvester:
    """Base class for per-sport scrapers writing into the ``pgz_sport`` schema.

    Subclasses set ``SPORT`` and ``SOURCE`` and implement ``scrape_klub()``.
    ``run()`` opens a headless Chromium page, calls ``scrape_klub()`` for every
    target club and reports the counters in ``self.stats``; the ``upsert_*``
    helpers persist what the subclass extracts.

    An instance owns an autocommit database connection (``self.conn``) and an
    open log file (``self.log_file``). Call ``close()`` or use the instance as
    a context manager to release them.
    """

    SPORT = None  # override: sport key used in queries, log lines and the log file name
    SOURCE = None  # override: source identifier stored with every written row
    LOG_DIR = "/var/log/pgz-sport-debug"  # directory receiving harvest_*.log files

    # Single-character transliteration applied by slugify() after lower-casing.
    _TRANSLIT = str.maketrans("čćžšđ", "cczsd")

    # Numeric columns of pgz_sport.player_stats, in the order used by the
    # INSERT in upsert_stats(); each is looked up by name in ``stats_dict``.
    _STAT_COLUMNS = (
        'nastupi', 'golovi', 'asistencije', 'bodovi', 'trice', 'skokovi',
        'blokade', 'servis_asovi', 'zuti', 'crveni', 'minute',
    )

    def __init__(self):
        """Connect to the database (autocommit) and open the per-run log file.

        Raises:
            OSError: if the log directory or log file cannot be created; the
                database connection is closed again before re-raising.
        """
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}
        stamp = datetime.now().strftime('%Y%m%d_%H%M')
        log_path = f"{self.LOG_DIR}/harvest_{self.SPORT}_{stamp}.log"
        try:
            os.makedirs(self.LOG_DIR, exist_ok=True)
            # Explicit UTF-8: log lines carry emoji and diacritics, which a
            # locale-dependent default encoding may be unable to encode.
            self.log_file = open(log_path, "a", encoding="utf-8")
        except OSError:
            # Do not leak the connection when the log file cannot be opened.
            self.conn.close()
            raise

    def __enter__(self):
        """Return the harvester itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Release resources on leaving a ``with`` block; never suppresses errors."""
        self.close()
        return False

    def close(self):
        """Close the log file and the database connection.

        Safe to call more than once and on partially initialised instances.
        """
        log_file = getattr(self, "log_file", None)
        if log_file is not None and not log_file.closed:
            log_file.close()
        conn = getattr(self, "conn", None)
        if conn is not None and not conn.closed:
            conn.close()

    def log(self, msg):
        """Print a timestamped, sport-tagged line and append it to the log file."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        self.log_file.write(line + "\n")
        self.log_file.flush()

    def slugify(self, s):
        """Return a lower-case, hyphen-separated ASCII slug of ``s``.

        ``č ć ž š đ`` are transliterated; every other character outside
        ``a-z``, ``0-9``, whitespace and ``-`` is dropped. Falsy input yields
        an empty string.
        """
        if not s:
            return ""
        text = s.lower().strip().translate(self._TRANSLIT)
        text = re.sub(r'[^a-z0-9\s-]', '', text)
        return re.sub(r'\s+', '-', text).strip('-')

    def get_target_klubovi(self, limit=999):
        """Get PGŽ priority klubovi for this sport.

        Reads ``pgz_sport.v_pgz_priority_klubovi`` for ``self.SPORT``, keeping
        rows where ``financiran`` or ``u_godisnjaku`` is true, ordered by
        ``financiran``, then ``u_godisnjaku`` (both descending), then ``id``.

        Args:
            limit: maximum number of rows to return.

        Returns:
            A list of dict-like rows (``RealDictCursor``).
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url, kategorija=None, sezona=None, extra=None):
        """Upsert player + return clan_id.

        The player is matched on ``(self.SOURCE, str(source_id))``. For an
        existing row the stored ``ime``/``prezime`` are kept unless empty,
        ``klub_id`` and ``sport`` are filled only when NULL, ``source_url`` and
        the timestamps are refreshed and ``extra`` is merged into ``metadata``.
        Otherwise a new active row is inserted.

        Args:
            klub_id: club the player belongs to.
            source_id: player identifier at the source; stored as text.
            ime, prezime: first / last name; runs of whitespace are collapsed.
            source_url: URL the data was scraped from.
            kategorija: optional category; when truthy a
                ``pgz_sport.clan_kategorije`` row is added (conflicts ignored).
            sezona: season stored with the category row.
            extra: optional dict serialised to JSON for ``metadata``.

        Returns:
            The ``id`` of the matched or newly inserted ``clanovi`` row.
        """
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        metadata_json = json.dumps(extra or {})
        with self.conn.cursor() as cur:
            # Try find existing by source+source_id
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, str(source_id)))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi
                    SET ime = COALESCE(NULLIF(ime,''), %s),
                        prezime = COALESCE(NULLIF(prezime,''), %s),
                        klub_id = COALESCE(klub_id, %s),
                        source_url = %s, last_updated = now(), last_scraped_at = now(),
                        sport = COALESCE(sport, %s),
                        metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT, metadata_json, clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_id, source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE, str(source_id), source_url, metadata_json))
                clan_id = cur.fetchone()[0]
            # Add kategorija if specified (many-to-many)
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                    (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE, source_url))
            return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija, stats_dict, raw=None):
        """Upsert player_stats row.

        Inserts one ``pgz_sport.player_stats`` row; on a conflict over
        ``(clan_id, sport, sezona, klub_id, natjecanje)`` the numeric columns,
        ``metadata`` and ``scraped_at`` are overwritten.

        Args:
            clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija:
                identifying columns of the row.
            stats_dict: mapping with any of the keys in ``_STAT_COLUMNS``;
                missing keys are stored as NULL. ``None`` is treated as empty.
            raw: optional dict serialised to JSON for ``metadata``.
        """
        stats_dict = stats_dict or {}
        params = (
            (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv, natjecanje, kategorija)
            + tuple(stats_dict.get(column) for column in self._STAT_COLUMNS)
            + (json.dumps(raw or {}),)
        )
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                (clan_id, sport, source, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                 nastupi, golovi, asistencije, bodovi, trice, skokovi, blokade, servis_asovi,
                 zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                    nastupi = EXCLUDED.nastupi, golovi = EXCLUDED.golovi,
                    asistencije = EXCLUDED.asistencije, bodovi = EXCLUDED.bodovi,
                    trice = EXCLUDED.trice, skokovi = EXCLUDED.skokovi,
                    blokade = EXCLUDED.blokade, servis_asovi = EXCLUDED.servis_asovi,
                    zuti = EXCLUDED.zuti, crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                    metadata = EXCLUDED.metadata, scraped_at = now()
            """, params)

    def run(self, limit=999):
        """Scrape every target club and report the final counters.

        A failure inside ``scrape_klub()`` is counted in ``stats['errors']``,
        logged, and does not stop the remaining clubs. After the loop a
        best-effort Telegram message with the counters is sent.

        Args:
            limit: maximum number of clubs, passed to ``get_target_klubovi()``.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()
                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        self.stats['errors'] += 1
                        # Building the label must not raise, or one odd row
                        # would abort the whole run from inside the handler.
                        try:
                            label = f"{klub['id']} {klub['naziv']}"
                        except (KeyError, IndexError, TypeError):
                            label = repr(klub)
                        self.log(f" ❌ Klub {label}: {e}")
            finally:
                # Close the browser even when something other than a
                # per-club error (e.g. KeyboardInterrupt) unwinds the loop.
                browser.close()
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram()

    def _notify_telegram(self):
        """Send the final counters to Telegram via curl; failures are only logged."""
        import subprocess
        # NOTE(review): the bot token and chat id are committed in source and
        # passed on the curl command line. They are kept here so behaviour is
        # unchanged; rotate the token and move both to the env file.
        try:
            subprocess.run(["curl","-s","-X","POST",
                "https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
                "-d","chat_id=7969491558",
                "--data-urlencode", f"text={self.SPORT.upper()} harvest done: {self.stats}"],
                timeout=8, capture_output=True)
        except Exception as exc:
            # Log the exception type only: subprocess error messages embed the
            # full command line, which would copy the token into the log.
            self.log(f"Telegram notification failed: {type(exc).__name__}")

    def scrape_klub(self, page, klub):
        """Scrape one club; must be implemented by the subclass.

        Args:
            page: the Playwright page created by ``run()``.
            klub: one row returned by ``get_target_klubovi()``.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError("subclass must implement")