from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh
"""
Multi-sport scrape base class.

Usage: subclass SportHarvester, set SPORT and SOURCE, and implement
scrape_klub() (the original notes also mention a scrape_player() helper,
which is left entirely to the subclass).
"""
# NOTE: the string above is a plain expression, not the module docstring,
# because the dotenv bootstrap lines precede it. They are kept first on
# purpose: the environment must be loaded before DSN is computed below.
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime

from playwright.sync_api import sync_playwright
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values


def _build_dsn():
    """Return the PostgreSQL DSN used by every harvester.

    RINET_DSN wins when it is set (even to an empty string). Only when it is
    absent is the default DSN assembled, which is the only case that needs
    DB_PASSWORD.

    Raises:
        KeyError: if neither RINET_DSN nor DB_PASSWORD is set.
    """
    explicit = os.getenv("RINET_DSN")
    if explicit is not None:
        return explicit
    # Read DB_PASSWORD lazily so a configured RINET_DSN never trips over a
    # missing DB_PASSWORD.
    return (
        "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
        f"password={os.environ['DB_PASSWORD']}"
    )


DSN = _build_dsn()


class SportHarvester:
    """Base class for per-sport scrapers writing into the pgz_sport schema.

    Subclasses set SPORT and SOURCE and implement scrape_klub(). The base
    class provides DB upsert helpers, logging, and the run() driver that
    opens a headless Chromium page and feeds each target club to
    scrape_klub().

    Instances hold an open DB connection and an open log file; call close()
    (or use the instance as a context manager) when done.
    """

    SPORT = None   # override
    SOURCE = None  # override

    # Directory receiving one log file per harvester instance.
    LOG_DIR = "/var/log/pgz-sport-debug"

    # Numeric player_stats columns, in the order used by the INSERT in
    # upsert_stats(); stats_dict is looked up by these same keys.
    STAT_COLUMNS = (
        'nastupi', 'golovi', 'asistencije', 'bodovi', 'trice', 'skokovi',
        'blokade', 'servis_asovi', 'zuti', 'crveni', 'minute',
    )

    # Croatian diacritics folded to ASCII by slugify() (applied after lower()).
    _SLUG_TRANSLATION = str.maketrans(
        {'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd'}
    )

    def __init__(self):
        """Connect to the database (autocommit) and open the log file."""
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        # 'players' and 'stats' are not touched in this class; presumably
        # subclasses increment them from scrape_klub().
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}
        os.makedirs(self.LOG_DIR, exist_ok=True)
        stamp = datetime.now().strftime('%Y%m%d_%H%M')
        log_path = os.path.join(self.LOG_DIR, f"harvest_{self.SPORT}_{stamp}.log")
        # Explicit UTF-8: log lines contain emoji and club names with
        # diacritics, which a non-UTF-8 locale encoding cannot represent.
        self.log_file = open(log_path, "a", encoding="utf-8")

    def close(self):
        """Close the log file and the DB connection; safe to call twice."""
        log_file = getattr(self, 'log_file', None)
        if log_file is not None and not log_file.closed:
            log_file.close()
        conn = getattr(self, 'conn', None)
        if conn is not None and not conn.closed:
            conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False  # never suppress exceptions

    def log(self, msg):
        """Print a timestamped line and append it to the log file."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        # After close() only stdout remains available.
        if not self.log_file.closed:
            self.log_file.write(line + "\n")
            self.log_file.flush()

    def slugify(self, s):
        """Return a lowercase ASCII slug of *s* ("" for empty/None input).

        Croatian diacritics are folded to ASCII, every other character
        outside [a-z0-9], whitespace and '-' is dropped, whitespace runs
        become a single '-', and leading/trailing '-' are stripped.
        """
        if not s:
            return ""
        text = s.lower().strip().translate(self._SLUG_TRANSLATION)
        text = re.sub(r'[^a-z0-9\s-]', '', text)
        return re.sub(r'\s+', '-', text).strip('-')

    def get_target_klubovi(self, limit=999):
        """Get PGŽ priority klubovi for this sport.

        Returns up to *limit* rows (as dict-like rows) from
        pgz_sport.v_pgz_priority_klubovi for self.SPORT that are either
        `financiran` or `u_godisnjaku`, those flags sorted first.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url,
                    kategorija=None, sezona=None, extra=None):
        """Upsert player + return clan_id.

        The player is matched on (self.SOURCE, str(source_id)). An existing
        row keeps its non-empty ime/prezime, its klub_id and its sport; only
        missing values are filled in, source_url is overwritten, and *extra*
        is merged into the metadata JSON. Otherwise a new active row is
        inserted. When *kategorija* is given, a clan_kategorije link row is
        added as well.

        Args:
            klub_id: club id stored on the player / category link.
            source_id: player id at the source; stored as text.
            ime, prezime: first / last name; whitespace runs are collapsed.
            source_url: URL the player was scraped from.
            kategorija: optional category to link (many-to-many).
            sezona: season stored on the category link.
            extra: optional dict serialised into the metadata JSONB column.

        Returns:
            The id of the matched or newly inserted pgz_sport.clanovi row.
        """
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        metadata_json = json.dumps(extra or {})
        source_key = str(source_id)
        with self.conn.cursor() as cur:
            # Try find existing by source+source_id
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, source_key))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi SET
                        ime = COALESCE(NULLIF(ime,''), %s),
                        prezime = COALESCE(NULLIF(prezime,''), %s),
                        klub_id = COALESCE(klub_id, %s),
                        source_url = %s,
                        last_updated = now(),
                        last_scraped_at = now(),
                        sport = COALESCE(sport, %s),
                        metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT,
                      metadata_json, clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                        (klub_id, ime, prezime, sport, source, source_id,
                         source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE,
                      source_key, source_url, metadata_json))
                clan_id = cur.fetchone()[0]

            # Add kategorija if specified (many-to-many)
            # NOTE(review): with sezona=None the ON CONFLICT target may never
            # match (NULLs are distinct in a plain unique index) — confirm
            # against the table definition.
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                        (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE,
                      source_url))
            return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje,
                     kategorija, stats_dict, raw=None):
        """Upsert player_stats row.

        Inserts one row keyed by (clan_id, sport, sezona, klub_id,
        natjecanje); on conflict the numeric stat columns and metadata are
        overwritten and scraped_at is refreshed (klub_naziv and kategorija
        are left as first inserted).

        Args:
            stats_dict: mapping read by the keys in STAT_COLUMNS; missing
                keys are stored as NULL. None is treated as empty.
            raw: optional dict serialised into the metadata JSONB column.
        """
        stats = stats_dict or {}
        stat_values = tuple(stats.get(column) for column in self.STAT_COLUMNS)
        params = (
            (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv,
             natjecanje, kategorija)
            + stat_values
            + (json.dumps(raw or {}),)
        )
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                    (clan_id, sport, source, sezona, klub_id, klub_naziv,
                     natjecanje, kategorija,
                     nastupi, golovi, asistencije, bodovi, trice, skokovi,
                     blokade, servis_asovi, zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s,
                        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                    nastupi = EXCLUDED.nastupi,
                    golovi = EXCLUDED.golovi,
                    asistencije = EXCLUDED.asistencije,
                    bodovi = EXCLUDED.bodovi,
                    trice = EXCLUDED.trice,
                    skokovi = EXCLUDED.skokovi,
                    blokade = EXCLUDED.blokade,
                    servis_asovi = EXCLUDED.servis_asovi,
                    zuti = EXCLUDED.zuti,
                    crveni = EXCLUDED.crveni,
                    minute = EXCLUDED.minute,
                    metadata = EXCLUDED.metadata,
                    scraped_at = now()
            """, params)

    def run(self, limit=999):
        """Scrape every target club and report the totals.

        Fetches up to *limit* clubs, opens one headless Chromium page and
        calls scrape_klub(page, klub) for each. A failing club is counted in
        stats['errors'] and logged; it does not stop the run. A Telegram
        summary is sent afterwards when credentials are configured.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")

        with sync_playwright() as pw:
            browser = pw.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--ignore-certificate-errors"],
            )
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()
                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        # Per-club boundary: one broken club must not abort
                        # the harvest. .get() keeps the handler itself from
                        # raising on an unexpected row shape.
                        self.stats['errors'] += 1
                        self.log(f" ❌ Klub {klub.get('id')} {klub.get('naziv')}: {e}")
            finally:
                # Also runs on KeyboardInterrupt/SystemExit.
                browser.close()

        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"{str(self.SPORT).upper()} harvest done: {self.stats}")

    def _notify_telegram(self, text):
        """Best-effort Telegram message; never raises on delivery problems.

        Credentials come from TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID so no
        secret lives in source control. When either is missing the
        notification is skipped and that fact is logged.
        """
        token = os.getenv("TELEGRAM_BOT_TOKEN")
        chat_id = os.getenv("TELEGRAM_CHAT_ID")
        if not token or not chat_id:
            self.log("Telegram notification skipped: "
                     "TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID not set")
            return
        try:
            subprocess.run(
                ["curl", "-s", "-X", "POST",
                 f"https://api.telegram.org/bot{token}/sendMessage",
                 "-d", f"chat_id={chat_id}",
                 "--data-urlencode", f"text={text}"],
                timeout=8, capture_output=True,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            # Log only the exception type: TimeoutExpired's message embeds
            # the full command line, which contains the bot token.
            self.log(f"Telegram notification failed: {type(exc).__name__}")

    def scrape_klub(self, page, klub):
        """Scrape one club; must be implemented by the subclass.

        Args:
            page: the Playwright page opened by run().
            klub: one row returned by get_target_klubovi().
        """
        raise NotImplementedError("subclass must implement")