Files
pgz-sport/scripts/hns_pgz_full_v2.py
T

219 lines
8.9 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER v2 ispravljen URL roster-a
Koristi sub1_hns_catalog.json za točne URL-ove klubova
"""
import os, re, sys, time, logging, json
from datetime import datetime, timedelta
import requests
import psycopg2
from psycopg2.extras import execute_values
# ─── LOG ───────────────────────────────────────────
# The log directory is created eagerly so the file handler below can open
# its target file.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)

# Root logging configuration: INFO and above go to the log file and to stdout.
logging.basicConfig(
    handlers=[
        logging.FileHandler(
            os.path.join(LOG_DIR, "hns_full.log"),
            encoding="utf-8",
        ),
        logging.StreamHandler(sys.stdout),
    ],
    format="%(asctime)s [FULL] %(message)s",
    level=logging.INFO,
)

# Named logger for this script.
log = logging.getLogger("hns_full")
# ─── DB ────────────────────────────────────────────
from dotenv import load_dotenv
from psycopg2.extensions import make_dsn

# Load variables from the shared env file into os.environ
# (presumably this is where DB_PASSWORD comes from — verify on the host).
load_dotenv('/opt/rinet-gpu/.env.master')

# Build the libpq connection string with make_dsn instead of raw string
# interpolation: a password containing spaces, quotes or backslashes is then
# escaped properly rather than corrupting the DSN.
# A missing DB_PASSWORD still raises KeyError at import time, as before.
DSN = make_dsn(
    host="10.10.0.2",
    port="6432",
    dbname="rinet_v3",
    user="rinet",
    password=os.environ['DB_PASSWORD'],
)
def get_conn():
    """Open and return a new psycopg2 connection built from the module DSN."""
    connection = psycopg2.connect(DSN)
    return connection
# User-Agent header sent with every request made by fetch().
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` when the server answers 404 (returned
        immediately, without retrying) or when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            # Only network/HTTP failures are retried; KeyboardInterrupt and
            # programming errors are no longer swallowed by a bare except.
            log.warning("Pokušaj %d/%d neuspješan za %s: %s", attempt, retries, url, exc)
            # Linear back-off between attempts; no pointless sleep after the
            # final attempt.
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None
# ─── PARSIRANJE ─────────────────────────────────────
def parse_roster(html):
    """Extract the players linked from a club page.

    Scans *html* for anchors whose ``href`` has the form ``/igraci/<id>/...``
    and returns a list of ``(hns_igrac_id, ime, prezime, profil_url)`` tuples
    in page order. The first word of the link text is taken as the first name
    and the remainder as the surname; links with no text or with a surname
    shorter than two characters are skipped.

    Each player id is returned at most once. The same player is often linked
    several times on a page, and duplicate ids in one batched
    ``INSERT ... ON CONFLICT DO UPDATE`` make PostgreSQL reject the statement.
    """
    # Imported under an alias because the parameter itself is named ``html``.
    from html import unescape as _unescape

    igraci = []
    seen_ids = set()
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen_ids:
            continue
        # Drop nested tags first, then decode entities (e.g. &nbsp;, &#268;)
        # so names are stored as real characters, then collapse whitespace.
        raw_name = re.sub(r'<[^>]+>', ' ', m.group(3))
        raw_name = re.sub(r'\s+', ' ', _unescape(raw_name)).strip()
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        if len(prezime) < 2:
            continue
        # Mark the id as seen only once a usable name was found, so an
        # image-only link does not hide the later text link of the same player.
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
def _to_int(value, default=0):
    """Coerce a scraped stat to ``int``; return *default* for None, '' or junk."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def parse_seasons(html, hns_igrac_id):
    """Return the per-season stat rows found in a player's profile page.

    Looks through the ``application/ld+json`` script blocks in *html* and
    uses the first one that is a JSON object carrying a ``playerSeason`` list.
    Each entry becomes a dict with the keys ``hns_igrac_id``, ``sezona``,
    ``klub_hns_id``, ``klub_naziv``, ``natjecanje``, ``nastupi``, ``golovi``,
    ``asistencije``, ``zuti``, ``crveni`` and ``minute``.

    Returns an empty list when no usable block exists. A missing, null or
    non-numeric counter becomes 0 instead of discarding every season.
    """
    pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'
    for block in re.finditer(pattern, html, re.DOTALL):
        try:
            data = json.loads(block.group(1))
        except ValueError:
            # Malformed JSON in this block; try the next one.
            continue
        # JSON-LD may legally be a list or scalar; only objects can hold the key.
        if not isinstance(data, dict):
            continue
        seasons_data = data.get('playerSeason')
        if not isinstance(seasons_data, list):
            continue
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "sezona": s.get("season", ""),
                "klub_hns_id": str(s.get("clubId", "")),
                "klub_naziv": s.get("clubName", ""),
                "natjecanje": s.get("competition", ""),
                "nastupi": _to_int(s.get("apps", 0)),
                "golovi": _to_int(s.get("goals", 0)),
                "asistencije": _to_int(s.get("assists", 0)),
                "zuti": _to_int(s.get("yellow", 0)),
                "crveni": _to_int(s.get("red", 0)),
                "minute": _to_int(s.get("minutes", 0)),
            }
            for s in seasons_data
            if isinstance(s, dict)
        ]
    return []
def parse_matches(html, hns_igrac_id):
    """Return the match rows found in a player's profile page.

    Looks through the ``application/ld+json`` script blocks in *html* and
    uses the first one that is a JSON object carrying a ``playerMatch`` list.
    Each entry becomes a dict with the keys ``hns_igrac_id``, ``datum``,
    ``domacin``, ``gost`` and ``rezultat``; missing fields default to ``""``.

    Returns an empty list when no usable block exists. Malformed JSON,
    non-object payloads and non-object entries are skipped explicitly.
    """
    pattern = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'
    for block in re.finditer(pattern, html, re.DOTALL):
        try:
            data = json.loads(block.group(1))
        except ValueError:
            # Malformed JSON in this block; try the next one.
            continue
        if not isinstance(data, dict):
            continue
        matches_data = data.get('playerMatch')
        if not isinstance(matches_data, list):
            continue
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "datum": entry.get("date", ""),
                "domacin": entry.get("homeTeam", ""),
                "gost": entry.get("awayTeam", ""),
                "rezultat": entry.get("result", ""),
            }
            for entry in matches_data
            if isinstance(entry, dict)
        ]
    return []
# ─── UPSERT ─────────────────────────────────────────
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi``.

    Args:
        conn: Open psycopg2 connection.
        players: Iterable of ``(hns_igrac_id, ime, prezime, source_url)``
            tuples.

    Does nothing for empty input. When the same ``hns_igrac_id`` occurs more
    than once, only the last occurrence is sent: PostgreSQL rejects a single
    ``INSERT ... ON CONFLICT DO UPDATE`` that would touch the same row twice.
    """
    if not players:
        return
    # Keyed by player id; later duplicates overwrite earlier ones.
    unique_rows = {p[0]: tuple(p) for p in players}
    sql = """INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)
VALUES %s
ON CONFLICT (hns_igrac_id) DO UPDATE SET
ime = EXCLUDED.ime, prezime = EXCLUDED.prezime, source_url = EXCLUDED.source_url"""
    with conn.cursor() as cur:
        execute_values(cur, sql, list(unique_rows.values()))
def upsert_seasons(conn, seasons):
    """Insert or update season rows in ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of dicts with the keys ``hns_igrac_id``, ``sezona``,
            ``natjecanje``, ``klub_hns_id``, ``klub_naziv``, ``nastupi``,
            ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and ``minute``.

    Does nothing for empty input. Rows sharing the conflict key
    ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)`` are collapsed to the
    last occurrence, because PostgreSQL rejects one
    ``INSERT ... ON CONFLICT DO UPDATE`` that would touch the same row twice.
    """
    if not seasons:
        return
    # scraped_at is refreshed on conflict: the caller reads MAX(scraped_at)
    # from this table to decide whether a player was scraped recently, so an
    # update that leaves it untouched would make existing rows look stale.
    sql = """INSERT INTO pgz_sport.hns_player_seasons
(hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
nastupi, golovi, asistencije, zuti, crveni, minute)
VALUES %s
ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
nastupi=EXCLUDED.nastupi, golovi=EXCLUDED.golovi,
asistencije=EXCLUDED.asistencije, zuti=EXCLUDED.zuti,
crveni=EXCLUDED.crveni, minute=EXCLUDED.minute,
klub_naziv=EXCLUDED.klub_naziv,
scraped_at=now()"""
    # Keyed by the ON CONFLICT target; later duplicates overwrite earlier ones.
    unique_rows = {}
    for s in seasons:
        key = (s['hns_igrac_id'], s['sezona'], s['klub_hns_id'], s['natjecanje'])
        unique_rows[key] = (
            s['hns_igrac_id'], s['sezona'], s['natjecanje'], s['klub_hns_id'],
            s['klub_naziv'], s['nastupi'], s['golovi'], s['asistencije'],
            s['zuti'], s['crveni'], s['minute'],
        )
    with conn.cursor() as cur:
        execute_values(cur, sql, list(unique_rows.values()), page_size=50)
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Each element of *matches* is a dict with the keys ``hns_igrac_id``,
    ``datum``, ``domacin``, ``gost`` and ``rezultat``. Rows that collide with
    an existing ``(hns_igrac_id, datum, domacin, gost)`` entry are left as
    they are. Empty input is a no-op.
    """
    if not matches:
        return

    statement = """INSERT INTO pgz_sport.hns_player_matches
(hns_igrac_id, datum, domacin, gost, rezultat)
VALUES %s
ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"""

    # Column order must match the column list in the INSERT above.
    rows = []
    for match in matches:
        rows.append((
            match['hns_igrac_id'],
            match['datum'],
            match['domacin'],
            match['gost'],
            match['rezultat'],
        ))

    with conn.cursor() as cursor:
        execute_values(cursor, statement, rows, page_size=50)
# ─── MAIN ───────────────────────────────────────────
def main():
    """Scrape rosters, season stats and matches for the clubs selected from the DB.

    Steps:
      1. Load the club catalog JSON and build a club-id -> club-page-URL map.
      2. Read the clubs with ``savez_id = 10`` and a non-null ``hns_klub_id``.
      3. For every club: fetch its page, upsert the roster, then fetch each
         player's profile and upsert seasons and matches. Players whose
         season rows were scraped within the last 7 days are skipped.

    The database connection is always closed, even when a step raises.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    conn = get_conn()
    try:
        conn.autocommit = True

        # 1. Load the catalog that holds the club URLs.
        # Explicit UTF-8 so decoding does not depend on the process locale.
        with open('/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json', 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        # Keys are normalised to str so the lookup works whether the catalog
        # id and the DB column are stored as int or as text.
        klub_url_map = {
            str(item['id']): f"https://semafor.hns.family/klubovi/{item['id']}/{item['slug']}/"
            for item in catalog
        }
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")

        total_players = total_seasons = total_matches = 0
        with conn.cursor() as cur:
            # 2. Clubs from the DB that have hns_klub_id and savez_id = 10.
            cur.execute("SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi WHERE savez_id = 10 AND hns_klub_id IS NOT NULL")
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, naziv, hns_id in klubovi:
                klub_url = klub_url_map.get(str(hns_id))
                if not klub_url:
                    log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                    continue
                log.info(f"🏟️ {naziv} → {klub_url}")
                html = fetch(klub_url)
                if not html:
                    log.warning(" ❌ Ne mogu dohvatiti stranicu kluba.")
                    continue
                players = parse_roster(html)
                if not players:
                    log.warning(" ⚠️ Nema igrača.")
                    continue

                # Upsert the roster before fetching per-player details.
                upsert_players(conn, players)
                log.info(f" 👥 {len(players)} igrača")

                for hns_igrac_id, _ime, _prezime, profile_url in players:
                    # Skip players whose seasons were scraped recently.
                    cur.execute("SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons WHERE hns_igrac_id = %s", (hns_igrac_id,))
                    last = cur.fetchone()[0]
                    # datetime.now(last.tzinfo) matches the timestamp's
                    # awareness, so the subtraction works for both naive and
                    # timezone-aware values.
                    if last and (datetime.now(last.tzinfo) - last) < timedelta(days=7):
                        continue
                    profile_html = fetch(profile_url)
                    if not profile_html:
                        continue
                    seasons = parse_seasons(profile_html, hns_igrac_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)
                    matches = parse_matches(profile_html, hns_igrac_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)
                    # Short pause between profile requests.
                    time.sleep(0.3)
                total_players += len(players)
    finally:
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()