feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+219
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
HNS PGŽ FULL SCRAPER v2 – ispravljen URL roster-a
|
||||
Koristi sub1_hns_catalog.json za točne URL-ove klubova
|
||||
"""
|
||||
|
||||
import os, re, sys, time, logging, json
|
||||
from datetime import datetime, timedelta
|
||||
import requests
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
# ─── LOG ───────────────────────────────────────────
|
||||
# Directory that receives the scraper's debug log; created if it is missing.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)

# Every record is written both to the log file and to stdout.
_log_file_handler = logging.FileHandler(
    os.path.join(LOG_DIR, "hns_full.log"),
    encoding="utf-8",
)
_log_stdout_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    format="%(asctime)s [FULL] %(message)s",
    level=logging.INFO,
    handlers=[_log_file_handler, _log_stdout_handler],
)

# Module-wide logger used by the rest of the script.
log = logging.getLogger("hns_full")
|
||||
|
||||
# ─── DB ────────────────────────────────────────────
|
||||
from dotenv import load_dotenv
|
||||
# Load the shared env file; presumably this is what provides DB_PASSWORD below.
load_dotenv('/opt/rinet-gpu/.env.master')


def _quote_conninfo_value(value):
    """Return *value* single-quoted and escaped for a libpq keyword/value string.

    libpq requires values containing spaces, quotes or backslashes to be
    wrapped in single quotes, with ``'`` and ``\\`` escaped by a backslash.
    Quoting unconditionally is valid for plain values as well.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# A missing DB_PASSWORD raises KeyError here, at import time.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_quote_conninfo_value(os.environ['DB_PASSWORD'])}"
)


def get_conn():
    """Open and return a new psycopg2 connection built from the module DSN."""
    return psycopg2.connect(DSN)
|
||||
|
||||
# HTTP header values are encoded as latin-1 by http.client; the previous value
# contained "Ž", which cannot be encoded and made every request raise
# UnicodeEncodeError. Keep this string ASCII-only.
UA = "Mozilla/5.0 (Ri.NET PGZ Sport Bot)"


def fetch(url, retries=3):
    """Download *url* with a GET request and return the body as text.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response text, or ``None`` when the server answers 404 or when
        every attempt fails with a ``requests`` error.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                # A missing page is a definitive answer; do not retry.
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            # Only network/HTTP failures are retried; programming errors and
            # KeyboardInterrupt now propagate instead of being swallowed.
            log.warning("fetch failed (%d/%d) %s: %s", attempt, retries, url, exc)
            if attempt < retries:
                # Linear back-off: 1.5 s, 3 s, ... (no sleep after the last try).
                time.sleep(1.5 * attempt)
    return None
|
||||
|
||||
# ─── PARSIRANJE ─────────────────────────────────────
|
||||
def parse_roster(html):
    """Extract player links from a club page.

    Scans *html* for anchors whose ``href`` has the form ``/igraci/<id>/...``.
    The anchor text (tags stripped, entities decoded, whitespace collapsed) is
    split at the first space: the first word becomes ``ime`` and the
    remainder ``prezime``. Anchors with no text, or with a remainder shorter
    than two characters, are ignored.

    Args:
        html: Club page markup as a string.

    Returns:
        A list of ``(hns_igrac_id, ime, prezime, profil_url)`` tuples, one per
        distinct player id, in order of first appearance.
    """
    # Local import: the parameter name `html` shadows the stdlib module in
    # this scope, so only the needed function is bound.
    from html import unescape

    igraci = []
    # The same player is often linked more than once (photo + name). Duplicate
    # ids in one batch would make INSERT ... ON CONFLICT DO UPDATE fail.
    seen_ids = set()
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen_ids:
            continue
        # Strip tags first, then decode entities, so an escaped "<" in the
        # text is never mistaken for markup.
        raw_name = unescape(re.sub(r'<[^>]+>', ' ', m.group(3)))
        raw_name = re.sub(r'\s+', ' ', raw_name).strip()
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        if len(prezime) < 2:
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
|
||||
|
||||
def _season_int(value):
    """Coerce a JSON stat value to int; null, blank or non-numeric become 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def parse_seasons(html, hns_igrac_id):
    """Extract per-season statistics from the JSON-LD embedded in a profile page.

    Every ``<script type="application/ld+json">`` element is examined in
    document order; the first one that decodes to an object with a non-empty
    ``playerSeason`` list is used.

    Args:
        html: Player profile markup as a string.
        hns_igrac_id: Player id copied into every returned record.

    Returns:
        A list of dicts with keys ``hns_igrac_id``, ``sezona``,
        ``klub_hns_id``, ``klub_naziv``, ``natjecanje``, ``nastupi``,
        ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and ``minute``.
        Empty when no usable JSON-LD block is found.
    """
    for block in re.finditer(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL):
        try:
            data = json.loads(block.group(1))
        except json.JSONDecodeError as exc:
            # Malformed JSON is reported and skipped rather than silently
            # swallowed together with every other possible error.
            logging.getLogger("hns_full").warning(
                "invalid JSON-LD for player %s: %s", hns_igrac_id, exc
            )
            continue
        if not isinstance(data, dict):
            continue
        seasons_data = data.get('playerSeason')
        if not isinstance(seasons_data, list) or not seasons_data:
            continue
        # Numeric fields go through _season_int so a single null value no
        # longer discards every season of the player.
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "sezona": s.get("season", ""),
                "klub_hns_id": str(s.get("clubId", "")),
                "klub_naziv": s.get("clubName", ""),
                "natjecanje": s.get("competition", ""),
                "nastupi": _season_int(s.get("apps", 0)),
                "golovi": _season_int(s.get("goals", 0)),
                "asistencije": _season_int(s.get("assists", 0)),
                "zuti": _season_int(s.get("yellow", 0)),
                "crveni": _season_int(s.get("red", 0)),
                "minute": _season_int(s.get("minutes", 0)),
            }
            for s in seasons_data
            if isinstance(s, dict)
        ]
    return []
|
||||
|
||||
def parse_matches(html, hns_igrac_id):
    """Extract the match list from the JSON-LD embedded in a profile page.

    Every ``<script type="application/ld+json">`` element is examined in
    document order; the first one that decodes to an object with a non-empty
    ``playerMatch`` list is used.

    Args:
        html: Player profile markup as a string.
        hns_igrac_id: Player id copied into every returned record.

    Returns:
        A list of dicts with keys ``hns_igrac_id``, ``datum``, ``domacin``,
        ``gost`` and ``rezultat``. Empty when no usable JSON-LD block is found.
    """
    for block in re.finditer(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL):
        try:
            data = json.loads(block.group(1))
        except json.JSONDecodeError as exc:
            # Report malformed JSON instead of hiding it behind a bare except.
            logging.getLogger("hns_full").warning(
                "invalid JSON-LD for player %s: %s", hns_igrac_id, exc
            )
            continue
        if not isinstance(data, dict):
            continue
        matches_data = data.get('playerMatch')
        if not isinstance(matches_data, list) or not matches_data:
            continue
        # Non-object entries (null, strings) are skipped rather than aborting
        # the whole list.
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "datum": m.get("date", ""),
                "domacin": m.get("homeTeam", ""),
                "gost": m.get("awayTeam", ""),
                "rezultat": m.get("result", ""),
            }
            for m in matches_data
            if isinstance(m, dict)
        ]
    return []
|
||||
|
||||
# ─── UPSERT ─────────────────────────────────────────
|
||||
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi`` keyed by ``hns_igrac_id``.

    Args:
        conn: Open psycopg2 connection.
        players: Iterable of ``(hns_igrac_id, ime, prezime, source_url)``
            tuples. May be empty, in which case nothing is executed.

    Returns:
        None.
    """
    if not players:
        return
    # PostgreSQL rejects an INSERT ... ON CONFLICT DO UPDATE that would touch
    # the same row twice in one statement, so keep only the last tuple per id.
    unique_players = list({p[0]: p for p in players}.values())
    sql = """INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)
             VALUES %s
             ON CONFLICT (hns_igrac_id) DO UPDATE SET
                 ime = EXCLUDED.ime, prezime = EXCLUDED.prezime, source_url = EXCLUDED.source_url"""
    with conn.cursor() as cur:
        execute_values(cur, sql, unique_players)
|
||||
|
||||
def upsert_seasons(conn, seasons):
    """Insert or update rows in ``pgz_sport.hns_player_seasons``.

    Rows are keyed by ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)``; on
    conflict the statistic columns and ``klub_naziv`` are overwritten.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of season dicts with the keys ``hns_igrac_id``,
            ``sezona``, ``natjecanje``, ``klub_hns_id``, ``klub_naziv``,
            ``nastupi``, ``golovi``, ``asistencije``, ``zuti``, ``crveni``
            and ``minute``. An empty list is a no-op.

    Returns:
        None.
    """
    if not seasons:
        return
    sql = """INSERT INTO pgz_sport.hns_player_seasons
             (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
              nastupi, golovi, asistencije, zuti, crveni, minute)
             VALUES %s
             ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
                 nastupi=EXCLUDED.nastupi, golovi=EXCLUDED.golovi,
                 asistencije=EXCLUDED.asistencije, zuti=EXCLUDED.zuti,
                 crveni=EXCLUDED.crveni, minute=EXCLUDED.minute,
                 klub_naziv=EXCLUDED.klub_naziv"""
    # Two records sharing the conflict key in one batch would make PostgreSQL
    # abort the statement (a row cannot be updated twice by one ON CONFLICT
    # DO UPDATE), so collapse them and let the last record win.
    rows_by_key = {}
    for s in seasons:
        key = (s['hns_igrac_id'], s['sezona'], s['klub_hns_id'], s['natjecanje'])
        rows_by_key[key] = (
            s['hns_igrac_id'], s['sezona'], s['natjecanje'], s['klub_hns_id'],
            s['klub_naziv'], s['nastupi'], s['golovi'], s['asistencije'],
            s['zuti'], s['crveni'], s['minute'],
        )
    vals = list(rows_by_key.values())
    with conn.cursor() as cur:
        execute_values(cur, sql, vals, page_size=50)
|
||||
|
||||
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Rows that collide on ``(hns_igrac_id, datum, domacin, gost)`` are left
    untouched. An empty *matches* list results in no database call.

    Args:
        conn: Open psycopg2 connection.
        matches: List of dicts with the keys ``hns_igrac_id``, ``datum``,
            ``domacin``, ``gost`` and ``rezultat``.

    Returns:
        None.
    """
    if matches:
        statement = """INSERT INTO pgz_sport.hns_player_matches
             (hns_igrac_id, datum, domacin, gost, rezultat)
             VALUES %s
             ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"""
        columns = ('hns_igrac_id', 'datum', 'domacin', 'gost', 'rezultat')
        rows = [tuple(match[column] for column in columns) for match in matches]
        with conn.cursor() as cursor:
            execute_values(cursor, statement, rows, page_size=50)
|
||||
|
||||
# ─── MAIN ───────────────────────────────────────────
|
||||
def _scrape_player(conn, cur, hns_igrac_id, profile_url):
    """Fetch one player profile and upsert its seasons and matches.

    Args:
        conn: Open psycopg2 connection used for the upserts.
        cur: Cursor on *conn* used for the freshness lookup.
        hns_igrac_id: Player id.
        profile_url: Absolute URL of the player's profile page.

    Returns:
        ``(season_count, match_count)`` for the rows handed to the upserts;
        ``(0, 0)`` when the player's seasons were scraped less than 7 days
        ago or the profile could not be fetched.
    """
    cur.execute(
        "SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons WHERE hns_igrac_id = %s",
        (hns_igrac_id,),
    )
    last = cur.fetchone()[0]
    # Build "now" with the same tzinfo as the stored value: subtracting an
    # aware timestamp from a naive one raises TypeError.
    if last and (datetime.now(last.tzinfo) - last) < timedelta(days=7):
        return 0, 0

    html = fetch(profile_url)
    if not html:
        return 0, 0

    seasons = parse_seasons(html, hns_igrac_id)
    if seasons:
        upsert_seasons(conn, seasons)
    matches = parse_matches(html, hns_igrac_id)
    if matches:
        upsert_matches(conn, matches)

    # Short pause after each downloaded profile.
    time.sleep(0.3)
    return len(seasons), len(matches)


def main():
    """Scrape rosters, season statistics and match lists for the stored clubs.

    Steps:
      1. Load the club catalog JSON and build an id -> club page URL map.
      2. Read clubs with ``savez_id = 10`` and a non-null ``hns_klub_id``.
      3. For each club with a catalog URL: fetch the page, upsert its
         players, then process every player profile via ``_scrape_player``.

    The connection runs in autocommit mode and is closed even when a step
    raises.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    conn = get_conn()
    try:
        conn.autocommit = True

        # 1. Load the catalog that holds the club page slugs.
        with open('/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json', 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        # Keys are normalised to str so an int id in the catalog still matches
        # a text id from the database (and vice versa).
        klub_url_map = {
            str(item['id']): f"https://semafor.hns.family/klubovi/{item['id']}/{item['slug']}/"
            for item in catalog
        }
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")

        total_players = total_seasons = total_matches = 0

        with conn.cursor() as cur:
            # 2. Clubs that have an HNS id and belong to savez_id = 10.
            cur.execute("SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi WHERE savez_id = 10 AND hns_klub_id IS NOT NULL")
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, naziv, hns_id in klubovi:
                klub_url = klub_url_map.get(str(hns_id))
                if not klub_url:
                    log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                    continue
                log.info(f"🏟️ {naziv} → {klub_url}")

                html = fetch(klub_url)
                if not html:
                    log.warning(f" ❌ Ne mogu dohvatiti stranicu kluba.")
                    continue

                players = parse_roster(html)
                if not players:
                    log.warning(f" ⚠️ Nema igrača.")
                    continue

                # Store the roster first, then fetch per-player details.
                upsert_players(conn, players)
                log.info(f" 👥 {len(players)} igrača")

                for hns_igrac_id, _ime, _prezime, profile_url in players:
                    n_seasons, n_matches = _scrape_player(conn, cur, hns_igrac_id, profile_url)
                    total_seasons += n_seasons
                    total_matches += n_matches

                total_players += len(players)
    finally:
        # Release the connection on failure as well as on success.
        conn.close()

    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user