feat: /api/v2/analiza/* endpoints - sport analytics backend

This commit is contained in:
Damir Radulic
2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
+219
View File
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER v2 - corrected roster URL.
Uses sub1_hns_catalog.json for the exact club URLs.
"""
import os, re, sys, time, logging, json
from datetime import datetime, timedelta
import requests
import psycopg2
from psycopg2.extras import execute_values
# ─── LOG ───────────────────────────────────────────
# Directory that receives the scraper's log file; created if it is missing.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)

# Every record is written both to the log file and to stdout.
_log_handlers = [
    logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(
    format="%(asctime)s [FULL] %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)
log = logging.getLogger("hns_full")
# ─── DB ────────────────────────────────────────────
from dotenv import load_dotenv
# Load the shared env file into os.environ (presumably the source of
# DB_PASSWORD - verify against the deployment).
load_dotenv('/opt/rinet-gpu/.env.master')
# Build the connection string with make_dsn instead of an f-string so that a
# password containing spaces, quotes or backslashes is escaped correctly.
# A missing DB_PASSWORD still fails fast with KeyError at import time.
DSN = psycopg2.extensions.make_dsn(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ['DB_PASSWORD'],
)
def get_conn():
    """Open and return a new psycopg2 connection using the module-level DSN."""
    connection = psycopg2.connect(DSN)
    return connection
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts.

    Returns:
        The decoded response body, or None when the server answers 404 or
        every attempt failed with a network/HTTP error.
    """
    # http.client encodes str header values as latin-1, which cannot represent
    # the "Ž" in UA and raises UnicodeEncodeError on every request. Sending the
    # header as UTF-8 bytes avoids that.
    headers = {"User-Agent": UA.encode("utf-8")}
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=headers, timeout=15)
            if r.status_code == 404:
                # A missing page is permanent; retrying would not help.
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as exc:
            # Only network/HTTP failures are retried; programming errors and
            # KeyboardInterrupt propagate instead of being silently swallowed.
            log.warning("fetch failed (%d/%d) %s: %s", attempt, retries, url, exc)
            if attempt < retries:
                # Linear back-off between attempts: 1.5 s, 3 s, ...
                time.sleep(1.5 * attempt)
    return None
# ─── PARSIRANJE ─────────────────────────────────────
def parse_roster(html):
    """Extract the players linked from a club page.

    Scans *html* for anchors whose href looks like ``/igraci/<id>/...`` and
    turns the anchor text into a name. The first whitespace-separated word
    becomes ``ime`` and the remainder becomes ``prezime``.

    Args:
        html: Markup of the club page.

    Returns:
        A list of ``(hns_igrac_id, ime, prezime, profil_url)`` tuples in page
        order, with at most one entry per ``hns_igrac_id``. Anchors without
        text, or whose surname part is shorter than two characters, are
        skipped.
    """
    # Local import under an alias: the parameter name shadows the html module.
    from html import unescape as _unescape

    igraci = []
    seen_ids = set()
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen_ids:
            # The same player can be linked more than once (e.g. photo and
            # name). Duplicate ids would make the ON CONFLICT DO UPDATE insert
            # fail and would trigger redundant profile fetches.
            continue
        # Strip nested tags first, then decode entities (&nbsp;, &#269;, ...),
        # then collapse whitespace so a decoded non-breaking space is
        # normalised as well.
        raw_name = _unescape(re.sub(r'<[^>]+>', ' ', m.group(3)))
        raw_name = re.sub(r'\s+', ' ', raw_name).strip()
        ime, _, prezime = raw_name.partition(' ')
        if len(prezime) < 2:
            # Covers empty anchors (image-only links) and single-word labels.
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
def _to_int(value, default=0):
    """Coerce a JSON value to int, returning *default* for None or garbage."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def parse_seasons(html, hns_igrac_id):
    """Extract per-season statistics from the JSON-LD embedded in a page.

    Every ``<script type="application/ld+json">`` element is inspected in
    document order. The first one that decodes to an object with a non-empty
    ``playerSeason`` list supplies the result.

    Args:
        html: Markup of the player profile page.
        hns_igrac_id: Player id copied into every returned row.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``sezona``,
        ``klub_hns_id``, ``klub_naziv``, ``natjecanje``, ``nastupi``,
        ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and ``minute``.
        Empty when no usable JSON-LD is present. Counters that are missing,
        null or non-numeric become 0, so one bad field does not discard the
        whole page.
    """
    for block in re.finditer(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL):
        try:
            data = json.loads(block.group(1))
        except ValueError:
            # Malformed JSON in this script element; try the next one.
            continue
        if not isinstance(data, dict):
            # JSON-LD may also be a top-level array, which has no playerSeason.
            continue
        seasons_data = data.get('playerSeason')
        if not isinstance(seasons_data, list) or not seasons_data:
            continue
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "sezona": s.get("season", ""),
                "klub_hns_id": str(s.get("clubId", "")),
                "klub_naziv": s.get("clubName", ""),
                "natjecanje": s.get("competition", ""),
                "nastupi": _to_int(s.get("apps")),
                "golovi": _to_int(s.get("goals")),
                "asistencije": _to_int(s.get("assists")),
                "zuti": _to_int(s.get("yellow")),
                "crveni": _to_int(s.get("red")),
                "minute": _to_int(s.get("minutes")),
            }
            for s in seasons_data
            if isinstance(s, dict)
        ]
    return []
def parse_matches(html, hns_igrac_id):
    """Extract the match list from the JSON-LD embedded in a page.

    Every ``<script type="application/ld+json">`` element is inspected in
    document order. The first one that decodes to an object with a
    ``playerMatch`` list supplies the result.

    Args:
        html: Markup of the player profile page.
        hns_igrac_id: Player id copied into every returned row.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``datum``,
        ``domacin``, ``gost`` and ``rezultat``. Empty when no usable JSON-LD
        is present.
    """
    for block in re.finditer(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL):
        try:
            data = json.loads(block.group(1))
        except ValueError:
            # Malformed JSON in this script element; try the next one.
            continue
        if not isinstance(data, dict):
            # JSON-LD may also be a top-level array, which has no playerMatch.
            continue
        matches_data = data.get('playerMatch')
        if not isinstance(matches_data, list):
            continue
        return [
            {
                "hns_igrac_id": hns_igrac_id,
                "datum": m.get("date", ""),
                "domacin": m.get("homeTeam", ""),
                "gost": m.get("awayTeam", ""),
                "rezultat": m.get("result", ""),
            }
            for m in matches_data
            if isinstance(m, dict)
        ]
    return []
# ─── UPSERT ─────────────────────────────────────────
def upsert_players(conn, players):
    """Insert or update rows in ``pgz_sport.clanovi`` keyed by ``hns_igrac_id``.

    Args:
        conn: Open psycopg2 connection.
        players: Sequence of ``(hns_igrac_id, ime, prezime, source_url)``
            tuples. Nothing is executed when it is empty.
    """
    if not players:
        return
    # A single INSERT ... ON CONFLICT DO UPDATE may not touch the same row
    # twice, so keep only the last tuple seen for each hns_igrac_id.
    unique_players = list({p[0]: p for p in players}.values())
    sql = (
        "INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)\n"
        "VALUES %s\n"
        "ON CONFLICT (hns_igrac_id) DO UPDATE SET\n"
        "ime = EXCLUDED.ime, prezime = EXCLUDED.prezime, source_url = EXCLUDED.source_url"
    )
    with conn.cursor() as cur:
        execute_values(cur, sql, unique_players)
def upsert_seasons(conn, seasons):
    """Insert or update rows in ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of season dicts carrying the keys named in ``columns``
            below. Nothing is executed when it is empty.
    """
    if not seasons:
        return
    statement = (
        "INSERT INTO pgz_sport.hns_player_seasons\n"
        "(hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,\n"
        "nastupi, golovi, asistencije, zuti, crveni, minute)\n"
        "VALUES %s\n"
        "ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET\n"
        "nastupi=EXCLUDED.nastupi, golovi=EXCLUDED.golovi,\n"
        "asistencije=EXCLUDED.asistencije, zuti=EXCLUDED.zuti,\n"
        "crveni=EXCLUDED.crveni, minute=EXCLUDED.minute,\n"
        "klub_naziv=EXCLUDED.klub_naziv"
    )
    # Column order must match the column list of the INSERT above.
    columns = (
        'hns_igrac_id', 'sezona', 'natjecanje', 'klub_hns_id', 'klub_naziv',
        'nastupi', 'golovi', 'asistencije', 'zuti', 'crveni', 'minute',
    )
    rows = []
    for season in seasons:
        rows.append(tuple(season[column] for column in columns))
    with conn.cursor() as cursor:
        execute_values(cursor, statement, rows, page_size=50)
def upsert_matches(conn, matches):
    """Insert rows into ``pgz_sport.hns_player_matches``, skipping conflicts.

    Args:
        conn: Open psycopg2 connection.
        matches: List of match dicts with the keys ``hns_igrac_id``,
            ``datum``, ``domacin``, ``gost`` and ``rezultat``. Nothing is
            executed when it is empty.
    """
    if not matches:
        return
    statement = (
        "INSERT INTO pgz_sport.hns_player_matches\n"
        "(hns_igrac_id, datum, domacin, gost, rezultat)\n"
        "VALUES %s\n"
        "ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"
    )
    rows = []
    for match in matches:
        rows.append((
            match['hns_igrac_id'],
            match['datum'],
            match['domacin'],
            match['gost'],
            match['rezultat'],
        ))
    with conn.cursor() as cursor:
        execute_values(cursor, statement, rows, page_size=50)
# ─── MAIN ───────────────────────────────────────────
def _scraped_recently(cur, hns_igrac_id, max_age=timedelta(days=7)):
    """Return True when the player's season rows were scraped within *max_age*."""
    cur.execute("SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons WHERE hns_igrac_id = %s", (hns_igrac_id,))
    last = cur.fetchone()[0]
    if not last:
        return False
    # Take "now" in the timestamp's own tzinfo so the subtraction works for
    # both naive and timezone-aware values; mixing the two raises TypeError.
    return (datetime.now(last.tzinfo) - last) < max_age


def _scrape_club(conn, cur, naziv, klub_url):
    """Scrape one club and return ``(players, seasons, matches)`` counts."""
    log.info(f"🏟️ {naziv} → {klub_url}")
    html = fetch(klub_url)
    if not html:
        log.warning(" ❌ Ne mogu dohvatiti stranicu kluba.")
        return 0, 0, 0
    players = parse_roster(html)
    if not players:
        log.warning(" ⚠️ Nema igrača.")
        return 0, 0, 0
    # Store the roster first so the profile data has rows to refer to.
    upsert_players(conn, [(p[0], p[1], p[2], p[3]) for p in players])
    log.info(f" 👥 {len(players)} igrača")

    seasons_count = matches_count = 0
    for hns_igrac_id, _ime, _prezime, profile_url in players:
        if _scraped_recently(cur, hns_igrac_id):
            continue
        profile_html = fetch(profile_url)
        if not profile_html:
            continue
        seasons = parse_seasons(profile_html, hns_igrac_id)
        if seasons:
            upsert_seasons(conn, seasons)
            seasons_count += len(seasons)
        matches = parse_matches(profile_html, hns_igrac_id)
        if matches:
            upsert_matches(conn, matches)
            matches_count += len(matches)
        # Short pause between profile downloads to stay polite to the site.
        time.sleep(0.3)
    return len(players), seasons_count, matches_count


def main():
    """Run the full scrape: rosters, season stats and match lists.

    Loads club URLs from the JSON catalog, selects the clubs to process from
    ``pgz_sport.klubovi`` (``savez_id = 10`` with an ``hns_klub_id``), then
    scrapes each club that has a catalog URL. Players whose season rows were
    scraped within the last 7 days are skipped. The database connection is
    closed even when an error aborts the run.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    total_players = total_seasons = total_matches = 0
    conn = get_conn()
    try:
        # Each upsert is committed immediately, so an aborted run keeps
        # whatever was already stored.
        conn.autocommit = True

        # 1. Load the catalog that maps HNS club ids to club page URLs.
        with open('/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json', 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        klub_url_map = {
            item['id']: f"https://semafor.hns.family/klubovi/{item['id']}/{item['slug']}/"
            for item in catalog
        }
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")

        with conn.cursor() as cur:
            # 2. Fetch the clubs that have an hns_klub_id and savez_id = 10.
            cur.execute("SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi WHERE savez_id = 10 AND hns_klub_id IS NOT NULL")
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, naziv, hns_id in klubovi:
                klub_url = klub_url_map.get(hns_id)
                if not klub_url:
                    log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                    continue
                n_players, n_seasons, n_matches = _scrape_club(conn, cur, naziv, klub_url)
                total_players += n_players
                total_seasons += n_seasons
                total_matches += n_matches
    finally:
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")
# Script entry point: run the scrape only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()