219 lines
8.9 KiB
Python
Executable File
219 lines
8.9 KiB
Python
Executable File
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER v2 – corrected roster URL.

Uses sub1_hns_catalog.json for the exact club URLs.
"""
|
||
|
||
import os, re, sys, time, logging, json
|
||
from datetime import datetime, timedelta
|
||
import requests
|
||
import psycopg2
|
||
from psycopg2.extras import execute_values
|
||
|
||
# ─── LOG ───────────────────────────────────────────
|
||
LOG_DIR = "/var/log/pgz-sport-debug"
# The log directory must exist before the file handler below opens its file.
os.makedirs(LOG_DIR, exist_ok=True)

# Every record goes both to a UTF-8 log file and to stdout.
_file_handler = logging.FileHandler(
    os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"
)
_stdout_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[_file_handler, _stdout_handler],
)

# Logger used by the rest of this script.
log = logging.getLogger("hns_full")
|
||
|
||
# ─── DB ────────────────────────────────────────────
|
||
from dotenv import load_dotenv
|
||
# Load environment variables (including DB_PASSWORD) from the shared env file.
load_dotenv('/opt/rinet-gpu/.env.master')

# libpq keyword/value connection strings need values containing spaces,
# quotes or backslashes to be single-quoted, with "\" and "'" escaped by a
# backslash. Quoting unconditionally keeps simple passwords working unchanged.
# A missing DB_PASSWORD still raises KeyError, as before.
_db_password = os.environ['DB_PASSWORD'].replace("\\", "\\\\").replace("'", "\\'")
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password='{_db_password}'"
)
|
||
|
||
def get_conn():
    """Open and return a new psycopg2 connection built from the module DSN."""
    connection = psycopg2.connect(DSN)
    return connection
|
||
|
||
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"


def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` when the server answers 404 or every
        attempt fails.
    """
    # The stdlib HTTP client encodes ``str`` header values as latin-1, which
    # cannot represent the "Ž" in UA; the request would raise before being
    # sent. Passing UTF-8 bytes avoids that without changing the UA constant.
    headers = {"User-Agent": UA.encode("utf-8")}
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code == 404:
                # A missing page will not appear on retry.
                return None
            response.raise_for_status()
            return response.text
        except Exception as exc:
            # Best-effort fetch: keep retrying on any error, but make the
            # failure visible and let KeyboardInterrupt/SystemExit propagate
            # (the previous bare ``except`` swallowed those too).
            log.warning("fetch failed (%d/%d) for %s: %s", attempt, retries, url, exc)
            if attempt < retries:
                # Linear back-off; no point sleeping after the last attempt.
                time.sleep(1.5 * attempt)
    return None
|
||
|
||
# ─── PARSIRANJE ─────────────────────────────────────
|
||
def parse_roster(html):
    """Extract the players linked from a club page.

    Every ``<a href="/igraci/<id>/...">`` anchor is inspected. The anchor text
    is stripped of tags, HTML entities are decoded and whitespace is
    collapsed; the first word becomes ``ime`` and the remainder ``prezime``.
    Anchors without text, or with a surname shorter than two characters, are
    skipped.

    Args:
        html: HTML source of the club page.

    Returns:
        List of ``(hns_igrac_id, ime, prezime, profil_url)`` tuples, at most
        one per player id (the first usable anchor wins).
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    igraci = []
    seen_ids = set()
    anchors = re.finditer(
        r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL
    )
    for m in anchors:
        hns_id = int(m.group(2))
        # A player can be linked more than once on the page. Duplicate ids in
        # one batched ON CONFLICT DO UPDATE make PostgreSQL reject the insert.
        if hns_id in seen_ids:
            continue
        text = re.sub(r'<[^>]+>', ' ', m.group(3))
        # Decode entities (e.g. &nbsp;, &#263;) before normalising whitespace.
        raw_name = re.sub(r'\s+', ' ', unescape(text)).strip()
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        if len(prezime) < 2:
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
|
||
|
||
def _season_int(value):
    """Coerce a JSON stat value to int; null, missing or non-numeric counts as 0."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def parse_seasons(html, hns_igrac_id):
    """Extract per-season statistics from the JSON-LD embedded in a player page.

    Every ``<script type="application/ld+json">`` block is examined and the
    first non-empty ``playerSeason`` list found is converted. Blocks that are
    not valid JSON are skipped instead of aborting the whole parse, and a null
    or malformed stat becomes 0 rather than discarding every season.

    Args:
        html: HTML source of the player profile page.
        hns_igrac_id: Player id copied into every returned dict.

    Returns:
        List of dicts with keys ``hns_igrac_id``, ``sezona``, ``klub_hns_id``,
        ``klub_naziv``, ``natjecanje``, ``nastupi``, ``golovi``,
        ``asistencije``, ``zuti``, ``crveni`` and ``minute``; an empty list
        when no season data is present.
    """
    blocks = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL
    )
    for raw in blocks:
        try:
            data = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError; try the next block.
            continue
        # A JSON-LD script may hold one object or an array of objects.
        nodes = data if isinstance(data, list) else [data]
        for node in nodes:
            if not isinstance(node, dict):
                continue
            seasons_data = node.get('playerSeason')
            if not isinstance(seasons_data, list) or not seasons_data:
                continue
            seasons = []
            for s in seasons_data:
                if not isinstance(s, dict):
                    continue
                club_id = s.get("clubId")
                seasons.append({
                    "hns_igrac_id": hns_igrac_id,
                    "sezona": s.get("season") or "",
                    # Avoid storing the literal string "None" for a null id.
                    "klub_hns_id": "" if club_id is None else str(club_id),
                    "klub_naziv": s.get("clubName") or "",
                    "natjecanje": s.get("competition") or "",
                    "nastupi": _season_int(s.get("apps")),
                    "golovi": _season_int(s.get("goals")),
                    "asistencije": _season_int(s.get("assists")),
                    "zuti": _season_int(s.get("yellow")),
                    "crveni": _season_int(s.get("red")),
                    "minute": _season_int(s.get("minutes")),
                })
            return seasons
    return []
|
||
|
||
def parse_matches(html, hns_igrac_id):
    """Extract the player's matches from the JSON-LD embedded in a profile page.

    Every ``<script type="application/ld+json">`` block is examined and the
    first non-empty ``playerMatch`` list found is converted. Blocks that are
    not valid JSON and list entries that are not objects are skipped instead
    of silently discarding the whole result.

    Args:
        html: HTML source of the player profile page.
        hns_igrac_id: Player id copied into every returned dict.

    Returns:
        List of dicts with keys ``hns_igrac_id``, ``datum``, ``domacin``,
        ``gost`` and ``rezultat``; an empty list when no match data is present.
    """
    blocks = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL
    )
    for raw in blocks:
        try:
            data = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError; try the next block.
            continue
        # A JSON-LD script may hold one object or an array of objects.
        nodes = data if isinstance(data, list) else [data]
        for node in nodes:
            if not isinstance(node, dict):
                continue
            matches_data = node.get('playerMatch')
            if not isinstance(matches_data, list) or not matches_data:
                continue
            return [
                {
                    "hns_igrac_id": hns_igrac_id,
                    "datum": m.get("date", ""),
                    "domacin": m.get("homeTeam", ""),
                    "gost": m.get("awayTeam", ""),
                    "rezultat": m.get("result", ""),
                }
                for m in matches_data
                if isinstance(m, dict)
            ]
    return []
|
||
|
||
# ─── UPSERT ─────────────────────────────────────────
|
||
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi``, keyed by ``hns_igrac_id``.

    Args:
        conn: Open psycopg2 connection.
        players: Iterable of ``(hns_igrac_id, ime, prezime, source_url)`` tuples.

    Returns:
        None. Does nothing when *players* is empty.
    """
    if not players:
        return
    # PostgreSQL rejects an INSERT ... ON CONFLICT DO UPDATE that would touch
    # the same row twice in one statement, so collapse duplicate ids first.
    # The last occurrence wins, as it would with row-by-row upserts.
    unique = {p[0]: tuple(p) for p in players}
    rows = list(unique.values())
    sql = """INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)
             VALUES %s
             ON CONFLICT (hns_igrac_id) DO UPDATE SET
               ime = EXCLUDED.ime, prezime = EXCLUDED.prezime, source_url = EXCLUDED.source_url"""
    with conn.cursor() as cur:
        execute_values(cur, sql, rows)
|
||
|
||
def upsert_seasons(conn, seasons):
    """Insert or update season rows in ``pgz_sport.hns_player_seasons``.

    Rows are keyed by ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)``; on
    conflict the statistics and the club name are overwritten.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of season dicts as produced by ``parse_seasons``.

    Returns:
        None. Does nothing when *seasons* is empty.
    """
    if not seasons:
        return
    sql = """INSERT INTO pgz_sport.hns_player_seasons
             (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
              nastupi, golovi, asistencije, zuti, crveni, minute)
             VALUES %s
             ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
               nastupi=EXCLUDED.nastupi, golovi=EXCLUDED.golovi,
               asistencije=EXCLUDED.asistencije, zuti=EXCLUDED.zuti,
               crveni=EXCLUDED.crveni, minute=EXCLUDED.minute,
               klub_naziv=EXCLUDED.klub_naziv"""
    # PostgreSQL rejects an ON CONFLICT DO UPDATE that would touch the same
    # row twice in one statement, so keep one row per conflict key (the last
    # occurrence wins, as it would with row-by-row upserts).
    by_key = {}
    for s in seasons:
        key = (s['hns_igrac_id'], s['sezona'], s['klub_hns_id'], s['natjecanje'])
        by_key[key] = (
            s['hns_igrac_id'], s['sezona'], s['natjecanje'], s['klub_hns_id'],
            s['klub_naziv'], s['nastupi'], s['golovi'], s['asistencije'],
            s['zuti'], s['crveni'], s['minute'],
        )
    vals = list(by_key.values())
    with conn.cursor() as cur:
        execute_values(cur, sql, vals, page_size=50)
|
||
|
||
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Rows that collide on ``(hns_igrac_id, datum, domacin, gost)`` are left
    untouched. Returns ``None``; does nothing when *matches* is empty.
    """
    if not matches:
        return

    columns = ('hns_igrac_id', 'datum', 'domacin', 'gost', 'rezultat')
    rows = []
    for match in matches:
        rows.append(tuple(match[column] for column in columns))

    statement = """INSERT INTO pgz_sport.hns_player_matches
                   (hns_igrac_id, datum, domacin, gost, rezultat)
                   VALUES %s
                   ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"""
    with conn.cursor() as cursor:
        execute_values(cursor, statement, rows, page_size=50)
|
||
|
||
# ─── MAIN ───────────────────────────────────────────
|
||
def _is_fresh(last_scraped, max_age):
    """Return True when *last_scraped* is set and younger than *max_age*."""
    if last_scraped is None:
        return False
    # Match the awareness of the stored timestamp: subtracting a tz-aware
    # value from a naive ``datetime.now()`` raises TypeError.
    now = datetime.now(last_scraped.tzinfo)
    return (now - last_scraped) < max_age


def main():
    """Scrape rosters, seasons and matches for every club and upsert them.

    Steps:
      1. Load the club catalog JSON and build a club-id -> club-page-URL map.
      2. Read the clubs with ``savez_id = 10`` and a non-null ``hns_klub_id``
         from ``pgz_sport.klubovi``.
      3. For each club, fetch its page, parse the roster and upsert the players.
      4. For each player whose seasons were not scraped in the last 7 days,
         fetch the profile and upsert the parsed seasons and matches.

    The connection is always closed, also when a step raises.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    conn = get_conn()
    try:
        conn.autocommit = True

        # 1. Load the catalog that holds the club URL parts.
        with open('/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json', 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        # Keys are normalised to str so an int/str mismatch between the
        # catalog and the DB column does not silently skip every club.
        klub_url_map = {
            str(item['id']): f"https://semafor.hns.family/klubovi/{item['id']}/{item['slug']}/"
            for item in catalog
        }
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")

        total_players = total_seasons = total_matches = 0
        max_age = timedelta(days=7)

        with conn.cursor() as cur:
            # 2. Clubs in the DB that have hns_klub_id and savez_id=10.
            cur.execute("SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi WHERE savez_id = 10 AND hns_klub_id IS NOT NULL")
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for _klub_id, naziv, hns_id in klubovi:
                klub_url = klub_url_map.get(str(hns_id))
                if not klub_url:
                    log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                    continue
                log.info(f"🏟️ {naziv} → {klub_url}")
                html = fetch(klub_url)
                if not html:
                    log.warning(f" ❌ Ne mogu dohvatiti stranicu kluba.")
                    continue

                players = parse_roster(html)
                if not players:
                    log.warning(f" ⚠️ Nema igrača.")
                    continue

                # 3. Upsert the roster.
                player_tuples = [(p[0], p[1], p[2], p[3]) for p in players]
                upsert_players(conn, player_tuples)
                log.info(f" 👥 {len(players)} igrača")

                # 4. Fetch details for every player.
                for hns_igrac_id, _ime, _prezime, profile_url in players:
                    # Skip players whose seasons were scraped recently.
                    cur.execute("SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons WHERE hns_igrac_id = %s", (hns_igrac_id,))
                    last = cur.fetchone()[0]
                    if _is_fresh(last, max_age):
                        continue
                    html = fetch(profile_url)
                    if not html:
                        continue
                    seasons = parse_seasons(html, hns_igrac_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)
                    matches = parse_matches(html, hns_igrac_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)
                    # Small pause between profile requests.
                    time.sleep(0.3)
                total_players += len(players)
    finally:
        conn.close()

    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()