feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+218
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os, sys, re, time, logging
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from the deployment env file before anything
# below reads os.environ.
load_dotenv('/opt/.env.rinet')

# --- LOGGING ---
# Every message goes both to a log file under LOG_DIR and to stdout.
LOG_DIR = "/var/log/pgz-sport-sync"
LOG_FILE = os.path.join(LOG_DIR, "sync_master.log")

# The directory must exist before the FileHandler opens the log file.
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- CONFIG ---
# The database password must come from the environment; abort early without it.
db_pass = os.environ.get('PG_PASS')
if not db_pass:
    logger.critical("PG_PASS nije pronađen u /opt/.env.rinet")
    sys.exit(1)

# In a libpq keyword/value connection string a value containing spaces,
# quotes or backslashes must be wrapped in single quotes, with embedded
# single quotes and backslashes escaped by a backslash. Without this a
# password with such characters yields a broken or mis-parsed DSN.
_db_pass_quoted = db_pass.replace("\\", "\\\\").replace("'", "\\'")
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password='{_db_pass_quoted}'"

# Browser-like User-Agent sent with every request made through SESSION.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA})

# Site root; relative links found in scraped pages are appended to it.
BASE_URL = "https://semafor.hns.family"

# Competition pages that are scanned for club links.
NATJECANJA_URLS = [
    "https://semafor.hns.family/natjecanja/101025334/1-nl-ns-rijeka-juniori-2526/",
    "https://semafor.hns.family/natjecanja/100585203/treca-nl-zapad-2526/",
    "https://semafor.hns.family/natjecanja/101555188/1-znl-seniori-2526/",
    "https://semafor.hns.family/natjecanja/102503486/1-zupanijska-omladinska-liga-kadeti-skupina-a-2526/",
]
|
||||
|
||||
def strip_tags(text):
    """Return *text* as plain text: tags removed, entities decoded.

    Every HTML tag (including nested ones) is replaced by a space, HTML
    character references such as ``&amp;`` or ``&#263;`` are decoded, and
    runs of whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: HTML fragment as a string.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    # Local import keeps this function self-contained; the module elsewhere
    # uses ``html`` as a parameter name for page content.
    from html import unescape

    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Decode only after tag removal so an escaped "&lt;b&gt;" stays literal
    # text instead of being mistaken for markup.
    decoded = unescape(without_tags)
    # \s also matches the non-breaking space produced by &nbsp;.
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||
|
||||
# --- DATABASE ---
|
||||
def db_conn():
    """Open a PostgreSQL connection described by DSN, in autocommit mode.

    Returns:
        The open psycopg2 connection.

    On any psycopg2 error the failure is logged at CRITICAL level and the
    process exits with status 1, so callers never receive a broken handle.
    """
    try:
        connection = psycopg2.connect(DSN)
        # Each statement is committed immediately; no explicit commit() calls.
        connection.autocommit = True
    except psycopg2.Error as e:
        logger.critical(f"DB Connection failed: {e}")
        sys.exit(1)
    return connection
|
||||
|
||||
# --- HTTP FETCH ---
|
||||
def fetch(url, retries=3):
    """Download *url* through the shared SESSION and return the body text.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response text, or None when the server answers 404 or when
        every attempt failed with a requests exception (including non-2xx
        statuses raised by ``raise_for_status``).
    """
    for attempt in range(1, retries + 1):
        try:
            r = SESSION.get(url, timeout=15)
            # A missing page will not appear on retry; report it and stop.
            if r.status_code == 404:
                logger.warning(f"HTTP 404 Not Found: {url}")
                return None
            r.raise_for_status()
            return r.text
        except RequestException as e:
            logger.warning(f"HTTP GET failed ({attempt}/{retries}) for {url}: {e}")
            # Linear back-off between attempts only; sleeping after the last
            # attempt would just delay the failure report.
            if attempt < retries:
                time.sleep(2 * attempt)

    logger.error(f"Gave up fetching {url} after {retries} attempts.")
    return None
|
||||
|
||||
# --- SYNC PROCEDURES ---
|
||||
def extract_klubovi(html):
    """Collect the clubs linked from a competition page.

    Scans *html* for anchors whose href has the form ``/klubovi/<id>/<slug>``
    and builds one record per club id.

    Args:
        html: Page source as a string; None or empty yields an empty list.

    Returns:
        A list of ``(hns_id, naziv, source_url)`` tuples, one per distinct
        club id, in order of first appearance. ``naziv`` is the anchor's
        visible text, or a title-cased form of the slug when the anchor has
        no text (for example a logo-only link).
    """
    if not html:
        return []

    klubovi = {}
    # Club ids whose stored name came from real anchor text. A later
    # image-only anchor must not replace it with the slug-derived guess.
    named_from_text = set()

    # Matches the whole anchor regardless of nested images and spans.
    pattern = r'<a[^>]*href="(/klubovi/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    for m in re.finditer(pattern, html, re.DOTALL | re.IGNORECASE):
        hns_id = m.group(2)
        slug = m.group(3)
        naziv = strip_tags(m.group(4))

        has_text = bool(naziv)
        if not has_text:
            naziv = slug.replace('-', ' ').title()

        # Overly long anchor text is treated as a wrong link and skipped.
        if len(naziv) >= 50:
            continue
        if not has_text and hns_id in named_from_text:
            continue

        klubovi[hns_id] = (hns_id, naziv, BASE_URL + m.group(1))
        if has_text:
            named_from_text.add(hns_id)

    return list(klubovi.values())
|
||||
|
||||
def upsert_klubovi(conn, klubovi):
    """Insert or update clubs in ``pgz_sport.klubovi`` and read them back.

    Args:
        conn: Open psycopg2 connection.
        klubovi: Sequence of ``(hns_id, naziv, source_url)`` tuples.

    Returns:
        Rows ``(id, hns_id, source_url)`` for the given club ids, or an
        empty list when *klubovi* is empty or a database error occurred
        (the error is logged).
    """
    if not klubovi:
        return []

    # Existing rows are rewritten only when the name or URL actually changed.
    upsert_sql = """
        INSERT INTO pgz_sport.klubovi (hns_id, naziv, source_url)
        VALUES %s
        ON CONFLICT (hns_id) DO UPDATE SET
            naziv = EXCLUDED.naziv,
            source_url = EXCLUDED.source_url
        WHERE pgz_sport.klubovi.naziv IS DISTINCT FROM EXCLUDED.naziv
           OR pgz_sport.klubovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
    """
    select_sql = "SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE hns_id = ANY(%s)"

    try:
        with conn.cursor() as cur:
            execute_values(cur, upsert_sql, klubovi)
            hns_ids = [row[0] for row in klubovi]
            cur.execute(select_sql, (hns_ids,))
            rows = cur.fetchall()
        return rows
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT klubova: {e}")
        return []
|
||||
|
||||
def _parse_roster(html, klub_hns_id):
    """Extract player rows from a club roster page.

    Returns a list of ``(hns_igrac_id, ime, prezime, klub_hns_id,
    source_url, slug)`` tuples, one per distinct player id found in anchors
    of the form ``/igraci/<id>/<slug>``.
    """
    igraci = {}
    pattern = r'<a[^>]*href="(/igraci/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    for m in re.finditer(pattern, html, re.DOTALL | re.IGNORECASE):
        hns_igrac_id = m.group(2)
        slug = m.group(3)
        ime_prezime = strip_tags(m.group(4))

        # Anchors without text, or with implausibly long text, are not
        # treated as player names.
        if not ime_prezime or len(ime_prezime) > 60:
            continue

        # First word is the given name, the remainder the surname; a
        # single-word name falls back to a title-cased slug as surname.
        ime, _, ostatak = ime_prezime.partition(' ')
        prezime = ostatak or slug.replace('-', ' ').title()

        igraci[hns_igrac_id] = (
            hns_igrac_id, ime, prezime, klub_hns_id, BASE_URL + m.group(1), slug,
        )
    return list(igraci.values())


def sync_roster(conn, klub_hns_id, klub_url):
    """Fetch a club's player list and upsert it into ``pgz_sport.clanovi``.

    Args:
        conn: Open psycopg2 connection.
        klub_hns_id: Club identifier stored with every player row.
        klub_url: Club page URL; ``igraci/`` is appended to reach the roster.

    Returns:
        The list of player tuples that were written, or an empty list when
        the page could not be fetched, contained no players, or the
        database write failed (the error is logged).
    """
    target_url = klub_url if klub_url.endswith('/') else klub_url + '/'
    target_url += "igraci/"

    html = fetch(target_url)
    if not html:
        return []

    igraci_list = _parse_roster(html, klub_hns_id)
    if not igraci_list:
        logger.debug(f"Klub {klub_hns_id} nema igrača (ili greška u parsiranju).")
        return []

    try:
        with conn.cursor() as cur:
            # The WHERE clause skips rewriting rows that did not change; it
            # lists every column that the SET clause assigns, slug included.
            execute_values(cur, """
                INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, klub_hns_id, source_url, slug)
                VALUES %s
                ON CONFLICT (hns_igrac_id) DO UPDATE SET
                    ime = EXCLUDED.ime,
                    prezime = EXCLUDED.prezime,
                    klub_hns_id = EXCLUDED.klub_hns_id,
                    source_url = EXCLUDED.source_url,
                    slug = EXCLUDED.slug
                WHERE pgz_sport.clanovi.ime IS DISTINCT FROM EXCLUDED.ime
                   OR pgz_sport.clanovi.prezime IS DISTINCT FROM EXCLUDED.prezime
                   OR pgz_sport.clanovi.klub_hns_id IS DISTINCT FROM EXCLUDED.klub_hns_id
                   OR pgz_sport.clanovi.source_url IS DISTINCT FROM EXCLUDED.source_url
                   OR pgz_sport.clanovi.slug IS DISTINCT FROM EXCLUDED.slug;
            """, igraci_list)
        logger.info(f"Roster za klub {klub_hns_id}: uspješno sinkronizirano {len(igraci_list)} igrača.")
        return igraci_list
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT rostera za klub {klub_hns_id}: {e}")
        return []
|
||||
|
||||
def get_all_db_clubs(conn):
    """Return every club in ``pgz_sport.klubovi`` that has a source URL.

    Args:
        conn: Open psycopg2 connection.

    Returns:
        Rows ``(id, hns_id, source_url)``, or an empty list when the query
        fails with a database error (the error is logged).
    """
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE source_url IS NOT NULL")
            return cur.fetchall()
    # Only database errors are treated as "no clubs"; a bare except would
    # also hide KeyboardInterrupt and programming errors without any trace.
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri dohvatu klubova: {e}")
        return []
|
||||
|
||||
# --- MAIN ENGINE ---
|
||||
def main():
    """Run the full sync: discover clubs per competition, then sync rosters.

    Step 1 downloads every page in NATJECANJA_URLS, extracts the clubs and
    upserts the de-duplicated set. Step 2 reads all clubs with a source URL
    back from the database and syncs each club's roster. A failure for one
    club is logged and does not stop the remaining clubs. The database
    connection is closed even when an unexpected error aborts the run.
    """
    logger.info("=== START: HNS PGŽ FULL SYNC ===")
    conn = db_conn()
    try:
        all_extracted_klubovi = []

        # 1. Find clubs in each competition.
        for url in NATJECANJA_URLS:
            logger.info(f"Preuzimanje klubova iz natjecanja: {url}")
            html = fetch(url)
            extracted = extract_klubovi(html)
            logger.info(f"Pronađeno {len(extracted)} klubova u natjecanju.")
            all_extracted_klubovi.extend(extracted)
            time.sleep(1)  # throttle requests to the source site

        # The same club can play in several competitions; keep one per hns_id.
        unique_klubovi = list({k[0]: k for k in all_extracted_klubovi}.values())
        logger.info(f"Ukupno jedinstvenih klubova za UPSERT: {len(unique_klubovi)}")
        upsert_klubovi(conn, unique_klubovi)

        # 2. Download the roster for every club stored in the database.
        db_klubovi = get_all_db_clubs(conn)
        logger.info(f"Pokrećem sync rostera za {len(db_klubovi)} klubova iz baze...")

        for _, klub_hns_id, klub_url in db_klubovi:
            try:
                sync_roster(conn, klub_hns_id, klub_url)
            except Exception as e:
                # Keep going with the next club, but record the traceback so
                # the failure can be diagnosed from the log.
                logger.critical(f"Kritična greška kod kluba {klub_hns_id}: {e}", exc_info=True)
            time.sleep(0.5)  # throttle requests to the source site

        logger.info("=== KRAJ: HNS PGŽ FULL SYNC ===")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the sync and map the outcome to an exit status.
    # None means main() returned normally and the interpreter exits as usual.
    exit_code = None
    try:
        main()
    except KeyboardInterrupt:
        # A manual interrupt is not treated as an error.
        logger.info("Skripta prekinuta.")
        exit_code = 0
    except Exception as e:
        # Anything else is an unexpected crash; log it with the traceback.
        logger.critical(f"Neočekivani pad skripte: {e}", exc_info=True)
        exit_code = 1

    if exit_code is not None:
        sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user