Files
pgz-sport/scripts/hns_pgz_sync_robust.py

219 lines
7.8 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, sys, re, time, logging
import requests
from requests.exceptions import RequestException
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv
# Pull environment variables from the deployment env file before any
# configuration value is read from os.environ.
load_dotenv('/opt/.env.rinet')

# --- LOGGING ---
# Every record is written twice: to a UTF-8 log file and to stdout.
LOG_DIR = "/var/log/pgz-sport-sync"
LOG_FILE = os.path.join(LOG_DIR, "sync_master.log")
# The file handler cannot open its file unless the directory already exists.
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
# --- CONFIG ---
# The database password must come from the environment; abort early without it.
db_pass = os.environ.get('PG_PASS')
if not db_pass:
    logger.critical("PG_PASS nije pronađen u /opt/.env.rinet")
    sys.exit(1)

# In a libpq keyword/value connection string a value that contains spaces must
# be wrapped in single quotes, and any backslash or single quote inside it must
# be backslash-escaped. Quoting unconditionally keeps the DSN valid for every
# password, not only for ones made of "safe" characters.
_dsn_pass = db_pass.replace("\\", "\\\\").replace("'", "\\'")
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password='{_dsn_pass}'"

# One shared HTTP session so every request carries the same User-Agent header.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA})

# Prefix used to turn the site-relative links found in pages into absolute URLs.
BASE_URL = "https://semafor.hns.family"

# Competition pages that are scanned for club links (all competitions
# requested so far).
NATJECANJA_URLS = [
    "https://semafor.hns.family/natjecanja/101025334/1-nl-ns-rijeka-juniori-2526/",
    "https://semafor.hns.family/natjecanja/100585203/treca-nl-zapad-2526/",
    "https://semafor.hns.family/natjecanja/101555188/1-znl-seniori-2526/",
    "https://semafor.hns.family/natjecanja/102503486/1-zupanijska-omladinska-liga-kadeti-skupina-a-2526/",
]
def strip_tags(text):
    """Return *text* as plain text with all HTML markup removed.

    Each tag (nested or not) is replaced by a space so that neighbouring
    elements do not fuse together, character references such as ``&amp;``,
    ``&nbsp;`` or ``&#268;`` are decoded, and every run of whitespace is
    collapsed to a single space. Leading and trailing whitespace is dropped.
    """
    # Imported locally under a different name: other functions in this file
    # use ``html`` as a variable name for page content.
    from html import unescape

    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Decode after tag removal so an escaped "&lt;b&gt;" is kept as literal
    # text, and before whitespace collapsing so a decoded non-breaking space
    # is normalised like any other whitespace.
    decoded = unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
# --- DATABASE ---
def db_conn():
    """Open a database connection from ``DSN`` with autocommit switched on.

    Returns:
        The open psycopg2 connection.

    If connecting fails with ``psycopg2.Error`` the failure is logged at
    CRITICAL level and the process exits with status 1.
    """
    try:
        connection = psycopg2.connect(DSN)
        # Each statement is committed on its own; no explicit commit() calls
        # are made anywhere on this connection.
        connection.autocommit = True
    except psycopg2.Error as e:
        logger.critical(f"DB Connection failed: {e}")
        sys.exit(1)
    return connection
# --- HTTP FETCH ---
def fetch(url, retries=3):
    """Download *url* through the shared session and return the body text.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts.

    Returns:
        The response text on success, or ``None`` when the server answers
        404 (returned immediately, no retry) or when every attempt failed.

    Any ``RequestException`` (connection problems, timeouts, error statuses
    raised by ``raise_for_status``) counts as a failed attempt and is logged
    as a warning. Nothing is raised to the caller for those failures.
    """
    for attempt in range(1, retries + 1):
        try:
            r = SESSION.get(url, timeout=15)
            if r.status_code == 404:
                # A missing page will not appear on retry.
                logger.warning(f"HTTP 404 Not Found: {url}")
                return None
            r.raise_for_status()
            return r.text
        except RequestException as e:
            logger.warning(f"HTTP GET failed ({attempt}/{retries}) for {url}: {e}")
            # Linear back-off between attempts only; sleeping after the final
            # attempt would just delay the "gave up" result.
            if attempt < retries:
                time.sleep(2 * attempt)
    logger.error(f"Gave up fetching {url} after {retries} attempts.")
    return None
# --- SYNC PROCEDURES ---
def extract_klubovi(html):
    """Collect the clubs linked from a page.

    Every ``<a>`` element whose ``href`` starts with ``/klubovi/<digits>/<slug>``
    is treated as a club link, whatever markup (images, spans) it wraps.

    Args:
        html: Page source, or a falsy value (``None`` / ``""``).

    Returns:
        A list of ``(hns_id, naziv, source_url)`` tuples, one per distinct
        ``hns_id``, in order of first appearance. ``source_url`` is the link
        target prefixed with ``BASE_URL``. Empty list for falsy input.

    The name is the anchor's visible text. An anchor without text (for
    example one that only wraps a logo) reuses a text name already seen for
    the same club, and only falls back to a title-cased slug when no text
    name is known yet, so a logo link can never replace a real name.
    """
    if not html:
        return []

    klubovi = {}
    text_names = {}  # hns_id -> most recent name taken from real anchor text
    pattern = r'<a[^>]*href="(/klubovi/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    for m in re.finditer(pattern, html, re.DOTALL | re.IGNORECASE):
        path, hns_id, slug = m.group(1), m.group(2), m.group(3)
        text = strip_tags(m.group(4))
        if text:
            # Overly long anchor text is most likely not a club name at all;
            # ignore such links entirely.
            if len(text) >= 50:
                continue
            text_names[hns_id] = text
        naziv = text_names.get(hns_id) or slug.replace('-', ' ').title()
        if len(naziv) < 50:
            klubovi[hns_id] = (hns_id, naziv, BASE_URL + path)
    return list(klubovi.values())
def upsert_klubovi(conn, klubovi):
    """Insert or refresh clubs in ``pgz_sport.klubovi`` and read them back.

    Args:
        conn: Open database connection.
        klubovi: Sequence of ``(hns_id, naziv, source_url)`` tuples.

    Returns:
        The ``(id, hns_id, source_url)`` rows stored for the given
        ``hns_id`` values, or an empty list when *klubovi* is empty or a
        ``psycopg2.Error`` occurs (the error is logged).
    """
    if not klubovi:
        return []

    # Existing rows are rewritten only when the name or URL actually differs.
    upsert_sql = """
        INSERT INTO pgz_sport.klubovi (hns_id, naziv, source_url)
        VALUES %s
        ON CONFLICT (hns_id) DO UPDATE SET
        naziv = EXCLUDED.naziv,
        source_url = EXCLUDED.source_url
        WHERE pgz_sport.klubovi.naziv IS DISTINCT FROM EXCLUDED.naziv
        OR pgz_sport.klubovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
        """
    select_sql = "SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE hns_id = ANY(%s)"

    try:
        with conn.cursor() as cur:
            execute_values(cur, upsert_sql, klubovi)
            hns_ids = [klub[0] for klub in klubovi]
            cur.execute(select_sql, (hns_ids,))
            rows = cur.fetchall()
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT klubova: {e}")
        return []
    return rows
def sync_roster(conn, klub_hns_id, klub_url):
    """Scrape one club's player list and upsert it into ``pgz_sport.clanovi``.

    The roster page is ``<klub_url>/igraci/``. Every ``<a>`` element whose
    ``href`` starts with ``/igraci/<digits>/<slug>`` and whose visible text is
    non-empty and at most 60 characters is taken as a player. When the same
    player id is linked more than once, the last link wins.

    Args:
        conn: Open database connection.
        klub_hns_id: Club identifier stored with every player row.
        klub_url: Absolute URL of the club page, with or without a trailing
            slash.

    Returns:
        The list of ``(hns_igrac_id, ime, prezime, klub_hns_id, source_url,
        slug)`` tuples that were written, or an empty list when the page
        could not be fetched, no player was found, or the database write
        failed with ``psycopg2.Error`` (logged).
    """
    target_url = klub_url if klub_url.endswith('/') else klub_url + '/'
    target_url += "igraci/"
    html = fetch(target_url)
    if not html:
        return []

    igraci = {}
    pattern = r'<a[^>]*href="(/igraci/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    for m in re.finditer(pattern, html, re.DOTALL | re.IGNORECASE):
        path, hns_igrac_id, slug = m.group(1), m.group(2), m.group(3)
        ime_prezime = strip_tags(m.group(4))
        # Skip image-only links and anchors whose text is too long to be a name.
        if not ime_prezime or len(ime_prezime) > 60:
            continue
        # First word is the given name, everything after it the surname.
        # A single-word label borrows the surname from the URL slug.
        ime, _, ostatak = ime_prezime.partition(' ')
        prezime = ostatak or slug.replace('-', ' ').title()
        igraci[hns_igrac_id] = (hns_igrac_id, ime, prezime, klub_hns_id, BASE_URL + path, slug)

    igraci_list = list(igraci.values())
    if not igraci_list:
        logger.debug(f"Klub {klub_hns_id} nema igrača (ili greška u parsiranju).")
        return []

    try:
        with conn.cursor() as cur:
            # The WHERE guard lists every column that SET assigns (slug
            # included) so that a change in any of them triggers the update,
            # while unchanged rows are left untouched.
            execute_values(cur, """
                INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, klub_hns_id, source_url, slug)
                VALUES %s
                ON CONFLICT (hns_igrac_id) DO UPDATE SET
                    ime = EXCLUDED.ime,
                    prezime = EXCLUDED.prezime,
                    klub_hns_id = EXCLUDED.klub_hns_id,
                    source_url = EXCLUDED.source_url,
                    slug = EXCLUDED.slug
                WHERE pgz_sport.clanovi.ime IS DISTINCT FROM EXCLUDED.ime
                   OR pgz_sport.clanovi.prezime IS DISTINCT FROM EXCLUDED.prezime
                   OR pgz_sport.clanovi.klub_hns_id IS DISTINCT FROM EXCLUDED.klub_hns_id
                   OR pgz_sport.clanovi.source_url IS DISTINCT FROM EXCLUDED.source_url
                   OR pgz_sport.clanovi.slug IS DISTINCT FROM EXCLUDED.slug;
            """, igraci_list)
        logger.info(f"Roster za klub {klub_hns_id}: uspješno sinkronizirano {len(igraci_list)} igrača.")
        return igraci_list
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT rostera za klub {klub_hns_id}: {e}")
        return []
def get_all_db_clubs(conn):
    """Return every club in ``pgz_sport.klubovi`` that has a source URL.

    Args:
        conn: Open database connection.

    Returns:
        The ``(id, hns_id, source_url)`` rows, or an empty list when the
        query fails with ``psycopg2.Error`` (the error is logged).
    """
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE source_url IS NOT NULL")
            return cur.fetchall()
    # Only database errors are treated as "no clubs". Anything else
    # (KeyboardInterrupt, SystemExit, programming errors) must propagate
    # instead of being silently turned into an empty result.
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri dohvatu klubova iz baze: {e}")
        return []
# --- MAIN ENGINE ---
def main():
    """Run one full sync: clubs from every competition page, then rosters.

    Step 1 downloads each URL in ``NATJECANJA_URLS``, extracts the club
    links, de-duplicates them by ``hns_id`` (the last occurrence wins) and
    upserts them. Step 2 reads all clubs that have a source URL back from
    the database and syncs the roster of each one. A failure for a single
    club is logged and does not stop the remaining clubs.

    The database connection is closed on every exit path.
    """
    logger.info("=== START: HNS PGŽ FULL SYNC ===")
    conn = db_conn()
    try:
        all_extracted_klubovi = []
        # 1. Collect clubs from each competition page.
        for url in NATJECANJA_URLS:
            logger.info(f"Preuzimanje klubova iz natjecanja: {url}")
            html = fetch(url)
            extracted = extract_klubovi(html)
            logger.info(f"Pronađeno {len(extracted)} klubova u natjecanju.")
            all_extracted_klubovi.extend(extracted)
            time.sleep(1)  # pause between requests to the remote site

        # The same club can appear in several competitions; keep one entry
        # per hns_id.
        unique_klubovi = list({k[0]: k for k in all_extracted_klubovi}.values())
        logger.info(f"Ukupno jedinstvenih klubova za UPSERT: {len(unique_klubovi)}")
        upsert_klubovi(conn, unique_klubovi)

        # 2. Download the roster of every club stored in the database, not
        #    only of the clubs found during this run.
        db_klubovi = get_all_db_clubs(conn)
        logger.info(f"Pokrećem sync rostera za {len(db_klubovi)} klubova iz baze...")
        for _, klub_hns_id, klub_url in db_klubovi:
            try:
                sync_roster(conn, klub_hns_id, klub_url)
                time.sleep(0.5)
            except Exception as e:
                # One broken club must not abort the whole run; keep the
                # traceback so the cause can be diagnosed from the log.
                logger.critical(f"Kritična greška kod kluba {klub_hns_id}: {e}", exc_info=True)
                continue
        logger.info("=== KRAJ: HNS PGŽ FULL SYNC ===")
    finally:
        conn.close()
if __name__ == "__main__":
    # Script entry point. Exit status is 0 after a normal run or a manual
    # interrupt, and 1 after any unexpected exception (logged with traceback).
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Skripta prekinuta.")
    except Exception as e:
        logger.critical(f"Neočekivani pad skripte: {e}", exc_info=True)
        exit_status = 1
    sys.exit(exit_status)