#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Sync football clubs and player rosters from semafor.hns.family to PostgreSQL.

The script works in two phases:

1. Every competition page in ``NATJECANJA_URLS`` is downloaded, club links
   (``/klubovi/<id>/<slug>/``) are extracted and upserted into
   ``pgz_sport.klubovi``.
2. For every club in the database that has a ``source_url``, the page
   ``<source_url>/igraci/`` is downloaded, player links
   (``/igraci/<id>/<slug>/``) are extracted and upserted into
   ``pgz_sport.clanovi``.

The database password is read from the ``PG_PASS`` environment variable,
loaded from ``/opt/.env.rinet``.
"""
import logging
import os
import re
import sys
import time

import psycopg2
import requests
from dotenv import load_dotenv
from psycopg2.extras import execute_values
from requests.exceptions import RequestException

load_dotenv('/opt/.env.rinet')

# --- LOGGING ---
LOG_DIR = "/var/log/pgz-sport-sync"
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = os.path.join(LOG_DIR, "sync_master.log")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)

# --- CONFIG ---
db_pass = os.environ.get('PG_PASS')
if not db_pass:
    logger.critical("PG_PASS nije pronađen u /opt/.env.rinet")
    sys.exit(1)


def _quote_conninfo(value):
    """Return *value* single-quoted and escaped for a libpq conninfo string.

    libpq requires values containing spaces to be wrapped in single quotes,
    with embedded backslashes and single quotes escaped by a backslash.
    Quoting unconditionally is harmless for simple values.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_quote_conninfo(db_pass)}"
)

UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA})

BASE_URL = "https://semafor.hns.family"

# Competition pages that are scanned for club links.
NATJECANJA_URLS = [
    "https://semafor.hns.family/natjecanja/101025334/1-nl-ns-rijeka-juniori-2526/",
    "https://semafor.hns.family/natjecanja/100585203/treca-nl-zapad-2526/",
    "https://semafor.hns.family/natjecanja/101555188/1-znl-seniori-2526/",
    "https://semafor.hns.family/natjecanja/102503486/1-zupanijska-omladinska-liga-kadeti-skupina-a-2526/",
]

# Anchor patterns. Groups: 1 = relative href, 2 = numeric HNS id, 3 = slug,
# 4 = inner HTML of the anchor (may contain nested <img>/<span> tags).
# The closing </a> is required: without it the lazy (.*?) matches the empty
# string and the link text is never captured.
_KLUB_LINK_RE = re.compile(
    r'<a\b[^>]*\shref="(/klubovi/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>',
    re.DOTALL | re.IGNORECASE,
)
_IGRAC_LINK_RE = re.compile(
    r'<a\b[^>]*\shref="(/igraci/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>',
    re.DOTALL | re.IGNORECASE,
)


def strip_tags(text):
    """Remove all (possibly nested) HTML tags and return a clean string.

    Every tag is replaced by a space, then runs of whitespace are collapsed
    to a single space and the result is stripped.
    """
    text = re.sub(r'<[^>]+>', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


# --- DATABASE ---
def db_conn():
    """Open an autocommit connection using ``DSN``.

    Logs a critical message and terminates the process with exit code 1 if
    the connection cannot be established.
    """
    try:
        c = psycopg2.connect(DSN)
        c.autocommit = True
        return c
    except psycopg2.Error as e:
        logger.critical(f"DB Connection failed: {e}")
        sys.exit(1)


# --- HTTP FETCH ---
def fetch(url, retries=3):
    """Download *url* and return the response body as text.

    A 404 response is treated as final and returns ``None`` immediately.
    Any other request failure is retried up to *retries* times with a
    linearly growing pause (2 s, 4 s, ...). Returns ``None`` when all
    attempts fail.
    """
    for attempt in range(1, retries + 1):
        try:
            r = SESSION.get(url, timeout=15)
            if r.status_code == 404:
                logger.warning(f"HTTP 404 Not Found: {url}")
                return None
            r.raise_for_status()
            return r.text
        except RequestException as e:
            logger.warning(f"HTTP GET failed ({attempt}/{retries}) for {url}: {e}")
            # No point in waiting after the last attempt; we give up anyway.
            if attempt < retries:
                time.sleep(2 * attempt)
    logger.error(f"Gave up fetching {url} after {retries} attempts.")
    return None


# --- SYNC PROCEDURES ---
def extract_klubovi(html):
    """Extract club links from a competition page.

    Returns a list of ``(hns_id, naziv, source_url)`` tuples, one per
    distinct ``hns_id``. The name is the visible link text; when a link has
    no text (e.g. it wraps only a logo) the name is derived from the URL
    slug. Returns an empty list for empty/``None`` input.
    """
    if not html:
        return []

    klubovi = {}
    for m in _KLUB_LINK_RE.finditer(html):
        href, hns_id, slug = m.group(1), m.group(2), m.group(3)
        naziv = strip_tags(m.group(4))
        if not naziv:
            # Text-less link: never let a slug-derived fallback overwrite an
            # entry that was already recorded for this club.
            if hns_id in klubovi:
                continue
            naziv = slug.replace('-', ' ').title()

        # Overly long "names" indicate that the anchor is not a plain club link.
        if len(naziv) < 50:
            klubovi[hns_id] = (hns_id, naziv, BASE_URL + href)

    return list(klubovi.values())


def upsert_klubovi(conn, klubovi):
    """Insert or update clubs in ``pgz_sport.klubovi``.

    *klubovi* is a list of ``(hns_id, naziv, source_url)`` tuples. Existing
    rows are updated only when the name or URL actually changed. Returns the
    ``(id, hns_id, source_url)`` rows for the given clubs, or an empty list
    when *klubovi* is empty or a database error occurs (the error is logged).
    """
    if not klubovi:
        return []

    try:
        with conn.cursor() as cur:
            execute_values(
                cur,
                """
                INSERT INTO pgz_sport.klubovi (hns_id, naziv, source_url)
                VALUES %s
                ON CONFLICT (hns_id) DO UPDATE SET
                    naziv = EXCLUDED.naziv,
                    source_url = EXCLUDED.source_url
                WHERE pgz_sport.klubovi.naziv IS DISTINCT FROM EXCLUDED.naziv
                   OR pgz_sport.klubovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
                """,
                klubovi,
            )
            cur.execute(
                "SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE hns_id = ANY(%s)",
                ([k[0] for k in klubovi],),
            )
            return cur.fetchall()
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT klubova: {e}")
        return []


def sync_roster(conn, klub_hns_id, klub_url):
    """Download a club's player list and upsert it into ``pgz_sport.clanovi``.

    The roster page is ``<klub_url>/igraci/``. Each player link yields a
    ``(hns_igrac_id, ime, prezime, klub_hns_id, source_url, slug)`` tuple;
    links without text or with text longer than 60 characters are ignored.
    The first word of the link text is used as the first name and the rest
    as the last name; if there is only one word the last name is derived
    from the URL slug.

    Returns the list of upserted tuples, or an empty list when the page
    could not be fetched, no players were found, or a database error
    occurred (the error is logged).
    """
    target_url = klub_url if klub_url.endswith('/') else klub_url + '/'
    target_url += "igraci/"

    html = fetch(target_url)
    if not html:
        return []

    igraci = {}
    for m in _IGRAC_LINK_RE.finditer(html):
        href, hns_igrac_id, slug = m.group(1), m.group(2), m.group(3)
        ime_prezime = strip_tags(m.group(4))
        if not ime_prezime or len(ime_prezime) > 60:
            continue

        # strip_tags() guarantees single spaces and no leading/trailing
        # whitespace, so partition() cleanly splits first name from the rest.
        ime, _, prezime = ime_prezime.partition(' ')
        if not prezime:
            prezime = slug.replace('-', ' ').title()

        igraci[hns_igrac_id] = (
            hns_igrac_id, ime, prezime, klub_hns_id, BASE_URL + href, slug,
        )

    igraci_list = list(igraci.values())
    if not igraci_list:
        logger.debug(f"Klub {klub_hns_id} nema igrača (ili greška u parsiranju).")
        return []

    try:
        with conn.cursor() as cur:
            execute_values(
                cur,
                """
                INSERT INTO pgz_sport.clanovi
                    (hns_igrac_id, ime, prezime, klub_hns_id, source_url, slug)
                VALUES %s
                ON CONFLICT (hns_igrac_id) DO UPDATE SET
                    ime = EXCLUDED.ime,
                    prezime = EXCLUDED.prezime,
                    klub_hns_id = EXCLUDED.klub_hns_id,
                    source_url = EXCLUDED.source_url,
                    slug = EXCLUDED.slug
                WHERE pgz_sport.clanovi.ime IS DISTINCT FROM EXCLUDED.ime
                   OR pgz_sport.clanovi.prezime IS DISTINCT FROM EXCLUDED.prezime
                   OR pgz_sport.clanovi.klub_hns_id IS DISTINCT FROM EXCLUDED.klub_hns_id
                   OR pgz_sport.clanovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
                """,
                igraci_list,
            )
        logger.info(f"Roster za klub {klub_hns_id}: uspješno sinkronizirano {len(igraci_list)} igrača.")
        return igraci_list
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT rostera za klub {klub_hns_id}: {e}")
        return []


def get_all_db_clubs(conn):
    """Return ``(id, hns_id, source_url)`` for every club with a source URL.

    Returns an empty list if the query fails; the database error is logged
    instead of being silently discarded.
    """
    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE source_url IS NOT NULL"
            )
            return cur.fetchall()
    except psycopg2.Error as e:
        logger.error(f"DB error while loading clubs: {e}")
        return []


# --- MAIN ENGINE ---
def main():
    """Run the full sync: clubs from all competitions, then every roster."""
    logger.info("=== START: HNS PGŽ FULL SYNC ===")
    conn = db_conn()
    try:
        all_extracted_klubovi = []

        # 1. Collect clubs from every competition page.
        for url in NATJECANJA_URLS:
            logger.info(f"Preuzimanje klubova iz natjecanja: {url}")
            html = fetch(url)
            extracted = extract_klubovi(html)
            logger.info(f"Pronađeno {len(extracted)} klubova u natjecanju.")
            all_extracted_klubovi.extend(extracted)
            time.sleep(1)  # be polite to the remote server

        # The same club can appear in several competitions; keep one per hns_id.
        unique_klubovi = list({k[0]: k for k in all_extracted_klubovi}.values())
        logger.info(f"Ukupno jedinstvenih klubova za UPSERT: {len(unique_klubovi)}")
        upsert_klubovi(conn, unique_klubovi)

        # 2. Fetch the roster for every club stored in the database.
        db_klubovi = get_all_db_clubs(conn)
        logger.info(f"Pokrećem sync rostera za {len(db_klubovi)} klubova iz baze...")

        for _, klub_hns_id, klub_url in db_klubovi:
            try:
                sync_roster(conn, klub_hns_id, klub_url)
                time.sleep(0.5)
            except Exception as e:
                # One broken club must not abort the whole run; keep the
                # traceback so the failure can be diagnosed afterwards.
                logger.critical(f"Kritična greška kod kluba {klub_hns_id}: {e}", exc_info=True)
                continue

        logger.info("=== KRAJ: HNS PGŽ FULL SYNC ===")
    finally:
        # Release the connection even when the run is interrupted or crashes.
        conn.close()


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Skripta prekinuta.")
        sys.exit(0)
    except Exception as e:
        logger.critical(f"Neočekivani pad skripte: {e}", exc_info=True)
        sys.exit(1)