#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# File:     sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
# Author:   dradulic@outlook.com / damir@rinet.one
# Purpose:  SUB1 finalize — (a) roll back false positives,
#           (b) extract hns_klub_id from the already stored source_url,
#           (c) verify the club page exists over HTTP and update the row.
# ═══════════════════════════════════════════════════════════════════
"""SUB1 fix-up: false-positive rollback + source_url-based extraction."""
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime

import psycopg2
from psycopg2.extras import RealDictCursor

# SECURITY NOTE(review): the fallback values below embed what look like live
# credentials (database password, Telegram bot token) in the source file.
# They are kept unchanged so existing deployments keep working, but they
# should be rotated and supplied only through the environment.
DSN = os.getenv("RINET_DSN", "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"

LOG_PATH = f"/var/log/pgz-sport-debug/sub1_fix_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
# Create the log directory on first use, and force UTF-8 because log lines
# contain non-ASCII club names and punctuation.
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
LOG = open(LOG_PATH, "a", encoding="utf-8")

# False positives to ROLLBACK (cleared and marked not_found)
FALSE_POS = {
    2572: "NK Hajduk Tovarnik (matched HNK Hajduk Split — different club)",
    600: "Ženski NK XXL Kraljevica (matched men's NK Kraljevica — wrong sex)",
}

# Numeric club id plus an optional slug segment. The slug part is optional so
# that URLs without a trailing "/" after the id are still recognised.
URL_RE = re.compile(r"/klubovi/(\d+)(?:/([a-z0-9-]*))?")

# Text content of the first <title> element in an HTML document.
TITLE_RE = re.compile(r"<title[^>]*>([^<]+)", re.IGNORECASE)

# Croatian diacritics folded to ASCII when a slug is derived from a page title.
_TRANSLIT = str.maketrans("čćšžđ", "ccszd")


def log(msg, telegram=False):
    """Write a timestamped line to stdout and the log file.

    Args:
        msg: Text to record.
        telegram: When true, additionally send the first 3500 characters of
            ``msg`` to the configured Telegram chat by invoking ``curl``.

    Telegram delivery is best-effort: a failure is noted in the log file and
    never raised to the caller.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        subprocess.run(
            [
                "curl", "-s", "-X", "POST",
                f"https://api.telegram.org/bot{TG}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                "--data-urlencode", f"text={msg[:3500]}",
            ],
            timeout=8,
            capture_output=True,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # Record only the exception type: the exception text includes the
        # full curl command line, which contains the bot token.
        LOG.write(f"[{datetime.now().isoformat(timespec='seconds')}] "
                  f"telegram send failed: {type(exc).__name__}\n")
        LOG.flush()


def http_head_or_get(url, timeout=12):
    """Fetch ``url`` with a GET request and report whether it exists.

    Args:
        url: Address to fetch.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(status, title)`` tuple:
        * on success, the HTTP status and the stripped text of the page's
          ``<title>`` element (``None`` when no title is found);
        * on an HTTP error response, ``(error_code, None)``;
        * on any other failure, ``(0, <error message>)``.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = resp.read().decode("utf-8", errors="replace")
            status = resp.status
    except urllib.error.HTTPError as e:
        return e.code, None
    except Exception as e:  # boundary: any failure is reported via the return value
        return 0, str(e)
    found = TITLE_RE.search(body)
    return status, (found.group(1).strip() if found else None)


def _slug_from_title(title):
    """Derive a lowercase, hyphen-separated ASCII slug from a page title."""
    folded = title.lower().translate(_TRANSLIT)
    return re.sub(r"[^a-z0-9]+", "-", folded).strip("-")


def _rollback_false_positives(cur):
    """Phase 1: clear the HNS mapping of every club listed in FALSE_POS.

    Returns the number of rows that were actually updated.
    """
    rolled_back = 0
    for kid, reason in FALSE_POS.items():
        cur.execute("""
            UPDATE pgz_sport.klubovi
               SET hns_klub_id = NULL,
                   hns_slug = NULL,
                   scrape_source = 'hns_not_found',
                   last_scraped_at = now()
             WHERE id = %s
        """, (kid,))
        if cur.rowcount:
            log(f" ROLLBACK [{kid}] — {reason}")
            rolled_back += 1
        else:
            log(f" ROLLBACK SKIP [{kid}] no such row — {reason}")
    return rolled_back


def _extract_from_source_urls(cur):
    """Phase 2: fill hns_klub_id from source_url for unmapped football clubs.

    Each candidate id is verified by fetching the club page; only pages that
    answer 200 with a title are written back.

    Returns:
        ``(extracted, verify_fail)`` counters.
    """
    cur.execute("""
        SELECT id, naziv, source_url
          FROM pgz_sport.klubovi
         WHERE sport='nogomet' AND pgz_sufinanciran=true
           AND hns_klub_id IS NULL
           AND source_url ~ 'semafor\\.hns\\.family/klubovi/[0-9]+'
         ORDER BY id
    """)
    rows = cur.fetchall()
    log(f"Source-URL extraction candidates: {len(rows)}")

    extracted = 0
    verify_fail = 0
    for row in rows:
        kid, naziv, url = row['id'], row['naziv'], row['source_url']

        # Phase 1 clears hns_klub_id but leaves source_url in place, so a
        # rolled-back club could be re-mapped to the same wrong id here.
        if kid in FALSE_POS:
            log(f" SKIP [{kid}] listed as false positive")
            continue

        m = URL_RE.search(url)
        if not m:
            log(f" SKIP [{kid}] no match in {url}")
            continue
        hns_id = int(m.group(1))
        slug = m.group(2) or None

        # Verify
        verify_url = f"https://semafor.hns.family/klubovi/{hns_id}/"
        status, title = http_head_or_get(verify_url)
        time.sleep(0.8)  # pause between requests to the remote site
        if status != 200 or not title:
            log(f" VERIFY FAIL [{kid}] {naziv} -> {hns_id}: status={status} title={title}")
            verify_fail += 1
            continue

        # If slug missing, try inferring from title; store NULL rather than ''.
        if not slug:
            slug = _slug_from_title(title) or None
        canonical = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/" if slug else verify_url

        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                   SET hns_klub_id = %s,
                       hns_slug = %s,
                       source_url = %s,
                       scrape_source = 'hns_semafor',
                       last_scraped_at = now()
                 WHERE id = %s
            """, (hns_id, slug, canonical, kid))
        except psycopg2.Error as e:
            log(f" UPDATE fail [{kid}]: {e}")
        else:
            log(f" EXTRACT [{kid}] {naziv} -> HNS {hns_id} '{title}' (slug={slug})")
            extracted += 1
    return extracted, verify_fail


def _final_stats(cur):
    """Phase 3: return mapped / marked_nf / untouched counts as a dict row."""
    cur.execute("""
        SELECT
          COUNT(*) FILTER (WHERE hns_klub_id IS NOT NULL) AS mapped,
          COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND scrape_source='hns_not_found') AS marked_nf,
          COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND (scrape_source IS NULL OR scrape_source != 'hns_not_found')) AS untouched
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
          AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
    """)
    return cur.fetchone()


def main():
    """Run rollback, source_url extraction and final reporting in order.

    The connection runs in autocommit mode, so every UPDATE takes effect
    immediately and one failed row does not affect the others.
    """
    log(f"=== SUB1 fix start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)
        rb = _rollback_false_positives(cur)
        extracted, verify_fail = _extract_from_source_urls(cur)
        stats = _final_stats(cur)
    finally:
        # Closing the connection also releases its cursors.
        conn.close()

    log(f"=== Final state (real football, PGŽ priority): mapped={stats['mapped']}, marked_not_found={stats['marked_nf']}, untouched={stats['untouched']} ===")
    msg = (f"SUB1 fix done: rollback={rb}, source_url-extracted={extracted}, "
           f"verify_fail={verify_fail}. Final mapped={stats['mapped']} / "
           f"not_found={stats['marked_nf']} / untouched={stats['untouched']}")
    log(msg, telegram=True)


if __name__ == "__main__":
    main()