#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# File: sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
# Author: dradulic@outlook.com / damir@rinet.one
# Purpose: SUB1 finalize — (a) roll back known false positives,
# (b) extract hns_klub_id from the already-stored source_url,
# (c) verify the club page exists (HTTP GET) and update the row.
# ═══════════════════════════════════════════════════════════════════
"""SUB1 fix-up: false-positive rollback + source_url-based extraction."""
import os, re, sys, time, json, subprocess, urllib.request
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
# --- Runtime configuration (environment overrides with built-in defaults) ---

# PostgreSQL connection string. RINET_DSN wins whenever it is set; the default
# is only built otherwise, so DB_PASSWORD is required solely for the fallback.
# (Passing the f-string as the getenv() default evaluated it eagerly and raised
# KeyError on a missing DB_PASSWORD even when RINET_DSN was provided.)
DSN = os.environ.get("RINET_DSN")
if DSN is None:
    DSN = (
        "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
        f"password={os.environ['DB_PASSWORD']}"
    )

# SECURITY(review): a bot token is committed here as the fallback value. It is
# kept so existing deployments keep notifying, but it should be rotated and
# supplied only through TG_BOT_TOKEN.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# User-Agent header value used for outgoing HTTP requests.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"

# Per-run log file, named after the start minute. The handle stays open for
# the life of the process; UTF-8 is forced because log lines carry non-ASCII
# text (club names, dashes) regardless of the locale the script runs under.
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_fix_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
LOG = open(LOG_PATH, "a", encoding="utf-8")
# Club ids whose earlier HNS match was wrong. Each one is rolled back: the
# mapping is cleared and the row is marked as not found. The value is the
# human-readable reason that gets written to the log.
FALSE_POS = {
    # Name collision with a different club.
    2572: "NK Hajduk Tovarnik (matched HNK Hajduk Split — different club)",
    # Women's team matched against the men's club.
    600: "Ženski NK XXL Kraljevica (matched men's NK Kraljevica — wrong sex)",
}
def log(msg, telegram=False):
    """Write a timestamped line to stdout and the run log; optionally notify Telegram.

    Args:
        msg: Text to record.
        telegram: When true, the message (truncated to 3500 characters) is
            also POSTed to the Telegram Bot API ``sendMessage`` endpoint by
            spawning ``curl``.

    The Telegram notification is best-effort: a missing ``curl`` binary or a
    timeout never interrupts the run. Such failures are recorded in the local
    log instead of being silently discarded.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()

    if not telegram:
        return

    try:
        subprocess.run(
            ["curl", "-s", "-X", "POST",
             f"https://api.telegram.org/bot{TG}/sendMessage",
             "-d", f"chat_id={TG_CHAT}",
             "--data-urlencode", f"text={msg[:3500]}"],
            timeout=8, capture_output=True)
    except (OSError, subprocess.SubprocessError) as exc:
        # Only these are swallowed, so Ctrl-C / SystemExit still propagate.
        # Report the exception type only: the exception text embeds the full
        # command line, which contains the bot token.
        note = (f"[{datetime.now().isoformat(timespec='seconds')}] "
                f"telegram notify failed: {type(exc).__name__}")
        print(note, flush=True)
        LOG.write(note + "\n")
        LOG.flush()
def http_head_or_get(url, timeout=12):
    """Fetch *url* and report its status together with the page ``<title>``.

    Despite the name, a single GET request is issued (no HEAD request is
    attempted); the whole body is read and decoded as UTF-8 with undecodable
    bytes replaced.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(status, title)`` tuple:

        * on a successful response: the response status and the stripped
          text of the first ``<title>`` element, or ``None`` when none is
          found;
        * on ``HTTPError``: ``(e.code, None)``;
        * on any other failure: ``(0, str(exc))`` — note the second slot then
          carries the error text, not a title.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            html = r.read().decode("utf-8", errors="replace")
            # NOTE(review): this literal arrived garbled (the markup inside it
            # had been stripped, leaving an unterminated string). Reconstructed
            # as the <title> element pattern, which is what the variable name
            # and the caller's use of the result indicate — confirm.
            m = re.search(r'<title[^>]*>([^<]+)</title>', html)
            title = m.group(1).strip() if m else None
            return r.status, title
    except urllib.error.HTTPError as e:
        return e.code, None
    except Exception as e:
        # Deliberate catch-all: callers treat status 0 as "could not verify".
        return 0, str(e)
# Pulls the numeric club id (group 1) and the slug (group 2) out of a
# ".../klubovi/<id>/<slug>/" path. Group 2 is "" when the path ends right
# after "<id>/" and None when there is no slash after the id at all.
# The slug segment is optional because the SQL candidate filter only requires
# "klubovi/<digits>", so a stored URL without a trailing slash must still
# parse. The lookahead keeps ids glued to other characters ("123abc") from
# matching.
URL_RE = re.compile(r'/klubovi/(\d+)(?![\w-])(?:/([a-z0-9-]*))?/?')
# Folds the Croatian diacritics handled by the slug fallback to ASCII.
_SLUG_FOLD = str.maketrans({'č': 'c', 'ć': 'c', 'š': 's', 'ž': 'z', 'đ': 'd'})


def _slug_from_title(title):
    """Derive a slug from a page title; return None when nothing usable is left."""
    folded = title.lower().translate(_SLUG_FOLD)
    return re.sub(r'[^a-z0-9]+', '-', folded).strip('-') or None


def _rollback_false_positives(cur):
    """Clear the HNS mapping of every club in FALSE_POS; return rows updated."""
    updated = 0
    for kid, reason in FALSE_POS.items():
        cur.execute("""
            UPDATE pgz_sport.klubovi
            SET hns_klub_id = NULL,
                hns_slug = NULL,
                scrape_source = 'hns_not_found',
                last_scraped_at = now()
            WHERE id = %s
        """, (kid,))
        if cur.rowcount:
            log(f" ROLLBACK [{kid}] — {reason}")
            updated += cur.rowcount
        else:
            # Keep the summary honest when a listed id is absent.
            log(f" ROLLBACK [{kid}] no such row — {reason}")
    return updated


def _extract_from_source_urls(cur):
    """Map clubs whose stored source_url already names an HNS club id.

    Each candidate id is verified by fetching the club page before the row is
    updated. Returns ``(extracted, verify_fail)`` counts.
    """
    cur.execute("""
        SELECT id, naziv, source_url
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
          AND hns_klub_id IS NULL
          AND source_url ~ 'semafor\\.hns\\.family/klubovi/[0-9]+'
        ORDER BY id
    """)
    rows = cur.fetchall()
    log(f"Source-URL extraction candidates: {len(rows)}")

    extracted = verify_fail = 0
    for row in rows:
        kid, naziv, url = row['id'], row['naziv'], row['source_url']

        # A club that was just rolled back has hns_klub_id NULL again, so it
        # satisfies the query above; re-extracting it from its stored URL
        # would undo the rollback.
        if kid in FALSE_POS:
            log(f" SKIP [{kid}] rolled back as false positive; not re-extracting")
            continue

        m = URL_RE.search(url)
        if not m:
            log(f" SKIP [{kid}] no match in {url}")
            continue
        hns_id = int(m.group(1))
        slug = m.group(2) or None

        # Confirm the club page exists before trusting the id.
        verify_url = f"https://semafor.hns.family/klubovi/{hns_id}/"
        status, title = http_head_or_get(verify_url)
        time.sleep(0.8)  # pause between requests to the remote site
        if status != 200 or not title:
            log(f" VERIFY FAIL [{kid}] {naziv} -> {hns_id}: status={status} title={title}")
            verify_fail += 1
            continue

        # No slug in the stored URL: fall back to one derived from the title.
        if not slug:
            slug = _slug_from_title(title)
        canonical = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/" if slug else verify_url

        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                    hns_slug = %s,
                    source_url = %s,
                    scrape_source = 'hns_semafor',
                    last_scraped_at = now()
                WHERE id = %s
            """, (hns_id, slug, canonical, kid))
        except psycopg2.Error as e:
            # One bad row must not stop the batch; autocommit keeps the
            # connection usable for the next statement.
            log(f" UPDATE fail [{kid}]: {e}")
        else:
            log(f" EXTRACT [{kid}] {naziv} -> HNS {hns_id} '{title}' (slug={slug})")
            extracted += 1
    return extracted, verify_fail


def _fetch_final_stats(cur):
    """Return mapped / marked_nf / untouched counts for the filtered football clubs."""
    cur.execute("""
        SELECT
          COUNT(*) FILTER (WHERE hns_klub_id IS NOT NULL) AS mapped,
          COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND scrape_source='hns_not_found') AS marked_nf,
          COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND (scrape_source IS NULL OR scrape_source != 'hns_not_found')) AS untouched
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
          AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
    """)
    return cur.fetchone()


def main():
    """Run the SUB1 fix-up: roll back false positives, extract ids, report.

    Phase 1 clears the HNS mapping of the clubs listed in FALSE_POS.
    Phase 2 fills hns_klub_id for clubs whose source_url already contains an
    HNS club id, after verifying the club page over HTTP.
    Phase 3 logs summary counts and sends them to Telegram.

    The connection runs in autocommit mode, so every UPDATE is committed on
    its own; the connection is closed even when a phase raises.
    """
    log(f"=== SUB1 fix start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            rb = _rollback_false_positives(cur)
            extracted, verify_fail = _extract_from_source_urls(cur)
            stats = _fetch_final_stats(cur)
    finally:
        conn.close()

    log(f"=== Final state (real football, PGŽ priority): mapped={stats['mapped']}, marked_not_found={stats['marked_nf']}, untouched={stats['untouched']} ===")
    msg = (f"SUB1 fix done: rollback={rb}, source_url-extracted={extracted}, "
           f"verify_fail={verify_fail}. Final mapped={stats['mapped']} / "
           f"not_found={stats['marked_nf']} / untouched={stats['untouched']}")
    log(msg, telegram=True)
# Run the fix-up only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()