Files
pgz-sport/scripts/sport_harvest_health.py

124 lines
4.0 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# sport_harvest_health.py — staleness check za pgz_sport klubove
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Description: Checks when each active club was last scraped (the later of
# klub_roster.scraped_at and clanovi.last_scraped_at). Clubs not scraped for
# more than 7 days are flagged for re-scrape; a Telegram alert is sent if any
# club is stale. Run by /etc/cron.d/sport-harvesters at 04:30 every 2nd day.
import os
import sys
import json
import subprocess
from datetime import datetime, timedelta, timezone
import psycopg2
from psycopg2.extras import RealDictCursor
# --- Runtime configuration (every value can be overridden via environment) ---

# libpq connection string. RINET_DSN takes precedence when it is set; the
# fallback is only built when needed, so DB_PASSWORD is required only if it is
# actually used. (Previously the default f-string was evaluated eagerly and
# raised KeyError for a missing DB_PASSWORD even when RINET_DSN was provided.)
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = (
        "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
        f"password={os.environ['DB_PASSWORD']}"
    )

# SECURITY(review): a live-looking bot token is committed as the fallback
# value. It is kept here only so existing deployments keep alerting; it should
# be rotated and supplied exclusively through the environment / .env file.
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")

# Telegram chat that receives the alerts.
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# Age in days after which a club's last scrape counts as stale.
STALE_DAYS = int(os.getenv("SPORT_STALE_DAYS", "7"))
# --- Log file setup ----------------------------------------------------------

# One log file per run, named after the local start time (minute resolution).
LOG_DIR = "/var/log/pgz-sport-debug"
LOG_PATH = os.path.join(LOG_DIR, f"health_{datetime.now().strftime('%Y%m%d_%H%M')}.log")
os.makedirs(LOG_DIR, exist_ok=True)

# Explicit UTF-8: log lines contain non-ASCII text (club names, "—"), and the
# locale default encoding is not guaranteed to be able to encode them.
# The handle stays open for the lifetime of the process and is used by log().
_logfh = open(LOG_PATH, "a", encoding="utf-8")
def log(msg: str) -> None:
    """Emit *msg*, prefixed with a local ISO timestamp, to stdout and the log file.

    Both sinks are flushed immediately so output is not lost if the process
    is killed.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    # print() appends the newline and flushes the file handle for us.
    print(entry, file=_logfh, flush=True)
# Staleness query.
# The CTE yields one row per active club (klubovi.aktivan = true) with
# last_scrape = the later of MAX(klub_roster.scraped_at) and
# MAX(clanovi.last_scraped_at); a source with no rows for the club falls back
# to 'epoch', so a club with no rows in either table ends up with last_scrape = epoch.
# The outer SELECT adds the boolean "stale": true when last_scrape is at or
# before epoch, or older than now() minus the interval bound to the single %s
# placeholder (main() passes a string such as "7 days").
SQL = """
WITH last_per_klub AS (
SELECT k.id AS klub_id, k.naziv, k.sport,
GREATEST(
COALESCE((SELECT MAX(scraped_at) FROM pgz_sport.klub_roster WHERE klub_id = k.id), 'epoch'::timestamptz),
COALESCE((SELECT MAX(last_scraped_at) FROM pgz_sport.clanovi WHERE klub_id = k.id), 'epoch'::timestamptz)
) AS last_scrape
FROM pgz_sport.klubovi k
WHERE k.aktivan = true
)
SELECT klub_id, naziv, sport, last_scrape,
(last_scrape <= 'epoch'::timestamptz OR last_scrape < now() - interval %s) AS stale
FROM last_per_klub;
"""
def telegram(text: str) -> None:
    """Send *text* to the configured Telegram chat via curl (best effort).

    Never raises: every failure is written to the log instead, so an alerting
    problem cannot abort the health check itself.

    Args:
        text: Message body; it is URL-encoded by curl before being posted.
    """
    cmd = [
        # --fail makes curl exit non-zero on HTTP >= 400, so API-level errors
        # (bad token, unknown chat, ...) are detectable via the return code.
        "curl", "-sS", "--fail", "-X", "POST",
        f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
        "-d", f"chat_id={TG_CHAT}",
        "--data-urlencode", f"text={text}",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, timeout=10, check=False)
    except subprocess.TimeoutExpired:
        # str(TimeoutExpired) embeds the full command line, i.e. the bot
        # token; log a fixed message instead of the exception text.
        log("telegram fail: curl timed out after 10s")
        return
    except Exception as e:
        # e.g. curl binary missing, or an argument curl cannot be given.
        log(f"telegram fail: {e}")
        return

    if proc.returncode != 0:
        detail = proc.stderr.decode("utf-8", "replace").strip()
        if TG_TOKEN:
            # Defensive: make sure the token never reaches the log file.
            detail = detail.replace(TG_TOKEN, "***")
        log(f"telegram fail: curl exit {proc.returncode}: {detail}")
        return

    log(f"telegram sent ({len(text)} chars)")
def main() -> int:
    """Run the staleness check once and return the process exit code.

    Queries every active club, logs a summary plus the ten stalest clubs and,
    if any club is stale, sends a Telegram alert listing the five stalest.

    Returns:
        0 if no club is stale, 1 if at least one club is stale, 2 if the
        database could not be reached or the query failed.
    """
    log(f"sport_harvest_health START stale_days={STALE_DAYS}")

    try:
        conn = psycopg2.connect(DSN)
    except Exception as e:
        log(f"DB connect FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB connect FAIL — {e}")
        return 2

    # Fetch everything up front and always release the connection, even when
    # the query fails. A query error is reported like a connect error instead
    # of crashing with exit status 1, which would be mistaken for "stale".
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(SQL, (f"{STALE_DAYS} days",))
            rows = cur.fetchall()
    except psycopg2.Error as e:
        log(f"DB query FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB query FAIL — {e}")
        return 2
    finally:
        conn.close()

    total = len(rows)
    stale_rows = [r for r in rows if r["stale"]]

    # Stale-club count per sport; sport names are case-folded, NULL becomes "?".
    by_sport: dict[str, int] = {}
    for r in stale_rows:
        sport = (r["sport"] or "?").lower()
        by_sport[sport] = by_sport.get(sport, 0) + 1

    # Oldest scrape first; a missing timestamp sorts as the Unix epoch.
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    top_stale = sorted(stale_rows, key=lambda r: r["last_scrape"] or epoch)[:10]

    log(f"klubova_total={total} stale={len(stale_rows)} by_sport={json.dumps(by_sport, ensure_ascii=False)}")
    for r in top_stale:
        log(f" STALE klub_id={r['klub_id']} sport={r['sport']} last={r['last_scrape']} naziv={r['naziv']}")

    if stale_rows:
        sport_summary = ", ".join(f"{k.upper()}:{v}" for k, v in sorted(by_sport.items()))
        top_lines = "\n".join(
            f"{r['naziv']} ({(r['sport'] or '?')}) — {r['last_scrape']}"
            for r in top_stale[:5]
        )
        msg = (
            f"⚠️ Sport harvest stale: {len(stale_rows)}/{total} klubova "
            f">{STALE_DAYS} dana ({sport_summary})\nTop:\n{top_lines}"
        )
        telegram(msg)

    log("sport_harvest_health DONE")
    return 1 if stale_rows else 0
# Script entry point: hand main()'s status code back to the caller (cron).
if __name__ == "__main__":
    raise SystemExit(main())