Files
pgz-sport/scripts/auto_triage.py
T
damir 63ca005b6e DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard
PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug

PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)

PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: 6 telegram/5min
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service

PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML refresh 5s

Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
2026-05-05 08:46:09 +02:00

186 lines
6.9 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
auto_triage.py — Active error monitor for the pgz-sport stack.
Tails /var/log/pgz-sport-debug/errors.jsonl, classifies errors,
and automatically dispatches tasks to CC agents when it detects a pattern.
Patterns:
- Recurring 5xx → CC4 (backend)
- 401/403 spike → CC2 (auth)
- 4xx on a specific page → CC3 (frontend route)
- DB connection error → CC4 + telegram urgent
- ImportError/AttributeError in pgz-sport → CC4 dispatch + restart attempt
"""
import json, os, re, time, subprocess, sys
from collections import defaultdict, deque
from pathlib import Path
from datetime import datetime
# --- Files read and written by the triage bot --------------------------------
# Error feed that follow() tails.
LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
# Human-readable activity log written by log().
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
# One JSON object per dispatch decision, written by record_decision().
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# --- Telegram credentials -----------------------------------------------------
# SECURITY: the bot token is a secret and should not live in source control.
# PGZ_TG_TOKEN / PGZ_TG_CHAT from the environment take precedence; the literals
# below remain only as a backward-compatible fallback.
# TODO: rotate this token and delete the fallback once the service unit sets
# the environment variables.
TG_TOKEN = os.environ.get("PGZ_TG_TOKEN") or "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
TG_CHAT = os.environ.get("PGZ_TG_CHAT") or "7969491558"

# --- Telegram rate limit: at most RATE_MAX messages per RATE_WIN seconds -----
RATE_WIN = 300  # seconds
RATE_MAX = 6
# Send timestamps (time.time()) of recent Telegram alerts, oldest first.
recent_alerts = deque()

# --- Pattern counts (sliding window) ------------------------------------------
PATTERN_WIN = 60  # seconds
# Maps a pattern key to the timestamps of its recent occurrences, oldest first.
recent_patterns = defaultdict(deque)
def log(msg):
    """Append a timestamped line to TRIAGE_LOG and echo it to stdout.

    The file is opened as UTF-8 because messages contain non-ASCII
    characters. If the file cannot be written, the failure is reported on
    stderr and the message is still printed, so that logging never raises
    out of the callers (several of which call log() from an ``except``
    handler).

    Args:
        msg: Text to record; it is interpolated into the line as-is.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    entry = f"[{ts}] {msg}"
    try:
        with open(TRIAGE_LOG, "a", encoding="utf-8") as f:
            f.write(entry + "\n")
    except OSError as exc:
        print(f"[{ts}] triage log write failed: {exc}", file=sys.stderr, flush=True)
    print(entry, flush=True)
def telegram(text):
    """Send ``text`` to the configured Telegram chat, subject to a rate limit.

    At most RATE_MAX messages are attempted per RATE_WIN seconds; a message
    over the limit is logged and dropped. The rate-limit slot is consumed
    when the send is attempted, whether or not it succeeds.

    Args:
        text: Message body; curl URL-encodes it.

    Returns:
        True only if curl exited successfully. ``--fail`` makes curl exit
        non-zero on an HTTP error status, so a rejected request is reported
        as a failure instead of a success. False when rate limited, when
        curl could not be run, or when it reported an error.
    """
    now = time.time()
    # Forget attempts that have aged out of the rate-limit window.
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    recent_alerts.append(now)
    try:
        result = subprocess.run([
            "curl", "-s", "--fail", "-X", "POST",
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={text}",
        ], timeout=10, capture_output=True)
    except Exception as e:
        log(f"telegram fail: {e}")
        return False
    if result.returncode != 0:
        # Only the exit code is logged: the command line contains the token.
        log(f"telegram fail: curl exit {result.returncode}")
        return False
    return True
def dispatch_to_cc(session, msg):
    """Type ``msg`` into window 0 of a tmux session and press Enter.

    ``msg`` usually embeds request paths and log text taken from the error
    feed, i.e. data an outside client can influence. Two precautions follow:

    * non-printable characters (newlines, escape sequences, ...) are replaced
      with spaces so the text cannot submit early or drive the terminal;
    * the text is sent with ``send-keys -l`` so tmux treats it literally
      instead of looking it up as a key name.

    Args:
        session: tmux session name; the target is ``<session>:0``.
        msg: Task text to type into the session.

    Returns:
        True if both tmux commands exited successfully and the decision was
        recorded; False otherwise. A failed dispatch is logged and is not
        recorded as a decision.
    """
    safe_msg = "".join(ch if ch.isprintable() else " " for ch in msg)
    target = f"{session}:0"
    try:
        typed = subprocess.run(
            ["tmux", "send-keys", "-l", "-t", target, safe_msg],
            check=False, capture_output=True)
        if typed.returncode != 0:
            # Do not press Enter when the text did not arrive.
            log(f"dispatch fail to {session}: tmux exit {typed.returncode}")
            return False
        # Give the target a moment to take the text before submitting it.
        time.sleep(1)
        entered = subprocess.run(
            ["tmux", "send-keys", "-t", target, "Enter"],
            check=False, capture_output=True)
        if entered.returncode != 0:
            log(f"dispatch fail to {session}: tmux exit {entered.returncode}")
            return False
        log(f"dispatched to {session}: {safe_msg[:80]}")
        record_decision({"action": "dispatch", "target": session, "msg": safe_msg[:200]})
        return True
    except Exception as e:
        log(f"dispatch fail to {session}: {e}")
        return False
def record_decision(obj):
    """Append a decision record to TRIAGE_DECISIONS as one JSON line.

    A ``ts`` field holding the current local time (ISO 8601, second
    precision) is added to the written record. The caller's dict is left
    untouched; the timestamp goes on a copy.

    Args:
        obj: JSON-serialisable mapping describing the decision.

    Raises:
        OSError: If the decisions file cannot be opened or written.
        TypeError: If ``obj`` contains values json cannot serialise.
    """
    entry = {**obj, "ts": datetime.now().isoformat(timespec='seconds')}
    with open(TRIAGE_DECISIONS, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")
def pattern_count(key, since=None):
    """Return how many occurrences of ``key`` lie inside the sliding window.

    Timestamps older than the cutoff are discarded from the front of the
    key's deque as a side effect.

    Args:
        key: Pattern key to look up in recent_patterns.
        since: Cutoff as a time.time() value; defaults to PATTERN_WIN
            seconds before now.
    """
    cutoff = time.time() - PATTERN_WIN if since is None else since
    window = recent_patterns[key]
    while len(window) > 0 and window[0] < cutoff:
        window.popleft()
    return len(window)
def add_pattern(key):
    """Record one occurrence of pattern ``key`` at the current time."""
    stamps = recent_patterns[key]
    stamps.append(time.time())
# Critical signatures searched for in an event's "msg" text (case-insensitive),
# in priority order; the first match wins.
_CRIT_PATTERNS = [
    (re.compile(r"ImportError|ModuleNotFoundError", re.I), "import_error"),
    (re.compile(r"AttributeError", re.I), "attribute_error"),
    (re.compile(r"SyntaxError", re.I), "syntax_error"),
    (re.compile(r"OperationalError.*could not connect", re.I), "db_connect_error"),
    # NOTE(review): alternation binds loosest, so this matches any message
    # containing "asyncpg", or "psycopg2 ... OperationalError". Kept as-is;
    # confirm whether "(asyncpg|psycopg2).*OperationalError" was intended.
    (re.compile(r"asyncpg|psycopg2.*OperationalError", re.I), "db_pool_error"),
    (re.compile(r"FATAL|CRITICAL", re.I), "fatal"),
]


def _field(ev, name):
    """Return ``ev[name]`` as a string; a missing or null field becomes ''."""
    value = ev.get(name)
    return "" if value is None else str(value)


def classify(line):
    """Classify one line of the error feed and trigger alerts and dispatches.

    The line is parsed as a JSON object and checked against the patterns
    below in order. Counters live in the module-level sliding window; an
    occurrence below its threshold falls through to the later checks.

    * A: status 5xx, 3 hits per path  -> telegram + dispatch to cc4
    * B: status 401/403, 5 hits       -> telegram + dispatch to cc2
    * C: other 4xx on /sport/api/, 5  -> telegram + dispatch to cc3
    * D: critical text in "msg"       -> telegram + dispatch to cc4 (every hit)
    * E: 200 with a tiny body, 2 hits -> telegram + dispatch to cc3

    Args:
        line: One raw line from the feed.

    Returns:
        ``(kind, count, detail)`` when pattern A-D fired, otherwise None.
        Pattern E alerts but returns None, as does input that is not a JSON
        object.
    """
    try:
        ev = json.loads(line)
    except (TypeError, ValueError):
        return None
    if not isinstance(ev, dict):
        # Valid JSON that is not an object (number, list, string) has no fields.
        return None

    # Fields are normalised to strings: a numeric status or a null path must
    # not crash the monitor.
    msg = _field(ev, "msg")
    code = _field(ev, "code")
    path = _field(ev, "path")
    method = _field(ev, "method")

    # --- Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 3:
            telegram(f"⚠️ 5xx spike: {method} {path} {code} (×{n}/60s)")
            dispatch_to_cc(
                "cc4",
                f"5xx detected: {method} {path} {code} occurring {n}x in 60s. "
                "Investigate /opt/pgz-sport/routers/ for the route handler. "
                "Check DB connection, log traceback. Run smoke test. "
                "Fix + restart pgz-sport + verify resolved.",
            )
            recent_patterns[key].clear()  # reset after dispatch
            return ("5xx_spike", n, path)

    # --- Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc(
                "cc2",
                f"Auth spike: {code} on {path} ×{n} times in 60s. "
                "Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. "
                "Verify role-based access control. Smoke test 3 demo accounts.",
            )
            recent_patterns[key].clear()
            return ("auth_spike", n, path)

    # --- Pattern C: 4xx on consumer endpoints (frontend bug)
    if code.startswith("4") and code not in ("401", "403"):
        if path.startswith("/sport/api/"):
            key = f"4xx_api:{path[:80]}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 5:
                telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
                dispatch_to_cc(
                    "cc3",
                    f"Frontend bug: {path} returning {code} ×{n}x. "
                    "Frontend may call wrong URL or send bad payload. "
                    f"Check static/*.html for fetch/api() calls to {path}. "
                    "Verify request shape matches backend schema.",
                )
                recent_patterns[key].clear()
                return ("4xx_api", n, path)

    # --- Pattern D: ImportError / AttributeError / SyntaxError / DB / fatal
    # NOTE(review): unlike A-C this fires on every matching line with no
    # window or cooldown, so a burst of matching lines dispatches repeatedly.
    for pat, kind in _CRIT_PATTERNS:
        if pat.search(msg):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            dispatch_to_cc(
                "cc4",
                f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. "
                "Identify file:line, fix, py_compile, restart, verify. "
                "If db_connect_error, check Server B (10.10.0.2:6432) connectivity.",
            )
            return (kind, 1, msg[:80])

    # --- Pattern E: empty page detection
    # NOTE(review): the guard looks for the text "size_download" anywhere in
    # the event but the value is read from "size" - confirm the feed's schema.
    if code == "200" and "size_download" in str(ev):
        try:
            size = float(ev.get("size", 0) or 0)
        except (TypeError, ValueError):
            size = None  # non-numeric size: cannot judge, skip the check
        if size is not None and size < 100:
            key = f"empty:{path}"
            add_pattern(key)
            if pattern_count(key) >= 2:
                telegram(f"📄 Empty page: {path}")
                dispatch_to_cc(
                    "cc3",
                    f"Empty page detected: {path} returning <100 bytes. "
                    f"Check static/{path.split('/')[-1]} or backend response.",
                )
                recent_patterns[key].clear()
    return None
def _log_replaced(f, path):
    """Return True if ``path`` no longer refers to the open file ``f``.

    Detects rotation (a different device/inode now sits at the path) and
    truncation (the file on disk is shorter than the current read offset).
    While the path is missing, e.g. mid-rotation, the old handle is kept.
    """
    try:
        on_disk = os.stat(path)
    except OSError:
        return False
    opened = os.fstat(f.fileno())
    if (on_disk.st_dev, on_disk.st_ino) != (opened.st_dev, opened.st_ino):
        return True
    return on_disk.st_size < f.tell()


def follow(path):
    """Tail ``path`` forever and feed each complete line to classify().

    Behaves like ``tail -F``: waits for the file to appear, starts at its
    current end, and reopens it from the beginning after it is rotated or
    truncated. The file is read in binary mode so the offset used for the
    truncation check is an exact byte position; lines are decoded as UTF-8
    with undecodable bytes replaced.

    A line is handed to classify() only once its trailing newline has
    arrived, so an entry caught mid-write is not parsed as broken JSON. An
    exception raised while classifying one line is logged and the loop
    carries on.

    Args:
        path: pathlib.Path of the JSONL file to follow.

    This function does not return.
    """
    while not path.exists():
        time.sleep(1)
    from_start = False  # only the very first open skips the existing content
    while True:
        try:
            f = open(path, "rb")
        except OSError:
            time.sleep(1)
            continue
        with f:
            if not from_start:
                f.seek(0, 2)  # EOF
            from_start = True
            pending = b""  # bytes of a line whose newline has not arrived yet
            while True:
                chunk = f.readline()
                if not chunk:
                    time.sleep(0.5)
                    if _log_replaced(f, path):
                        break  # reopen the new file
                    continue
                pending += chunk
                if not pending.endswith(b"\n"):
                    continue
                line = pending.decode("utf-8", errors="replace")
                pending = b""
                try:
                    result = classify(line)
                except Exception as exc:
                    # One bad event must not take the monitor down.
                    log(f"classify fail: {exc}")
                    continue
                if result:
                    log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")
if __name__ == "__main__":
    # Script entry point: announce startup, then tail the error feed until
    # the process is interrupted with Ctrl-C.
    for banner in ("auto_triage starting", f"watching {LOG_FILE}"):
        log(banner)
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        log("shutdown")