DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard

PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug

PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)

PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: at most 6 Telegram messages per 5 minutes
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service

PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML page, auto-refreshes every 5 s

Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
This commit is contained in:
2026-05-05 08:46:09 +02:00
parent 7adcec3309
commit 63ca005b6e
9 changed files with 861 additions and 16 deletions
+185
View File
@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""
auto_triage.py — Active error monitor for the pgz-sport stack.
Tails /var/log/pgz-sport-debug/errors.jsonl, classifies errors,
and automatically dispatches tasks to CC agents when it detects a pattern.
Patterns:
- Recurring 5xx → CC4 (backend)
- 401/403 spike → CC2 (auth)
- 4xx on a specific page → CC3 (frontend route)
- DB connection error → CC4 + urgent telegram
- ImportError/AttributeError in pgz-sport → CC4 dispatch + restart attempt
"""
import json, os, re, time, subprocess, sys
from collections import defaultdict, deque
from pathlib import Path
from datetime import datetime
# Input feed (errors.jsonl) and the two files this script appends to.
LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# Telegram credentials are read from the environment (for example a systemd
# drop-in with Environment=TG_TOKEN=...) so the bot token is not stored in
# the repository. When TG_TOKEN is unset the token is empty and Telegram
# alerts cannot be delivered.
# SECURITY: a bot token was previously hard-coded on this line and committed;
# it must be treated as leaked and revoked/reissued.
TG_TOKEN = os.environ.get("TG_TOKEN", "")
TG_CHAT = os.environ.get("TG_CHAT", "7969491558")

# Rate limit: no more than RATE_MAX telegram messages per RATE_WIN seconds.
RATE_WIN = 300  # seconds
RATE_MAX = 6
recent_alerts = deque()  # time.time() stamps of recent telegram attempts

# Pattern counts (sliding window).
PATTERN_WIN = 60  # seconds
recent_patterns = defaultdict(deque)  # pattern key -> deque of hit timestamps
def log(msg):
    """Append a timestamped line to TRIAGE_LOG and echo it to stdout.

    The file is opened as UTF-8 explicitly because messages contain
    non-ASCII characters ("×", emoji) that a non-UTF-8 locale encoding
    cannot represent. A failure to write the file is reported on stderr
    instead of being raised, so a full disk or a permissions problem does
    not stop the monitor.

    Args:
        msg: Text to log; it is interpolated after the timestamp prefix.
    """
    ts = datetime.now().isoformat(timespec="seconds")
    entry = f"[{ts}] {msg}"
    try:
        with open(TRIAGE_LOG, "a", encoding="utf-8") as f:
            f.write(entry + "\n")
    except OSError as exc:
        print(f"[{ts}] triage log write failed: {exc}", file=sys.stderr, flush=True)
    print(entry, flush=True)
def telegram(text):
    """Send *text* to the configured Telegram chat, subject to the rate limit.

    At most RATE_MAX attempts are made per RATE_WIN seconds; further messages
    are logged as rate limited and dropped. The message is posted by invoking
    ``curl`` against the Bot API ``sendMessage`` endpoint.

    Args:
        text: Message body; it is URL-encoded by curl.

    Returns:
        True only if curl exited successfully (``--fail`` makes HTTP error
        responses count as failures); False when rate limited, when no token
        is configured, or when the send failed.
    """
    now = time.time()
    # Drop attempt timestamps that have aged out of the rate-limit window.
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    if not TG_TOKEN:
        log(f"telegram skipped (no bot token configured): {text[:80]}")
        return False
    # The slot is taken before the attempt, so failed sends also count
    # against the limit and a broken endpoint is not hammered.
    recent_alerts.append(now)
    try:
        proc = subprocess.run(
            [
                "curl", "-s", "--fail", "-X", "POST",
                f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                "--data-urlencode", f"text={text}",
            ],
            timeout=10,
            capture_output=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError: curl missing; SubprocessError: timeout; ValueError: NUL in argv.
        log(f"telegram fail: {e}")
        return False
    if proc.returncode != 0:
        log(f"telegram fail: curl exit code {proc.returncode}")
        return False
    return True
def dispatch_to_cc(session, msg):
    """Type *msg* into window 0 of the tmux session *session* and press Enter.

    The text is sent with ``send-keys -l --`` so tmux treats it as literal
    characters (no key-name lookup, no option parsing), and non-printable
    characters are replaced by spaces first: the message is assembled from
    request-log fields, and an embedded newline would otherwise act as an
    early Enter. A successful dispatch is logged and recorded as a decision.

    Args:
        session: tmux session name, e.g. "cc4".
        msg: Task text to deliver.

    Returns:
        True if both tmux invocations succeeded and the decision was
        recorded, False otherwise.
    """
    target = f"{session}:0"
    text = "".join(ch if ch.isprintable() else " " for ch in msg)
    try:
        typed = subprocess.run(
            ["tmux", "send-keys", "-t", target, "-l", "--", text],
            check=False, capture_output=True,
        )
        if typed.returncode != 0:
            detail = typed.stderr.decode("utf-8", errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {typed.returncode}: {detail}")
            return False
        # Pause between typing the text and submitting it (delay kept from
        # the original implementation).
        time.sleep(1)
        entered = subprocess.run(
            ["tmux", "send-keys", "-t", target, "Enter"],
            check=False, capture_output=True,
        )
        if entered.returncode != 0:
            detail = entered.stderr.decode("utf-8", errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {entered.returncode}: {detail}")
            return False
        log(f"dispatched to {session}: {msg[:80]}")
        record_decision({"action": "dispatch", "target": session, "msg": msg[:200]})
        return True
    except Exception as e:
        # Deliberately broad: dispatching is best-effort and must never take
        # the monitor down (covers a missing tmux binary and log/record I/O).
        log(f"dispatch fail to {session}: {e}")
        return False
def record_decision(obj):
    """Stamp *obj* with the current time and append it to TRIAGE_DECISIONS.

    The dict is modified in place (a "ts" key is set) and then written as a
    single JSON line.

    Args:
        obj: JSON-serialisable dict describing the decision.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    obj["ts"] = stamp
    with open(TRIAGE_DECISIONS, "a") as sink:
        sink.write(f"{json.dumps(obj)}\n")
def pattern_count(key, since=None):
    """Return how many hits for *key* are still inside the sliding window.

    Timestamps older than the cutoff are discarded from the front of the
    key's deque as a side effect.

    Args:
        key: Pattern key as passed to add_pattern().
        since: Cutoff as a time.time() value; defaults to PATTERN_WIN
            seconds before now.
    """
    cutoff = time.time() - PATTERN_WIN if since is None else since
    hits = recent_patterns[key]
    while hits:
        if not hits[0] < cutoff:
            break
        hits.popleft()
    return len(hits)
def add_pattern(key):
    """Record one occurrence of *key* at the current time.

    Keys embed request paths, so without cleanup ``recent_patterns`` would
    gain one entry per distinct path ever seen. At most once every
    PATTERN_WIN seconds this function therefore removes keys whose deque is
    empty or whose newest hit is older than the window; such keys count as
    zero for pattern_count()'s default window anyway.

    Args:
        key: Pattern key, e.g. "5xx:/some/path".
    """
    now = time.time()
    recent_patterns[key].append(now)
    if now < add_pattern.next_sweep:
        return
    add_pattern.next_sweep = now + PATTERN_WIN
    cutoff = now - PATTERN_WIN
    # Collect first, delete afterwards: a dict must not change size while
    # it is being iterated.
    stale = [
        name for name, hits in recent_patterns.items()
        if not hits or hits[-1] < cutoff
    ]
    for name in stale:
        del recent_patterns[name]


# Earliest time.time() value at which add_pattern() performs its next sweep.
add_pattern.next_sweep = 0.0
# Signatures of critical backend failures, matched case-insensitively against
# the event's "msg" field. Compiled once instead of on every event.
_CRIT_PATTERNS = tuple(
    (re.compile(pattern, re.I), kind)
    for pattern, kind in (
        (r"ImportError|ModuleNotFoundError", "import_error"),
        (r"AttributeError", "attribute_error"),
        (r"SyntaxError", "syntax_error"),
        (r"OperationalError.*could not connect", "db_connect_error"),
        # NOTE(review): "|" binds loosest, so this matches any mention of
        # "asyncpg" as well as "psycopg2...OperationalError" — confirm that
        # is the intent.
        (r"asyncpg|psycopg2.*OperationalError", "db_pool_error"),
        (r"FATAL|CRITICAL", "fatal"),
    )
)


def _field(ev, name):
    """Return ev[name] as a string; a missing or null value becomes ''."""
    value = ev.get(name)
    return "" if value is None else str(value)


def _as_number(value):
    """Return *value* as a float, or None if it cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _spike_count(key, threshold):
    """Record a hit for *key*; return the window count if >= threshold, else 0."""
    add_pattern(key)
    n = pattern_count(key)
    return n if n >= threshold else 0


def classify(line):
    """Classify one JSON event line and trigger alerts/dispatches on a match.

    The line is parsed as a JSON object with optional "msg", "code", "path",
    "method" and "size" fields. Field values are converted to strings, so a
    numeric status code or a null path does not raise. The checks run in
    order and an event that does not reach a threshold falls through to the
    later checks:

    A. code starting with "5": >= 3 hits per path in the window -> cc4.
    B. code 401/403: >= 5 hits per path -> cc2.
    C. other 4xx on paths under /sport/api/: >= 5 hits per path -> cc3.
    D. msg matching a critical signature -> cc4, on every occurrence.
    E. code 200 with a body under 100 bytes: >= 2 hits per path -> cc3.

    Args:
        line: One line of the errors.jsonl feed.

    Returns:
        A ``(kind, count, detail)`` tuple when check A-D fired, otherwise
        None (also for unparsable lines and for check E).
    """
    try:
        ev = json.loads(line)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return None
    if not isinstance(ev, dict):
        # Valid JSON that is not an object (list, number, ...) has no fields.
        return None

    msg = _field(ev, "msg")
    code = _field(ev, "code")
    path = _field(ev, "path")
    method = _field(ev, "method")

    # --- Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        n = _spike_count(key, 3)
        if n:
            telegram(f"⚠️ 5xx spike: {method} {path} {code} (×{n}/60s)")
            dispatch_to_cc("cc4", f"5xx detected: {method} {path} {code} occurring {n}x in 60s. Investigate /opt/pgz-sport/routers/ for the route handler. Check DB connection, log traceback. Run smoke test. Fix + restart pgz-sport + verify resolved.")
            recent_patterns[key].clear()  # reset after dispatch
            return ("5xx_spike", n, path)

    # --- Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        n = _spike_count(key, 5)
        if n:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc("cc2", f"Auth spike: {code} on {path} ×{n} times in 60s. Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. Verify role-based access control. Smoke test 3 demo accounts.")
            recent_patterns[key].clear()
            return ("auth_spike", n, path)

    # --- Pattern C: 4xx on consumer endpoints (frontend bug)
    if (
        code.startswith("4")
        and code not in ("401", "403")
        and path.startswith("/sport/api/")
    ):
        key = f"4xx_api:{path[:80]}"
        n = _spike_count(key, 5)
        if n:
            telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
            dispatch_to_cc("cc3", f"Frontend bug: {path} returning {code} ×{n}x. Frontend may call wrong URL or send bad payload. Check static/*.html for fetch/api() calls to {path}. Verify request shape matches backend schema.")
            recent_patterns[key].clear()
            return ("4xx_api", n, path)

    # --- Pattern D: ImportError / AttributeError / SyntaxError / DB / fatal
    for regex, kind in _CRIT_PATTERNS:
        if regex.search(msg):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            # Every critical kind, database-related or not, goes to cc4.
            dispatch_to_cc("cc4", f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. Identify file:line, fix, py_compile, restart, verify. If db_connect_error, check Server B (10.10.0.2:6432) connectivity.")
            return (kind, 1, msg[:80])

    # --- Pattern E: Empty page detection
    # NOTE(review): the guard looks for the text "size_download" anywhere in
    # the event but the byte count is read from "size" — confirm against the
    # schema produced by the log tailer.
    if code == "200" and "size_download" in str(ev):
        size = _as_number(ev.get("size", 0))
        if size is not None and size < 100:
            key = f"empty:{path}"
            if _spike_count(key, 2):
                telegram(f"📄 Empty page: {path}")
                dispatch_to_cc("cc3", f"Empty page detected: {path} returning <100 bytes. Check static/{path.split('/')[-1]} or backend response.")
                recent_patterns[key].clear()
    return None
def follow(path):
    """Follow *path* like ``tail -F`` and classify every complete line.

    Waits until the file exists, starts at its current end, and then polls
    for new data every 0.5 s. If the file is replaced (inode changes) or
    truncated (size drops below the read position) it is reopened and read
    from the beginning, so log rotation does not silence the monitor. A line
    is handed to classify() only once its trailing newline has arrived, so
    an event caught mid-write is not parsed in two halves. This function
    does not return.

    Args:
        path: pathlib.Path of the JSONL feed to follow.
    """
    while not path.exists():
        time.sleep(1)
    # Binary mode keeps tell() a plain byte offset comparable with st_size.
    f = open(path, "rb")
    try:
        f.seek(0, os.SEEK_END)
        inode = os.fstat(f.fileno()).st_ino
        pending = b""
        while True:
            chunk = f.readline()
            if chunk:
                pending += chunk
                if not pending.endswith(b"\n"):
                    continue  # partial line; wait for the rest
                line, pending = pending.decode("utf-8", errors="replace"), b""
                try:
                    result = classify(line)
                except Exception as exc:
                    # Top-level loop boundary: one bad event must not stop
                    # monitoring of the ones that follow.
                    log(f"classify failed: {exc!r}")
                    continue
                if result:
                    log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")
                continue
            time.sleep(0.5)
            try:
                st = os.stat(path)
                if st.st_ino == inode and st.st_size >= f.tell():
                    continue  # same file, nothing new yet
                fresh = open(path, "rb")
            except OSError:
                # The file is briefly missing during rotation; retry later.
                continue
            f.close()
            f, pending = fresh, b""
            inode = os.fstat(f.fileno()).st_ino
    finally:
        f.close()
# Script entry point: announce start-up, then follow the error feed until
# interrupted from the keyboard.
if __name__ == "__main__":
    for banner in ("auto_triage starting", f"watching {LOG_FILE}"):
        log(banner)
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        log("shutdown")