DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard

PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug

PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)

PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: at most 6 Telegram notifications per 5 minutes
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service

PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML refresh 5s

Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
This commit is contained in:
2026-05-05 08:46:09 +02:00
parent 7adcec3309
commit 63ca005b6e
9 changed files with 861 additions and 16 deletions
+170
View File
@@ -0,0 +1,170 @@
"""Debug observability dashboard endpoint."""
import json, os, time
from pathlib import Path
from fastapi import APIRouter, Query
from fastapi.responses import JSONResponse, HTMLResponse, PlainTextResponse
from typing import Optional
router = APIRouter(prefix="/api/debug", tags=["debug"])
LOGDIR = Path("/var/log/pgz-sport-debug")
@router.get("/health")
def debug_health():
    """Report service, database and error-log status.

    Returns:
        A dict with the keys:

        * ``ts`` - ``time.time()`` when the response was built.
        * ``services`` - unit name mapped to the stripped stdout of
          ``systemctl is-active <unit>``, or ``"error:<exc>"`` if the
          command could not be run (including the 2 s timeout).
        * ``db`` - ``"ok"`` if ``SELECT 1`` succeeded, else ``"error:<exc>"``.
        * ``total_errors_logged`` - number of lines in ``errors.jsonl``
          (0 when the file does not exist).
        * ``log_dir`` - the log directory as a string.
    """
    import subprocess

    services = ['pgz-sport', 'pgz-debug-tail', 'pgz-auto-triage', 'nginx', 'redis-server']
    status = {}
    for unit in services:
        try:
            proc = subprocess.run(
                ['systemctl', 'is-active', unit],
                capture_output=True, text=True, timeout=2,
            )
            status[unit] = proc.stdout.strip()
        except Exception as e:
            # Best effort: a failing probe must not take the endpoint down.
            status[unit] = f"error:{e}"

    # DB
    db_status = "unknown"
    try:
        import psycopg2

        # NOTE(security): the fallback DSN embeds a password in source control.
        # Set PGZ_DEBUG_DB_DSN in the service environment, rotate the
        # credential, and then remove the literal below.
        dsn = os.environ.get(
            "PGZ_DEBUG_DB_DSN",
            "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
        )
        conn = psycopg2.connect(dsn, connect_timeout=2)
        try:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection itself has to be closed explicitly.
            conn.close()
        db_status = "ok"
    except Exception as e:
        db_status = f"error:{e}"

    # Recent errors count. Binary mode: only newlines are counted, so a stray
    # undecodable byte in the log cannot raise UnicodeDecodeError here.
    err_count = 0
    try:
        with open(LOGDIR / "errors.jsonl", "rb") as fh:
            err_count = sum(1 for _ in fh)
    except FileNotFoundError:
        pass  # log not created yet -> report 0

    return {
        "ts": time.time(),
        "services": status,
        "db": db_status,
        "total_errors_logged": err_count,
        "log_dir": str(LOGDIR),
    }
@router.get("/errors")
def recent_errors(limit: int = Query(100, ge=1, le=1000)):
    """Return the most recent entries of ``errors.jsonl``.

    The last ``limit`` non-blank lines of the file are decoded as JSON, oldest
    first. Lines that are not valid JSON are skipped, so fewer than ``limit``
    entries may be returned.

    Args:
        limit: Maximum number of trailing log lines to consider (1-1000).

    Returns:
        ``{"errors": [...], "count": n}``, or ``{"errors": [], "note": ...}``
        when the log file does not exist.
    """
    from collections import deque

    try:
        # Stream the file through a bounded deque instead of loading all of it:
        # memory stays proportional to ``limit``, not to the log size.
        with open(LOGDIR / "errors.jsonl", encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        return {"errors": [], "note": "errors.jsonl not yet created"}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:  # json.JSONDecodeError: partial or garbled line
            continue
    return {"errors": parsed, "count": len(parsed)}
@router.get("/decisions")
def triage_decisions(limit: int = Query(50, ge=1, le=500)):
    """Return the most recent entries of ``triage_decisions.jsonl``.

    The last ``limit`` non-blank lines of the file are decoded as JSON, oldest
    first. Lines that are not valid JSON are skipped, so fewer than ``limit``
    entries may be returned.

    Args:
        limit: Maximum number of trailing log lines to consider (1-500).

    Returns:
        ``{"decisions": [...], "count": n}``, or
        ``{"decisions": [], "note": ...}`` when the file does not exist.
    """
    from collections import deque

    try:
        # Bounded deque keeps only the tail in memory while the file streams by.
        with open(LOGDIR / "triage_decisions.jsonl", encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        return {"decisions": [], "note": "no decisions yet"}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:  # json.JSONDecodeError: partial or garbled line
            continue
    return {"decisions": parsed, "count": len(parsed)}
@router.get("/stream")
def stream_tail(lines: int = Query(200, ge=10, le=2000)):
    """Return the most recent entries of ``stream.jsonl``.

    The last ``lines`` non-blank lines of the file are decoded as JSON, oldest
    first. Lines that are not valid JSON are skipped, so fewer than ``lines``
    entries may be returned.

    Args:
        lines: Maximum number of trailing log lines to consider (10-2000).

    Returns:
        ``{"stream": [...]}``; the list is empty when the file does not exist.
    """
    from collections import deque

    try:
        # The stream log is read line by line into a bounded deque, so memory
        # use does not grow with the size of the file.
        with open(LOGDIR / "stream.jsonl", encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=lines)
    except FileNotFoundError:
        return {"stream": []}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:  # json.JSONDecodeError: partial or garbled line
            continue
    return {"stream": parsed}
@router.get("/dashboard", response_class=HTMLResponse)
def dashboard() -> str:
    """Serve the live HTML debug dashboard.

    The page is static; its script polls the ``health``, ``decisions`` and
    ``errors`` endpoints under ``/sport/api/debug/`` every 5 seconds and
    renders the results client-side. Every value taken from those responses
    is HTML-escaped before it is written to ``innerHTML``, because log fields
    (message, path, source, ...) can contain arbitrary text.

    Returns:
        The complete HTML document as a string.
    """
    return """<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>PGŽ Debug Live</title>
<style>
body{font-family:'JetBrains Mono',monospace;background:#0a0a0c;color:#e0e0e0;margin:0;padding:20px}
h1{color:#FFD700;font-size:18px;margin:0 0 18px}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:20px}
.card{background:#1a1a1e;border:1px solid #2a2a2e;border-radius:6px;padding:16px}
.card h2{color:#FFD700;font-size:13px;margin:0 0 10px;text-transform:uppercase;letter-spacing:.5px}
.kv{font-size:12px;line-height:1.6}
.kv span:first-child{color:#888;display:inline-block;width:160px}
.ok{color:#3a9}
.err{color:#e55}
.warn{color:#fa3}
pre{font-size:11px;background:#0e0e10;padding:8px;border-radius:4px;max-height:400px;overflow:auto;border:1px solid #2a2a2e}
.row{padding:6px 0;border-bottom:1px solid #2a2a2e;font-size:11px}
.row:last-child{border-bottom:0}
.ts{color:#666}
.lvl-ERROR{color:#e55}
.lvl-WARN{color:#fa3}
.lvl-CRITICAL{color:#f00;font-weight:bold}
.refresh{color:#666;font-size:10px;float:right}
</style></head>
<body>
<h1>🩺 PGŽ Sport · Live Debug Dashboard <span class="refresh">refresh: 5s</span></h1>
<div class="grid">
<div class="card">
<h2>Service Health</h2>
<div id="health" class="kv">loading…</div>
</div>
<div class="card">
<h2>Auto-Triage Decisions</h2>
<div id="decisions">loading…</div>
</div>
<div class="card" style="grid-column:1/-1">
<h2>Recent Errors (live)</h2>
<div id="errors"><pre>loading…</pre></div>
</div>
</div>
<script>
// Escape a value for safe insertion into innerHTML (text or attribute).
function esc(v){
  return String(v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
// Coerce to string, cut to [from, to), then escape (escape last so an
// entity is never split by the cut).
function cut(v, from, to){
  return esc(String(v||'').substring(from, to));
}
async function refresh(){
  try {
    // Health
    const h = await fetch('/sport/api/debug/health').then(r=>r.json());
    let html = '';
    for (const [k,v] of Object.entries(h.services||{})){
      const cls = v==='active'?'ok':'err';
      html += `<div><span>${esc(k)}</span><span class="${cls}">${esc(v)}</span></div>`;
    }
    html += `<div><span>db</span><span class="${h.db==='ok'?'ok':'err'}">${esc(h.db)}</span></div>`;
    html += `<div><span>total_errors</span><span>${esc(h.total_errors_logged)}</span></div>`;
    document.getElementById('health').innerHTML = html;
    // Decisions
    const d = await fetch('/sport/api/debug/decisions?limit=10').then(r=>r.json());
    let dh = '';
    if (!d.decisions || d.decisions.length===0) dh = '<div class="row" style="color:#666">no auto-fixes triggered yet</div>';
    for (const x of (d.decisions||[]).reverse()){
      dh += `<div class="row"><span class="ts">${cut(x.ts,11,19)}</span> <b>${esc(x.action)}</b> → ${esc(x.target)}: ${cut(x.msg,0,120)}</div>`;
    }
    document.getElementById('decisions').innerHTML = dh;
    // Errors
    const e = await fetch('/sport/api/debug/errors?limit=30').then(r=>r.json());
    let eh = '';
    for (const x of (e.errors||[]).reverse()){
      const cls = `lvl-${esc(x.level||'INFO')}`;
      eh += `<div class="row"><span class="ts">${cut(x.ts,11,19)}</span> <span class="${cls}">[${esc(x.level||'?')}]</span> <span style="color:#aaa">${esc(x.src||'?')}</span> ${esc(x.code||'')} ${esc(x.path||'')} ${cut(x.msg,0,140)}</div>`;
    }
    document.getElementById('errors').innerHTML = eh || '<div class="row" style="color:#666">No errors</div>';
  } catch (err) {
    // A failed poll keeps the last rendered state; the next tick retries.
    console.error('dashboard refresh failed', err);
  }
}
refresh();
setInterval(refresh, 5000);
</script>
</body></html>"""