Files
pgz-sport/routers/debug_router.py
T
damir 63ca005b6e DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard
PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug

PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)

PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: at most 6 Telegram messages per 5 minutes
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service

PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML dashboard, auto-refreshes every 5 s

Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
2026-05-05 08:46:09 +02:00

171 lines
6.1 KiB
Python

"""Debug observability dashboard endpoint."""
import json, os, time
from pathlib import Path
from fastapi import APIRouter, Query
from fastapi.responses import JSONResponse, HTMLResponse, PlainTextResponse
from typing import Optional
# All endpoints in this module are registered under the /api/debug prefix.
router = APIRouter(prefix="/api/debug", tags=["debug"])
# Directory holding the JSONL files read by the endpoints below
# (errors.jsonl, triage_decisions.jsonl, stream.jsonl).
LOGDIR = Path("/var/log/pgz-sport-debug")
@router.get("/health")
def debug_health():
    """Return a quick status snapshot: systemd units, database, error-log size.

    Returns:
        A dict with the keys:
        - ``ts``: ``time.time()`` at response time.
        - ``services``: unit name -> output of ``systemctl is-active`` (or
          ``"error:<exc>"`` if the call failed or timed out).
        - ``db``: ``"ok"`` if ``SELECT 1`` succeeded, else ``"error:<exc>"``.
        - ``total_errors_logged``: number of lines in ``errors.jsonl``
          (0 when the file does not exist).
        - ``log_dir``: the log directory as a string.
    """
    import subprocess
    from contextlib import closing

    services = ['pgz-sport', 'pgz-debug-tail', 'pgz-auto-triage', 'nginx', 'redis-server']
    status = {}
    for s in services:
        try:
            r = subprocess.run(
                ['systemctl', 'is-active', s],
                capture_output=True, text=True, timeout=2,
            )
            status[s] = r.stdout.strip()
        except Exception as e:
            # Best-effort probe: report the failure instead of failing the endpoint.
            status[s] = f"error:{e}"

    # DB
    # NOTE(security): the fallback DSN embeds a plaintext password in source
    # control. Set PGZ_DEBUG_DB_DSN in the service environment, rotate the
    # credential, and then delete the literal below.
    dsn = os.environ.get("PGZ_DEBUG_DB_DSN") or (
        "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
    )
    db_status = "unknown"
    try:
        import psycopg2
        # psycopg2's own `with conn:` only commits/rolls back the transaction;
        # it does NOT close the connection. closing() guarantees the socket is
        # released on every poll instead of leaking one connection per request.
        with closing(psycopg2.connect(dsn, connect_timeout=2)) as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
        db_status = "ok"
    except Exception as e:
        db_status = f"error:{e}"

    # Recent errors count
    err_count = 0
    try:
        # Binary mode: we only count newlines, and the log may contain bytes
        # that are not valid text (a strict text-mode read would raise).
        with open(LOGDIR / "errors.jsonl", "rb") as f:
            err_count = sum(1 for _ in f)
    except FileNotFoundError:
        # File not created yet (or rotated away between polls): count stays 0.
        pass

    return {
        "ts": time.time(),
        "services": status,
        "db": db_status,
        "total_errors_logged": err_count,
        "log_dir": str(LOGDIR),
    }
@router.get("/errors")
def recent_errors(limit: int = Query(100, ge=1, le=1000)):
    """Return the records parsed from the last ``limit`` non-blank lines of errors.jsonl.

    Args:
        limit: Maximum number of trailing lines to consider (1..1000).

    Returns:
        ``{"errors": [...], "count": n}`` with the successfully parsed JSON
        records in file order, or ``{"errors": [], "note": ...}`` when the
        file does not exist. Lines that are not valid JSON are skipped.
    """
    from collections import deque

    path = LOGDIR / "errors.jsonl"
    try:
        # Stream the file through a bounded deque so memory use is O(limit)
        # rather than O(file size); this endpoint is polled every few seconds.
        with open(path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        # EAFP instead of exists()+read: the file may vanish between the two.
        return {"errors": [], "note": "errors.jsonl not yet created"}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; skip malformed or
            # partially written lines without hiding unrelated exceptions.
            continue
    return {"errors": parsed, "count": len(parsed)}
@router.get("/decisions")
def triage_decisions(limit: int = Query(50, ge=1, le=500)):
    """Return the records parsed from the last ``limit`` non-blank lines of triage_decisions.jsonl.

    Args:
        limit: Maximum number of trailing lines to consider (1..500).

    Returns:
        ``{"decisions": [...], "count": n}`` with the successfully parsed JSON
        records in file order, or ``{"decisions": [], "note": ...}`` when the
        file does not exist. Lines that are not valid JSON are skipped.
    """
    from collections import deque

    path = LOGDIR / "triage_decisions.jsonl"
    try:
        # Bounded deque keeps only the tail in memory while streaming the file.
        with open(path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        # EAFP instead of exists()+read: the file may vanish between the two.
        return {"decisions": [], "note": "no decisions yet"}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; skip malformed lines
            # without hiding unrelated exceptions the way a bare except would.
            continue
    return {"decisions": parsed, "count": len(parsed)}
@router.get("/stream")
def stream_tail(lines: int = Query(200, ge=10, le=2000)):
    """Return the records parsed from the last ``lines`` non-blank lines of stream.jsonl.

    Args:
        lines: Maximum number of trailing lines to consider (10..2000).

    Returns:
        ``{"stream": [...]}`` with the successfully parsed JSON records in
        file order; the list is empty when the file does not exist. Lines
        that are not valid JSON are skipped.
    """
    from collections import deque

    path = LOGDIR / "stream.jsonl"
    try:
        # The full stream can grow large; keep only the requested tail in
        # memory instead of reading the entire file on every request.
        with open(path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=lines)
    except FileNotFoundError:
        # EAFP instead of exists()+read: the file may vanish between the two.
        return {"stream": []}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; skip malformed lines
            # without hiding unrelated exceptions the way a bare except would.
            continue
    return {"stream": parsed}
@router.get("/dashboard", response_class=HTMLResponse)
def dashboard():
    """Serve the live HTML debug dashboard.

    The page is static; its inline script polls the health, decisions and
    errors endpoints every 5 seconds and renders the results client-side.

    Every value taken from the JSON responses is passed through the page's
    ``esc()`` helper before being written via ``innerHTML``: log fields such
    as request paths and messages can contain attacker-supplied text, and
    inserting them unescaped would allow script injection into this page.

    Returns:
        The dashboard HTML document as a string.
    """
    return """<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>PGŽ Debug Live</title>
<style>
body{font-family:'JetBrains Mono',monospace;background:#0a0a0c;color:#e0e0e0;margin:0;padding:20px}
h1{color:#FFD700;font-size:18px;margin:0 0 18px}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:20px}
.card{background:#1a1a1e;border:1px solid #2a2a2e;border-radius:6px;padding:16px}
.card h2{color:#FFD700;font-size:13px;margin:0 0 10px;text-transform:uppercase;letter-spacing:.5px}
.kv{font-size:12px;line-height:1.6}
.kv span:first-child{color:#888;display:inline-block;width:160px}
.ok{color:#3a9}
.err{color:#e55}
.warn{color:#fa3}
pre{font-size:11px;background:#0e0e10;padding:8px;border-radius:4px;max-height:400px;overflow:auto;border:1px solid #2a2a2e}
.row{padding:6px 0;border-bottom:1px solid #2a2a2e;font-size:11px}
.row:last-child{border-bottom:0}
.ts{color:#666}
.lvl-ERROR{color:#e55}
.lvl-WARN{color:#fa3}
.lvl-CRITICAL{color:#f00;font-weight:bold}
.refresh{color:#666;font-size:10px;float:right}
</style></head>
<body>
<h1>🩺 PGŽ Sport · Live Debug Dashboard <span class="refresh">refresh: 5s</span></h1>
<div class="grid">
<div class="card">
<h2>Service Health</h2>
<div id="health" class="kv">loading…</div>
</div>
<div class="card">
<h2>Auto-Triage Decisions</h2>
<div id="decisions">loading…</div>
</div>
<div class="card" style="grid-column:1/-1">
<h2>Recent Errors (live)</h2>
<div id="errors"><pre>loading…</pre></div>
</div>
</div>
<script>
// HTML-escape any value before it is interpolated into innerHTML.
function esc(v){
return String(v).replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'})[c]);
}
async function refresh(){
// Health
const h = await fetch('/sport/api/debug/health').then(r=>r.json());
let html = '';
for (const [k,v] of Object.entries(h.services||{})){
const cls = v==='active'?'ok':'err';
html += `<div><span>${esc(k)}</span><span class="${cls}">${esc(v)}</span></div>`;
}
html += `<div><span>db</span><span class="${h.db==='ok'?'ok':'err'}">${esc(h.db)}</span></div>`;
html += `<div><span>total_errors</span><span>${esc(h.total_errors_logged)}</span></div>`;
document.getElementById('health').innerHTML = html;
// Decisions
const d = await fetch('/sport/api/debug/decisions?limit=10').then(r=>r.json());
let dh = '';
if (!d.decisions || d.decisions.length===0) dh = '<div class="row" style="color:#666">no auto-fixes triggered yet</div>';
for (const x of (d.decisions||[]).reverse()){
dh += `<div class="row"><span class="ts">${esc(String(x.ts||'').substring(11,19))}</span> <b>${esc(x.action)}</b> → ${esc(x.target)}: ${esc(String(x.msg||'').substring(0,120))}</div>`;
}
document.getElementById('decisions').innerHTML = dh;
// Errors
const e = await fetch('/sport/api/debug/errors?limit=30').then(r=>r.json());
let eh = '';
for (const x of (e.errors||[]).reverse()){
const cls = `lvl-${x.level||'INFO'}`;
eh += `<div class="row"><span class="ts">${esc(String(x.ts||'').substring(11,19))}</span> <span class="${esc(cls)}">[${esc(x.level||'?')}]</span> <span style="color:#aaa">${esc(x.src||'?')}</span> ${esc(x.code||'')} ${esc(x.path||'')} ${esc(String(x.msg||'').substring(0,140))}</div>`;
}
document.getElementById('errors').innerHTML = eh || '<div class="row" style="color:#666">No errors</div>';
}
refresh();
setInterval(refresh, 5000);
</script>
</body></html>"""