DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard
PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug
PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)
PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: 6 telegram/5min
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service
PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML refresh 5s
Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
This commit is contained in:
Executable
+185
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
auto_triage.py — Active error monitor for the pgz-sport stack.

Tails /var/log/pgz-sport-debug/errors.jsonl, classifies errors,
and automatically dispatches tasks to CC agents when it detects a pattern.

Patterns:
- Recurring 5xx → CC4 (backend)
- 401/403 spike → CC2 (auth)
- 4xx on a specific page → CC3 (frontend route)
- DB connection error → CC4 + telegram urgent
- ImportError/AttributeError in pgz-sport → CC4 dispatch + restart attempt
|
||||
"""
|
||||
import json, os, re, time, subprocess, sys
|
||||
from collections import defaultdict, deque
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Input feed that this monitor tails, plus its own output files.
LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# SECURITY: the literal fallbacks below are credentials committed to source
# control. They are kept only so existing deployments keep working; set
# TG_TOKEN / TG_CHAT in the service environment, rotate the bot token, and
# then remove the fallbacks.
TG_TOKEN = os.environ.get("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("TG_CHAT", "7969491558")

# Rate limit: at most RATE_MAX telegram messages per RATE_WIN seconds.
RATE_WIN = 300  # seconds
RATE_MAX = 6
recent_alerts = deque()  # send timestamps inside the current rate window

# Pattern counts (sliding window).
PATTERN_WIN = 60  # seconds
recent_patterns = defaultdict(deque)  # pattern key -> occurrence timestamps
|
||||
|
||||
def log(msg):
    """Append a timestamped message to TRIAGE_LOG and echo it to stdout.

    Logging is best-effort: if the log file cannot be written (missing
    directory, permissions, full disk) the failure is reported on stderr
    and the message is still printed, so a logging problem never takes
    the monitor down.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    entry = f"[{ts}] {msg}"
    try:
        # Messages contain non-ASCII symbols, so do not rely on the locale.
        with open(TRIAGE_LOG, "a", encoding="utf-8") as f:
            f.write(entry + "\n")
    except OSError as e:
        print(f"[{ts}] triage log write failed: {e}", file=sys.stderr, flush=True)
    print(entry, flush=True)
|
||||
|
||||
def telegram(text):
    """Send *text* to the configured Telegram chat, subject to rate limiting.

    At most RATE_MAX messages are attempted per RATE_WIN seconds; an attempt
    consumes a slot even when delivery fails, so a broken network cannot
    turn into a retry storm.

    Returns True only when curl reports a successful HTTP response, and
    False when the message was rate limited or could not be delivered.
    """
    now = time.time()
    # Expire send timestamps that have left the rate window.
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    recent_alerts.append(now)

    try:
        proc = subprocess.run(
            [
                # --fail makes curl exit non-zero on HTTP errors (bad token,
                # oversized text, ...), which plain -s would hide.
                "curl", "-s", "--fail", "-X", "POST",
                f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                "--data-urlencode", f"text={text}",
            ],
            timeout=10,
            capture_output=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # curl missing, or the 10 s timeout expired.
        log(f"telegram fail: {e}")
        return False

    if proc.returncode != 0:
        log(f"telegram fail: curl exit {proc.returncode}")
        return False
    return True
|
||||
|
||||
def dispatch_to_cc(session, msg):
    """Type *msg* into window 0 of the CC tmux session and press Enter.

    The text is sent literally (``send-keys -l --``) so it can never be
    interpreted as tmux key names or options, and control characters are
    replaced by spaces so an embedded newline cannot submit the prompt early.

    NOTE(review): callers build *msg* from request paths and log lines, i.e.
    from untrusted input that ends up as instructions for an agent.

    Returns True when both tmux commands succeeded and the decision was
    recorded, False otherwise.
    """
    target = f"{session}:0"
    clean = re.sub(r"[\x00-\x1f\x7f]", " ", msg)
    try:
        typed = subprocess.run(
            ["tmux", "send-keys", "-t", target, "-l", "--", clean],
            check=False, capture_output=True,
        )
        if typed.returncode != 0:
            # Typically: the session does not exist.
            err = typed.stderr.decode(errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {typed.returncode}: {err}")
            return False

        time.sleep(1)  # let the target finish receiving the text before submitting
        entered = subprocess.run(
            ["tmux", "send-keys", "-t", target, "Enter"],
            check=False, capture_output=True,
        )
        if entered.returncode != 0:
            err = entered.stderr.decode(errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {entered.returncode}: {err}")
            return False

        log(f"dispatched to {session}: {clean[:80]}")
        record_decision({"action": "dispatch", "target": session, "msg": clean[:200]})
        return True
    except Exception as e:
        log(f"dispatch fail to {session}: {e}")
        return False
|
||||
|
||||
def record_decision(obj):
    """Append *obj* as one JSON line to TRIAGE_DECISIONS, stamped with "ts".

    The caller's dict is left untouched; the timestamp (local time, second
    resolution) is added to a copy. An existing "ts" key is overwritten in
    the written record.
    """
    entry = dict(obj)
    entry["ts"] = datetime.now().isoformat(timespec='seconds')
    with open(TRIAGE_DECISIONS, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")
|
||||
|
||||
def pattern_count(key, since=None):
    """Count occurrences of pattern *key* inside the sliding window.

    *since* is the oldest timestamp (epoch seconds) still counted; it
    defaults to PATTERN_WIN seconds before now. Older timestamps are
    discarded as a side effect.

    Keys are derived from request paths, so the map must not grow for keys
    that hold no data: unknown keys are not inserted, and a key whose
    window has emptied is removed.
    """
    if since is None:
        since = time.time() - PATTERN_WIN
    dq = recent_patterns.get(key)
    if dq is None:
        return 0
    while dq and dq[0] < since:
        dq.popleft()
    if not dq:
        recent_patterns.pop(key, None)
        return 0
    return len(dq)
|
||||
|
||||
def add_pattern(key):
    """Record one occurrence of pattern *key* at the current time."""
    window = recent_patterns[key]
    stamp = time.time()
    window.append(stamp)
|
||||
|
||||
# Regexes (searched case-insensitively in the event message) that mark a
# critical backend failure, each with the label it is reported under.
# The first matching entry wins.
_CRIT_PATTERNS = [
    (r"ImportError|ModuleNotFoundError", "import_error"),
    (r"AttributeError", "attribute_error"),
    (r"SyntaxError", "syntax_error"),
    (r"OperationalError.*could not connect", "db_connect_error"),
    # NOTE(review): "|" binds loosest, so this matches any message that merely
    # contains "asyncpg"; "(asyncpg|psycopg2).*OperationalError" may have been
    # intended. Left unchanged so alerting behavior is preserved.
    (r"asyncpg|psycopg2.*OperationalError", "db_pool_error"),
    (r"FATAL|CRITICAL", "fatal"),
]


def classify(line):
    """Classify one JSONL event and alert/dispatch when a pattern fires.

    *line* is one line of the error feed: a JSON object that may carry the
    keys "msg", "code", "path" and "method". Anything that is not a JSON
    object is ignored. Field values are coerced to strings, so a numeric
    status code is handled the same way as a string one.

    Returns a ``(kind, count, detail)`` tuple when a pattern fired, else None.

    NOTE(review): "path" and "msg" come from request and log data, i.e. from
    untrusted input, and are embedded verbatim in the agent prompts below.
    """
    try:
        ev = json.loads(line)
    except (ValueError, TypeError):
        return None
    if not isinstance(ev, dict):
        return None

    msg = str(ev.get("msg") or "")
    code = str(ev.get("code") or "")
    path = str(ev.get("path") or "")
    method = str(ev.get("method") or "")

    # ─── Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 3:
            telegram(f"⚠️ 5xx spike: {method} {path} → {code} (×{n}/60s)")
            dispatch_to_cc(
                "cc4",
                f"5xx detected: {method} {path} {code} occurring {n}x in 60s. "
                "Investigate /opt/pgz-sport/routers/ for the route handler. "
                "Check DB connection, log traceback. Run smoke test. "
                "Fix + restart pgz-sport + verify resolved.",
            )
            recent_patterns[key].clear()  # reset after dispatch
            return ("5xx_spike", n, path)

    # ─── Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc(
                "cc2",
                f"Auth spike: {code} on {path} ×{n} times in 60s. "
                "Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. "
                "Verify role-based access control. Smoke test 3 demo accounts.",
            )
            recent_patterns[key].clear()
            return ("auth_spike", n, path)

    # ─── Pattern C: 4xx on consumer endpoints (frontend bug)
    if (
        code.startswith("4")
        and code not in ("401", "403")
        and path.startswith("/sport/api/")
    ):
        key = f"4xx_api:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
            dispatch_to_cc(
                "cc3",
                f"Frontend bug: {path} returning {code} ×{n}x. "
                "Frontend may call wrong URL or send bad payload. "
                f"Check static/*.html for fetch/api() calls to {path}. "
                "Verify request shape matches backend schema.",
            )
            recent_patterns[key].clear()
            return ("4xx_api", n, path)

    # ─── Pattern D: ImportError / AttributeError / SyntaxError in the backend
    for pat, kind in _CRIT_PATTERNS:
        if re.search(pat, msg, re.I):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            # Every critical kind, DB-related or not, goes to the backend agent.
            dispatch_to_cc(
                "cc4",
                f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. "
                "Identify file:line, fix, py_compile, restart, verify. "
                "If db_connect_error, check Server B (10.10.0.2:6432) connectivity.",
            )
            return (kind, 1, msg[:80])

    # ─── Pattern E: Empty page detection
    if code == "200":
        # Assumes the body size is reported under "size_download" (or "size")
        # — TODO confirm against the producer of these events. A missing or
        # non-numeric size is treated as unknown, not as an empty page.
        raw_size = ev.get("size_download", ev.get("size"))
        try:
            size = float(raw_size)
        except (TypeError, ValueError):
            size = None
        if size is not None and size < 100:
            key = f"empty:{path}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 2:
                telegram(f"📄 Empty page: {path}")
                dispatch_to_cc(
                    "cc3",
                    f"Empty page detected: {path} returning <100 bytes. "
                    f"Check static/{path.split('/')[-1]} or backend response.",
                )
                recent_patterns[key].clear()
                return ("empty_page", n, path)

    return None
|
||||
|
||||
def follow(path):
    """Tail *path* like ``tail -F`` and classify every complete line; never returns.

    Waits for the file to appear, skips whatever it already contains, then
    feeds each newly appended line to classify() and logs any pattern that
    fired. When the file is rotated (inode changes) or truncated, it is
    reopened and read from the beginning.

    A line is handed to classify() only once its trailing newline has
    arrived, so a record caught mid-write is never parsed in two halves.
    An exception raised while classifying a line is logged and that line
    is skipped, so one bad record cannot stop the monitor.
    """
    while not path.exists():
        time.sleep(1)

    skip_existing = True  # only the very first open starts at EOF
    while True:
        try:
            f = open(path, "rb")
        except FileNotFoundError:
            # Rotated away and not recreated yet.
            time.sleep(1)
            continue

        with f:
            inode = os.fstat(f.fileno()).st_ino
            if skip_existing:
                f.seek(0, os.SEEK_END)
                skip_existing = False

            pending = b""  # bytes of a line whose newline has not arrived yet
            while True:
                chunk = f.readline()
                if chunk:
                    pending += chunk
                    if not pending.endswith(b"\n"):
                        continue
                    line, pending = pending.decode("utf-8", errors="replace"), b""
                    try:
                        result = classify(line)
                    except Exception as e:
                        log(f"classify fail: {e}")
                        continue
                    if result:
                        log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")
                    continue

                time.sleep(0.5)
                try:
                    st = os.stat(path)
                except FileNotFoundError:
                    continue  # keep the old handle until a new file appears
                if st.st_ino != inode or st.st_size < f.tell():
                    break  # rotated or truncated: reopen from the start
|
||||
|
||||
if __name__ == "__main__":
    import signal

    def _on_sigterm(signum, frame):
        """Route SIGTERM through the same shutdown path as Ctrl-C."""
        raise KeyboardInterrupt

    # Without a handler SIGTERM ends the process immediately and the
    # "shutdown" line below is never written.
    signal.signal(signal.SIGTERM, _on_sigterm)

    log("auto_triage starting")
    log(f"watching {LOG_FILE}")
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        log("shutdown")
|
||||
Executable
+59
@@ -0,0 +1,59 @@
|
||||
#!/bin/bash
# Tail journalctl + nginx logs into a structured JSONL stream.
#
# Files written in $LOGDIR:
#   stream.jsonl  every captured line, one JSON object per line
#   errors.jsonl  ERROR/CRITICAL/FATAL entries of the stream, plus HTTP 5xx/401/403
#   *.pid         PID of each background tailer
LOGDIR=/var/log/pgz-sport-debug
mkdir -p "$LOGDIR"

# Print $1, stripped of surrounding whitespace, as a JSON string literal.
json_escape() {
    printf '%s\n' "$1" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read().strip()))"
}

# Level classification, most severe first, so that an error line which also
# mentions "WARNING" or "DEBUG" is still reported as ERROR.
err_re='ERROR|Exception|Traceback|CRITICAL|FATAL'
warn_re='WARNING|WARN'
debug_re='DEBUG'

# Tail journalctl
journalctl -u pgz-sport -f -n 0 --output=cat 2>/dev/null | while IFS= read -r line; do
    ts=$(date -Iseconds)
    if [[ $line =~ $err_re ]]; then
        level="ERROR"
    elif [[ $line =~ $warn_re ]]; then
        level="WARN"
    elif [[ $line =~ $debug_re ]]; then
        level="DEBUG"
    else
        level="INFO"
    fi

    safe=$(json_escape "$line")
    [[ -n $safe ]] || continue   # escaping failed; never emit broken JSON
    printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"pgz-sport\",\"level\":\"$level\",\"msg\":$safe}" >> "$LOGDIR/stream.jsonl"
done &
JPID=$!
echo "$JPID" > "$LOGDIR/journalctl_tail.pid"

# Tail nginx error log. -n 0: do not replay old lines (with fresh timestamps)
# every time this script restarts.
tail -n 0 -F /var/log/nginx/sport.error.log 2>/dev/null | while IFS= read -r line; do
    ts=$(date -Iseconds)
    safe=$(json_escape "$line")
    [[ -n $safe ]] || continue
    printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx\",\"level\":\"ERROR\",\"msg\":$safe}" >> "$LOGDIR/stream.jsonl"
done &
NPID=$!
echo "$NPID" > "$LOGDIR/nginx_tail.pid"

# Tail nginx access log for 4xx/5xx
tail -n 0 -F /var/log/nginx/sport.access.log 2>/dev/null | while IFS= read -r line; do
    # Whitespace-split fields of the combined log format:
    # 6th = "METHOD, 7th = path, 9th = status code.
    read -r -a fields <<<"$line"
    code=${fields[8]}
    if [[ "$code" =~ ^[45][0-9][0-9]$ ]]; then
        ts=$(date -Iseconds)
        method=${fields[5]//\"/}
        # Method and path come from the client; escape them so that quotes or
        # nginx's \xNN sequences cannot produce invalid JSON.
        method_json=$(json_escape "$method")
        path_json=$(json_escape "${fields[6]}")
        safe=$(json_escape "$line")
        [[ -n $method_json && -n $path_json && -n $safe ]] || continue
        printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx-access\",\"level\":\"WARN\",\"code\":\"$code\",\"method\":$method_json,\"path\":$path_json,\"raw\":$safe}" >> "$LOGDIR/stream.jsonl"

        # ACTIVE ALERTING: 5xx and 401/403 also go to the error feed.
        # NOTE(review): other 4xx codes are never forwarded, so a consumer of
        # errors.jsonl cannot see them.
        if [[ "$code" =~ ^5 ]] || [[ "$code" == "401" ]] || [[ "$code" == "403" ]]; then
            printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx-access\",\"level\":\"ERROR\",\"code\":\"$code\",\"method\":$method_json,\"path\":$path_json}" >> "$LOGDIR/errors.jsonl"
        fi
    fi
done &
APID=$!
echo "$APID" > "$LOGDIR/access_tail.pid"

# Copy ERROR-level entries into a separate error file (the agents watch this one).
# --line-buffered: without it grep holds matches in its output buffer and the
# error feed lags behind the stream.
tail -n 0 -F "$LOGDIR/stream.jsonl" 2>/dev/null \
    | grep --line-buffered -E "\"level\":\"(ERROR|CRITICAL|FATAL)\"" >> "$LOGDIR/errors.jsonl" &
EPID=$!
echo "$EPID" > "$LOGDIR/error_filter.pid"

echo "Debug tail running. PIDs: journalctl=$JPID nginx=$NPID access=$APID error_filter=$EPID"
echo "  stream.jsonl + errors.jsonl in $LOGDIR"
wait
|
||||
Reference in New Issue
Block a user