63ca005b6e
PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug
PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)
PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: 6 telegram/5min
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service
PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML refresh 5s
Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
186 lines
6.9 KiB
Python
Executable File
186 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
auto_triage.py — Active error monitor for the pgz-sport stack.

Tails /var/log/pgz-sport-debug/errors.jsonl, classifies errors,
and automatically dispatches tasks to CC agents when it detects a pattern.

Patterns:
- Recurring 5xx → CC4 (backend)
- 401/403 spike → CC2 (auth)
- 4xx on a specific page → CC3 (frontend route)
- DB connection error → CC4 + telegram urgent
- ImportError/AttributeError in pgz-sport → CC4 dispatch + restart attempt
"""
|
||
import json, os, re, time, subprocess, sys
|
||
from collections import defaultdict, deque
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
|
||
# Input stream of error events (one JSON object per line) and this bot's outputs.
LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# SECURITY: the Telegram bot token and chat id are committed in source.
# Supply them through the TG_TOKEN / TG_CHAT environment variables instead
# (e.g. a systemd drop-in). The literals below are kept only as a
# backward-compatible fallback -- rotate the token and remove them.
TG_TOKEN = os.environ.get("TG_TOKEN") or "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
TG_CHAT = os.environ.get("TG_CHAT") or "7969491558"

# Rate limit: no more than RATE_MAX Telegram messages per RATE_WIN seconds.
RATE_WIN = 300  # seconds
RATE_MAX = 6
recent_alerts = deque()  # timestamps of recent Telegram send attempts

# Pattern counts (sliding window)
PATTERN_WIN = 60  # 60s window
recent_patterns = defaultdict(deque)  # pattern key -> timestamps of occurrences
|
||
|
||
def log(msg):
    """Append a timestamped line to TRIAGE_LOG and echo it to stdout.

    Messages routinely contain non-ASCII characters, so the file is written
    as UTF-8 explicitly rather than relying on the process locale. A failure
    to write the log file is reported on stderr instead of being raised:
    log() is called from callers' exception handlers, and letting it raise
    there would take the whole monitor down.

    Args:
        msg: Text of the log entry (without timestamp).
    """
    ts = datetime.now().isoformat(timespec='seconds')
    entry = f"[{ts}] {msg}"
    try:
        with open(TRIAGE_LOG, "a", encoding="utf-8") as f:
            f.write(entry + "\n")
    except OSError as e:
        print(f"[{ts}] triage log write failed: {e}", file=sys.stderr, flush=True)
    print(entry, flush=True)
|
||
|
||
def telegram(text):
    """Send *text* to the configured Telegram chat, subject to rate limiting.

    At most RATE_MAX send attempts are made per RATE_WIN seconds; attempts
    beyond that are logged and dropped. A slot is consumed by every attempt,
    whether or not the send succeeds.

    Args:
        text: Message body. Truncated to 4096 characters, the Bot API limit
            for sendMessage.

    Returns:
        True if curl exited successfully and the API did not report an error;
        False if the message was rate limited or the send failed.
    """
    now = time.time()
    # Drop attempts that have aged out of the rate-limit window.
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    recent_alerts.append(now)

    try:
        proc = subprocess.run([
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={text[:4096]}",
        ], timeout=10, capture_output=True)
    except Exception as e:
        log(f"telegram fail: {e}")
        return False

    if proc.returncode != 0:
        log(f"telegram fail: curl exit {proc.returncode}")
        return False

    # curl exits 0 on HTTP-level errors too, so inspect the API reply itself.
    try:
        reply = json.loads(proc.stdout or b"{}")
    except ValueError:
        reply = {}
    if isinstance(reply, dict) and reply.get("ok") is False:
        log(f"telegram fail: {reply.get('description', 'API error')}")
        return False
    return True
|
||
|
||
def dispatch_to_cc(session, msg):
    """Type a task into window 0 of a CC tmux session and press Enter.

    The message is built from log content (request paths, error messages),
    so it is treated as untrusted: non-printable characters are replaced by
    spaces so an embedded newline cannot submit a partial prompt, and the
    text is sent with ``send-keys -l`` so tmux never interprets it as key
    names. A successful dispatch is logged and recorded via
    record_decision(); a failed tmux call is logged and not recorded.

    Args:
        session: Name of the target tmux session (e.g. "cc4").
        msg: Task text to type into the session.

    Returns:
        True if both tmux calls succeeded, False otherwise.
    """
    target = f"{session}:0"
    safe_msg = "".join(ch if ch.isprintable() else " " for ch in msg)
    try:
        steps = (
            ["tmux", "send-keys", "-t", target, "-l", "--", safe_msg],
            ["tmux", "send-keys", "-t", target, "Enter"],
        )
        for idx, cmd in enumerate(steps):
            if idx:
                # Give the session a moment to take the text before Enter.
                time.sleep(1)
            proc = subprocess.run(cmd, check=False, capture_output=True, timeout=10)
            if proc.returncode != 0:
                detail = proc.stderr.decode("utf-8", "replace").strip()
                log(f"dispatch fail to {session}: {detail or f'tmux exit {proc.returncode}'}")
                return False
        log(f"dispatched to {session}: {msg[:80]}")
        record_decision({"action": "dispatch", "target": session, "msg": msg[:200]})
        return True
    except Exception as e:
        log(f"dispatch fail to {session}: {e}")
        return False
|
||
|
||
def record_decision(obj):
    """Stamp *obj* with the current time and append it to TRIAGE_DECISIONS.

    The "ts" key is set on the dict that was passed in (the caller's object
    is modified), then the dict is written as one JSON line.

    Args:
        obj: JSON-serialisable dict describing the decision.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    obj["ts"] = stamp
    with open(TRIAGE_DECISIONS, "a") as sink:
        record = json.dumps(obj)
        sink.write(record + "\n")
|
||
|
||
def pattern_count(key, since=None):
    """Return how many occurrences of *key* fall inside the sliding window.

    Timestamps older than the cutoff are discarded from the front of the
    key's deque as a side effect.

    Args:
        key: Pattern key as used with add_pattern().
        since: Cutoff timestamp (epoch seconds). Defaults to PATTERN_WIN
            seconds before now.

    Returns:
        Number of recorded occurrences at or after the cutoff.
    """
    cutoff = time.time() - PATTERN_WIN if since is None else since
    window = recent_patterns[key]
    while window:
        if window[0] >= cutoff:
            break
        window.popleft()
    return len(window)
|
||
|
||
def add_pattern(key):
    """Record one occurrence of *key* at the current time.

    Keys embed request paths, so the set of keys is unbounded. Nothing else
    removes a key from recent_patterns, so once the table grows large, keys
    whose newest occurrence is older than PATTERN_WIN are dropped here to
    keep memory bounded.

    Args:
        key: Pattern key, e.g. "5xx:/some/path".
    """
    now = time.time()
    recent_patterns[key].append(now)

    max_keys = 4096  # sweep only when the table is large, to keep adds cheap
    if len(recent_patterns) > max_keys:
        cutoff = now - PATTERN_WIN
        stale = [k for k, seen in recent_patterns.items() if not seen or seen[-1] < cutoff]
        for k in stale:
            del recent_patterns[k]
|
||
|
||
# Critical message signatures, checked in order (first match wins) and matched
# case-insensitively against the event's "msg". Compiled once at import time
# rather than on every event.
_CRIT_PATTERNS = tuple(
    (re.compile(pat, re.I), kind)
    for pat, kind in (
        (r"ImportError|ModuleNotFoundError", "import_error"),
        (r"AttributeError", "attribute_error"),
        (r"SyntaxError", "syntax_error"),
        (r"OperationalError.*could not connect", "db_connect_error"),
        # NOTE(review): "|" binds loosest, so this matches ANY mention of
        # "asyncpg", or "psycopg2 ... OperationalError". Left as-is; confirm
        # whether "(asyncpg|psycopg2).*OperationalError" was intended.
        (r"asyncpg|psycopg2.*OperationalError", "db_pool_error"),
        (r"FATAL|CRITICAL", "fatal"),
    )
)


def _event_size(ev):
    """Return the response size recorded on *ev* as a number, or None if absent."""
    for field in ("size", "size_download"):
        if field in ev:
            try:
                return float(ev[field])
            except (TypeError, ValueError):
                continue
    return None


def classify(line):
    """Classify one JSON log line and alert/dispatch when a pattern trips.

    The line is expected to be a JSON object; the fields read are "msg",
    "code", "path" and "method" (plus "size"/"size_download" for the
    empty-page check). Values are coerced to strings so a numeric or null
    field cannot crash the monitor. Lines that are not valid JSON objects
    are ignored.

    Patterns, evaluated in this order (a pattern that is counted but has not
    reached its threshold falls through to the later checks):
      A. 5xx on the same path, >= 3 in the window  -> telegram + cc4
      B. 401/403 on the same path, >= 5            -> telegram + cc2
      C. other 4xx under /sport/api/, >= 5         -> telegram + cc3
      D. critical signature in msg (every match)   -> telegram + cc4
      E. 200 with a recorded size < 100, >= 2      -> telegram + cc3

    Args:
        line: One raw line from the error log.

    Returns:
        A ``(kind, count, detail)`` tuple when a pattern fired, else None.
    """
    try:
        ev = json.loads(line)
    except (TypeError, ValueError):
        return None
    if not isinstance(ev, dict):
        return None

    msg = str(ev.get("msg") or "")
    path = str(ev.get("path") or "")
    method = str(ev.get("method") or "")
    raw_code = ev.get("code")
    code = "" if raw_code is None else str(raw_code)

    # --- Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 3:
            telegram(f"⚠️ 5xx spike: {method} {path} → {code} (×{n}/60s)")
            dispatch_to_cc("cc4", f"5xx detected: {method} {path} {code} occurring {n}x in 60s. Investigate /opt/pgz-sport/routers/ for the route handler. Check DB connection, log traceback. Run smoke test. Fix + restart pgz-sport + verify resolved.")
            recent_patterns[key].clear()  # reset after dispatch
            return ("5xx_spike", n, path)

    # --- Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc("cc2", f"Auth spike: {code} on {path} ×{n} times in 60s. Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. Verify role-based access control. Smoke test 3 demo accounts.")
            recent_patterns[key].clear()
            return ("auth_spike", n, path)

    # --- Pattern C: 4xx on consumer endpoints (frontend bug)
    if code.startswith("4") and code not in ("401", "403"):
        if path.startswith("/sport/api/"):
            key = f"4xx_api:{path[:80]}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 5:
                telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
                dispatch_to_cc("cc3", f"Frontend bug: {path} returning {code} ×{n}x. Frontend may call wrong URL or send bad payload. Check static/*.html for fetch/api() calls to {path}. Verify request shape matches backend schema.")
                recent_patterns[key].clear()
                return ("4xx_api", n, path)

    # --- Pattern D: ImportError / AttributeError / SyntaxError / DB errors in the backend
    for rx, kind in _CRIT_PATTERNS:
        if rx.search(msg):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            # Every critical kind, DB-related or not, goes to cc4.
            dispatch_to_cc("cc4", f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. Identify file:line, fix, py_compile, restart, verify. If db_connect_error, check Server B (10.10.0.2:6432) connectivity.")
            return (kind, 1, msg[:80])

    # --- Pattern E: Empty page detection
    if code == "200" and "size_download" in str(ev):
        # A missing or non-numeric size is not evidence of an empty page.
        # NOTE(review): assumes the size lives under "size" or "size_download";
        # confirm against the producer of errors.jsonl.
        size = _event_size(ev)
        if size is not None and size < 100:
            key = f"empty:{path}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 2:
                leaf = path.split("/")[-1]
                telegram(f"📄 Empty page: {path}")
                dispatch_to_cc("cc3", f"Empty page detected: {path} returning <100 bytes. Check static/{leaf} or backend response.")
                recent_patterns[key].clear()
                return ("empty_page", n, path)

    return None
|
||
|
||
def follow(path):
    """Follow *path* like ``tail -F`` and classify each new line; never returns.

    Waits for the file to exist, starts at its current end, and feeds every
    complete line appended afterwards to classify(). A line is only processed
    once its trailing newline has arrived, so a record caught mid-write is
    not parsed in two broken halves. If the file is replaced (inode change)
    or truncated, it is reopened and read from the start. Bytes that are not
    valid UTF-8 are replaced rather than raising, and an exception while
    classifying one line is logged without stopping the loop.

    Args:
        path: pathlib.Path of the JSONL file to follow.
    """
    while not path.exists():
        time.sleep(1)

    # Binary mode so tell() is a real byte offset comparable to st_size.
    f = open(path, "rb")
    try:
        f.seek(0, 2)  # EOF: only events written from now on are of interest
        inode = os.fstat(f.fileno()).st_ino
        pending = b""
        while True:
            chunk = f.readline()
            if chunk:
                pending += chunk
                if not pending.endswith(b"\n"):
                    continue  # partial write; wait for the rest of the line
                line = pending.decode("utf-8", errors="replace")
                pending = b""
                try:
                    result = classify(line)
                except Exception as e:
                    log(f"classify fail: {e}")
                    continue
                if result:
                    log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")
                continue

            # At EOF: check whether the file was rotated or truncated.
            try:
                st = os.stat(path)
            except OSError:
                st = None  # briefly missing during rotation; retry later
            if st is not None and (st.st_ino != inode or st.st_size < f.tell()):
                try:
                    fresh = open(path, "rb")
                except OSError:
                    fresh = None
                if fresh is not None:
                    f.close()
                    f = fresh
                    inode = os.fstat(f.fileno()).st_ino
                    pending = b""
                    log(f"reopened {path}")
                    continue
            time.sleep(0.5)
    finally:
        f.close()
|
||
|
||
if __name__ == "__main__":
    # Announce startup and the watched file, then follow it until interrupted.
    for banner in ("auto_triage starting", f"watching {LOG_FILE}"):
        log(banner)
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop a foreground run.
        log("shutdown")
|