PHASE 5: sidebar Debug link + swarm_monitor.py daemon
- static/shared/sidebar.js: '🩺 Debug' link in pgz_admin sidebar
- scripts/swarm_monitor.py: detects stuck/idle CC agents,
Telegram alerts on session expired or limit prompts
- pgz-swarm-monitor.service running 60s checks
Full debug stack now active:
- pgz-debug-tail: error stream
- pgz-auto-triage: pattern → CC dispatch
- pgz-swarm-monitor: agent health
- /api/debug/* dashboard
This commit is contained in:
Executable
+78
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
swarm_monitor.py — Watch CC agents in tmux, detect stuck/idle sessions, log them and send Telegram alerts.
|
||||
"""
|
||||
import subprocess, time, re, json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Append-only log file written by log().
LOG = Path("/var/log/pgz-sport-debug") / "swarm_monitor.log"

# tmux session names inspected on every check() pass.
SESSIONS = [
    "cc1",
    "cc2",
    "cc3",
    "cc4",
    "cc5",
    "cc6",
    "cc-di",
]
|
||||
|
||||
# Screen patterns that mark an agent as stuck, as (regex, state label) pairs.
# check() stops at the first pair whose regex matches, so list order is
# also match priority.
STUCK_PATTERNS = [
    (r"Could not load extra usage status", "session_expired"),
    (r"You've used \d+% of your weekly limit", "near_limit"),
    (r"Stop and wait for limit to reset", "limit_prompt"),
    (r"Add funds to continue", "funds_prompt"),
    (r"Switch to Team plan", "plan_prompt"),
    # A prompt glyph alone on its line (searched with re.M): nothing typed.
    (r"^❯\s*$", "idle"),
]
|
||||
|
||||
# Dedup memory for check(): maps "<session>:<kind>" to the kind that was last
# reported for that session, so a persisting state is logged only once.
# Process-local; it is empty again after every restart.
last_state = {}
|
||||
|
||||
def log(msg):
    """Append one timestamped line to the monitor log file (LOG).

    The line has the form ``[<local ISO-8601 time, seconds precision>] <msg>``.

    Args:
        msg: Text to record; written verbatim after the timestamp.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    # Explicit UTF-8: the locale default may be ASCII when run as a service,
    # and a message (e.g. exception text) can contain non-ASCII characters.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(f"[{ts}] {msg}\n")
|
||||
|
||||
def capture(session):
    """Return the visible text of window 0 of a tmux session, or None.

    Runs ``tmux capture-pane -p -t <session>:0`` with a 3 second timeout.

    Args:
        session: Name of the tmux session to read.

    Returns:
        The captured pane text, or None when the screen could not be read:
        tmux could not be started, timed out, produced undecodable output,
        or exited with a non-zero status.
    """
    try:
        r = subprocess.run(
            ['tmux', 'capture-pane', '-p', '-t', f'{session}:0'],
            capture_output=True, text=True, timeout=3,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: tmux missing / not executable; SubprocessError: timeout;
        # ValueError: output could not be decoded as text.
        return None
    # A failed capture leaves stdout empty. Returning that empty string would
    # make an unreadable session indistinguishable from a healthy screen.
    if r.returncode != 0:
        return None
    return r.stdout
|
||||
|
||||
def _match_stuck_kind(tail):
    """Return the label of the first STUCK_PATTERNS entry matching `tail`, or None."""
    for pat, kind in STUCK_PATTERNS:
        if re.search(pat, tail, re.M):
            return kind
    return None


def _send_alert(session, kind):
    """Send a best-effort Telegram message about `session`; log any failure."""
    import os  # local import: the module header does not import os

    # NOTE(review): a bot token and chat id are embedded in source and have
    # been committed. They remain only as fallbacks so existing deployments
    # keep alerting; rotate the token and supply both values through the
    # environment instead.
    token = os.environ.get(
        "PGZ_TELEGRAM_BOT_TOKEN",
        "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y",
    )
    chat_id = os.environ.get("PGZ_TELEGRAM_CHAT_ID", "7969491558")
    try:
        r = subprocess.run([
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{token}/sendMessage",
            "-d", f"chat_id={chat_id}",
            "--data-urlencode", f"text=⚠️ CC agent {session} {kind} — needs attention",
        ], capture_output=True, timeout=5)
    except (OSError, subprocess.SubprocessError) as exc:
        # Alerting stays best-effort, but a lost alert is now visible in the log.
        log(f"ALERT FAIL {session} {kind}: {exc}")
        return
    if r.returncode != 0:
        log(f"ALERT FAIL {session} {kind}: curl exit {r.returncode}")


def check():
    """Inspect every session once and report newly detected stuck states.

    For each name in SESSIONS the last 10 lines of the captured screen are
    searched with STUCK_PATTERNS; the first matching pattern gives the
    session's state. A state is logged (and, for 'session_expired' and
    'limit_prompt', sent to Telegram) only when it was not already the
    recorded state of that session. Sessions whose screen cannot be
    captured are skipped and keep their recorded state.

    last_state holds at most one "<session>:<kind>" entry per session.
    Entries for any other kind are dropped on every pass, so a state that
    returns after a different state (or after activity) is reported again.
    """
    for s in SESSIONS:
        text = capture(s)
        if text is None:
            continue
        tail = '\n'.join(text.strip().split('\n')[-10:])
        kind = _match_stuck_kind(tail)
        key = None if kind is None else f"{s}:{kind}"

        # Forget everything recorded for this session except the current
        # state. With no match (active agent) this clears all of its flags.
        prefix = f"{s}:"
        for stale in [k for k in last_state if k.startswith(prefix) and k != key]:
            del last_state[stale]

        # Active agent, or the same state as last pass: nothing to report.
        if key is None or key in last_state:
            continue

        last_state[key] = kind
        log(f"DETECT {s} = {kind}")
        # NOTE(review): the original comment says these alerts are already
        # rate-limited via auto_triage; that is not visible in this file.
        if kind in ('session_expired', 'limit_prompt'):
            _send_alert(s, kind)
|
||||
|
||||
def main(interval=60):
    """Run the monitor loop forever.

    Writes a startup line to the log, then repeatedly calls check() and
    sleeps. The function never returns normally.

    Args:
        interval: Seconds to sleep between the end of one pass and the start
            of the next. Defaults to 60, the previously hard-coded value.
    """
    log("swarm_monitor starting")
    while True:
        try:
            check()
        except Exception as e:
            # Top-level boundary: one failed pass is logged and the loop
            # carries on rather than stopping the daemon.
            log(f"check fail: {e}")
        time.sleep(interval)
|
||||
|
||||
# Start the monitor loop only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user