PHASE 5: sidebar Debug link + swarm_monitor.py daemon
- static/shared/sidebar.js: '🩺 Debug' link in pgz_admin sidebar
- scripts/swarm_monitor.py: detects stuck/idle CC agents,
Telegram alerts on session expired or limit prompts
- pgz-swarm-monitor.service running 60s checks
Full debug stack now active:
- pgz-debug-tail: error stream
- pgz-auto-triage: pattern → CC dispatch
- pgz-swarm-monitor: agent health
- /api/debug/* dashboard
This commit is contained in:
Executable
+78
@@ -0,0 +1,78 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
swarm_monitor.py — Watch CC agents in tmux, detect stuck/idle states, log them and send Telegram alerts.
|
||||||
|
"""
|
||||||
|
import subprocess, time, re, json
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# File that log() appends timestamped lines to.
LOG = Path("/var/log/pgz-sport-debug/swarm_monitor.log")

# tmux session names inspected on every check() pass.
SESSIONS = [
    "cc1",
    "cc2",
    "cc3",
    "cc4",
    "cc5",
    "cc6",
    "cc-di",
]

# (regex, kind) pairs searched against the tail of each captured pane.
# Order matters: check() stops at the first entry that matches.
STUCK_PATTERNS = [
    (r"Could not load extra usage status", "session_expired"),
    (r"You've used \d+% of your weekly limit", "near_limit"),
    (r"Stop and wait for limit to reset", "limit_prompt"),
    (r"Add funds to continue", "funds_prompt"),
    (r"Switch to Team plan", "plan_prompt"),
    # A bare prompt character on its own line: nothing has been typed.
    (r"^❯\s*$", "idle"),
]

# Already-reported conditions, keyed "<session>:<kind>" with the kind as
# value; check() uses it to avoid logging the same condition every pass.
last_state = {}
|
||||||
|
|
||||||
|
def log(msg):
    """Append *msg* to the LOG file as one ``[timestamp] msg`` line.

    The timestamp is the local time in ISO format, truncated to seconds.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    line = f"[{stamp}] {msg}\n"
    with LOG.open("a") as handle:
        handle.write(line)
|
||||||
|
|
||||||
|
def capture(session):
    """Return the visible text of window 0 of tmux session *session*.

    Returns:
        The pane contents as a string, or ``None`` when the pane could not
        be captured: tmux is not installed, the call timed out, the output
        could not be decoded, or tmux exited non-zero (for example because
        the session does not exist).
    """
    try:
        result = subprocess.run(
            ['tmux', 'capture-pane', '-p', '-t', f'{session}:0'],
            capture_output=True, text=True, timeout=3,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: tmux binary missing / not executable.
        # SubprocessError: covers TimeoutExpired.
        # ValueError: undecodable output or an invalid argument string.
        return None
    # A failed capture prints nothing on stdout; without this check the
    # empty string would be indistinguishable from a healthy, blank pane.
    if result.returncode != 0:
        return None
    return result.stdout
|
||||||
|
|
||||||
|
def _classify(tail):
    """Return the kind of the first STUCK_PATTERNS entry matching *tail*, or None."""
    for pattern, kind in STUCK_PATTERNS:
        if re.search(pattern, tail, re.M):
            return kind
    return None


def _send_telegram_alert(text):
    """Send *text* through the Telegram Bot API; best effort, never raises.

    Credentials are read from the environment variables
    ``PGZ_TELEGRAM_BOT_TOKEN`` and ``PGZ_TELEGRAM_CHAT_ID`` so that no
    secret lives in the source tree or on a command line. The service
    running this script must export both.

    Returns:
        True when the request completed, False when it was skipped or failed.
    """
    import os
    import urllib.parse
    import urllib.request

    token = os.environ.get("PGZ_TELEGRAM_BOT_TOKEN")
    chat_id = os.environ.get("PGZ_TELEGRAM_CHAT_ID")
    if not token or not chat_id:
        log("telegram alert skipped: PGZ_TELEGRAM_BOT_TOKEN / PGZ_TELEGRAM_CHAT_ID not set")
        return False

    url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = urllib.parse.urlencode({"chat_id": chat_id, "text": text}).encode("utf-8")
    try:
        # Passing data= makes urlopen issue a form-encoded POST.
        with urllib.request.urlopen(url, data=payload, timeout=5) as response:
            response.read()
    except Exception as exc:
        # Alerting must never abort the sweep over the remaining sessions.
        # Only the exception type is logged so the token cannot leak.
        log(f"telegram alert failed: {type(exc).__name__}")
        return False
    return True


def check():
    """Inspect every session in SESSIONS once and report newly stuck agents.

    For each session the last 10 lines of the captured pane are matched
    against STUCK_PATTERNS. The first time a session shows a given kind it
    is logged, and for ``session_expired`` / ``limit_prompt`` a Telegram
    alert is sent as well. Sessions whose pane cannot be captured are
    skipped without touching their recorded state.

    ``last_state`` keeps at most one ``"<session>:<kind>"`` entry per
    session: entries for any other kind are dropped as soon as the session
    changes state (or matches nothing), so a condition that goes away and
    later returns is reported again.
    """
    for session in SESSIONS:
        text = capture(session)
        if text is None:
            continue

        tail = '\n'.join(text.strip().split('\n')[-10:])
        kind = _classify(tail)

        prefix = f"{session}:"
        current = f"{prefix}{kind}" if kind is not None else None

        # Forget flags that no longer describe this session.
        stale = [k for k in last_state if k.startswith(prefix) and k != current]
        for key in stale:
            del last_state[key]

        # Nothing matched (agent is active) or already reported: stay quiet.
        if current is None or current in last_state:
            continue

        last_state[current] = kind
        log(f"DETECT {session} = {kind}")
        if kind in ('session_expired', 'limit_prompt'):
            _send_telegram_alert(f"⚠️ CC agent {session} {kind} — needs attention")
|
||||||
|
|
||||||
|
def main():
    """Run the monitor forever: one check() pass, then a 60 second pause.

    An exception raised by a pass is written to the log and the loop
    carries on with the next pass.
    """
    def _sweep_once():
        # Keep a failing pass from terminating the daemon.
        try:
            check()
        except Exception as err:
            log(f"check fail: {err}")

    log("swarm_monitor starting")
    while True:
        _sweep_once()
        time.sleep(60)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user