PHASE 5: sidebar Debug link + swarm_monitor.py daemon

- static/shared/sidebar.js: '🩺 Debug' link in pgz_admin sidebar
- scripts/swarm_monitor.py: detects stuck/idle CC agents,
  Telegram alerts on session expired or limit prompts
- pgz-swarm-monitor.service runs a check every 60 s

Full debug stack now active:
- pgz-debug-tail: error stream
- pgz-auto-triage: pattern → CC dispatch
- pgz-swarm-monitor: agent health
- /api/debug/* dashboard
This commit is contained in:
2026-05-05 08:48:02 +02:00
parent 52db3d91a4
commit aad034a59d
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
swarm_monitor.py — Watch CC agents in tmux, detect stuck/idle sessions, log them and send Telegram alerts.
"""
import subprocess, time, re, json
from datetime import datetime
from pathlib import Path
# Log file that log() appends to.
LOG = Path("/var/log/pgz-sport-debug") / "swarm_monitor.log"

# tmux session names inspected on every check() pass.
SESSIONS = [
    "cc1",
    "cc2",
    "cc3",
    "cc4",
    "cc5",
    "cc6",
    "cc-di",
]

# (regex, kind) pairs. check() tries them in this order against the tail of
# each pane and stops at the first match, so earlier entries take priority.
STUCK_PATTERNS = [
    (r"Could not load extra usage status", "session_expired"),
    (r"You've used \d+% of your weekly limit", "near_limit"),
    (r"Stop and wait for limit to reset", "limit_prompt"),
    (r"Add funds to continue", "funds_prompt"),
    (r"Switch to Team plan", "plan_prompt"),
    # Intended as "idle prompt with no input".
    # NOTE(review): check() searches with re.M, so this matches ANY blank
    # line inside the inspected tail, not only an empty prompt. Confirm that
    # is the intended meaning of "idle".
    (r"^\s*$", "idle"),
]

# Detections already reported, keyed "<session>:<kind>" with the kind as the
# value; check() uses it to avoid logging the same state repeatedly.
last_state = {}
def log(msg):
    """Append *msg* to the LOG file, prefixed with a local ISO timestamp.

    Each call writes one line of the form ``[YYYY-MM-DDTHH:MM:SS] msg``.

    The log directory is created on demand and the file is written as UTF-8
    so non-ASCII text never depends on the process locale. Logging is
    best-effort: if the file cannot be written, the line goes to stderr
    instead of raising, so a logging problem cannot take the monitor down.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    line = f"[{ts}] {msg}\n"
    try:
        LOG.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG, "a", encoding="utf-8") as f:
            f.write(line)
    except OSError:
        # Function-scope import: only needed on this fallback path.
        import sys
        sys.stderr.write(line)
def capture(session):
    """Return the visible text of window 0 of the tmux session *session*.

    Runs ``tmux capture-pane -p -t <session>:0`` with a 3-second timeout.

    Returns:
        The pane contents as a string, or ``None`` when the pane could not
        be read: tmux is not installed, the call timed out, or tmux exited
        with a non-zero status (for example, the session does not exist).
    """
    cmd = ['tmux', 'capture-pane', '-p', '-t', f'{session}:0']
    try:
        # errors="replace" keeps a single undecodable byte in the pane from
        # turning the whole capture into a failure.
        r = subprocess.run(cmd, capture_output=True, text=True,
                           errors="replace", timeout=3)
    except (subprocess.SubprocessError, OSError):
        return None
    # A failed tmux call leaves stdout empty; handing that empty string back
    # would let callers mistake an unreadable session for a blank pane.
    if r.returncode != 0:
        return None
    return r.stdout
def _send_telegram_alert(session, kind):
    """Best-effort Telegram notification that *session* is in state *kind*.

    The bot token is read from the ``PGZ_TELEGRAM_BOT_TOKEN`` environment
    variable (e.g. supplied through the service unit) so the credential does
    not live in the source tree. The chat id is read from
    ``PGZ_TELEGRAM_CHAT_ID`` and defaults to the chat this monitor has been
    posting to. Failures are logged, never raised.

    NOTE(review): the token previously committed in this file must be
    treated as leaked and rotated.
    NOTE(review): an earlier comment said alerts are "rate-limited via
    auto_triage already"; nothing here enforces that beyond the
    once-per-state de-duplication done by check() - confirm.
    """
    import os  # function-scope import keeps this block self-contained

    token = os.environ.get("PGZ_TELEGRAM_BOT_TOKEN")
    if not token:
        log(f"ALERT SKIPPED {session} = {kind}: PGZ_TELEGRAM_BOT_TOKEN not set")
        return
    chat_id = os.environ.get("PGZ_TELEGRAM_CHAT_ID", "7969491558")
    try:
        r = subprocess.run([
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{token}/sendMessage",
            "-d", f"chat_id={chat_id}",
            "--data-urlencode", f"text=⚠️ CC agent {session} {kind} — needs attention",
        ], capture_output=True, timeout=5)
    except (subprocess.SubprocessError, OSError) as exc:
        # Log only the exception type: the message of a TimeoutExpired
        # includes the full command line, which contains the bot token.
        log(f"ALERT FAILED {session} = {kind}: {type(exc).__name__}")
        return
    if r.returncode != 0:
        log(f"ALERT FAILED {session} = {kind}: curl exit {r.returncode}")


def check():
    """Inspect every session in SESSIONS once and report newly stuck agents.

    For each session the last 10 lines of the captured pane are matched
    against STUCK_PATTERNS in order; the first matching pattern decides the
    session's state. A state is logged as ``DETECT <session> = <kind>`` only
    when it is not already recorded in ``last_state``, and the kinds
    ``session_expired`` and ``limit_prompt`` additionally trigger a Telegram
    alert. Sessions whose pane cannot be captured are skipped.

    ``last_state`` holds at most one ``"<session>:<kind>"`` flag per session:
    flags for any other kind are dropped whenever the state changes (or no
    pattern matches), so a session that returns to an earlier stuck state is
    reported again instead of being suppressed by a stale flag.
    """
    for s in SESSIONS:
        text = capture(s)
        if text is None:
            continue
        # Only the bottom of the pane reflects what the agent shows now.
        tail = '\n'.join(text.strip().split('\n')[-10:])
        kind = next(
            (k for pat, k in STUCK_PATTERNS if re.search(pat, tail, re.M)),
            None,
        )
        key = None if kind is None else f"{s}:{kind}"
        prefix = f"{s}:"
        # Forget every flag of this session except the current state's.
        for stale in [k for k in last_state
                      if k.startswith(prefix) and k != key]:
            del last_state[stale]
        # Nothing matched (active), or this state was already reported.
        if key is None or last_state.get(key) == kind:
            continue
        last_state[key] = kind
        log(f"DETECT {s} = {kind}")
        if kind in ('session_expired', 'limit_prompt'):
            _send_telegram_alert(s, kind)
def main():
    """Run the monitor loop forever.

    Logs a start-up line, then repeatedly calls check() and sleeps for
    60 seconds. Any Exception raised by check() is logged as
    ``check fail: <error>`` and the loop carries on with the next pass.
    """
    poll_seconds = 60
    log("swarm_monitor starting")
    while True:
        try:
            check()
        except Exception as exc:
            # One failing pass must not stop the daemon.
            log(f"check fail: {exc}")
        time.sleep(poll_seconds)
# Start the monitor loop only when executed as a script, not on import.
if __name__ == "__main__":
    main()