aad034a59d
- static/shared/sidebar.js: '🩺 Debug' link in pgz_admin sidebar
- scripts/swarm_monitor.py: detects stuck/idle CC agents and sends
Telegram alerts on session-expired or limit prompts
- pgz-swarm-monitor.service running 60s checks
Full debug stack now active:
- pgz-debug-tail: error stream
- pgz-auto-triage: pattern → CC dispatch
- pgz-swarm-monitor: agent health
- /api/debug/* dashboard
79 lines
2.6 KiB
Python
Executable File
79 lines
2.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
swarm_monitor.py — Watch CC agents, detect stuck/idle sessions, log them and send Telegram alerts.
|
||
"""
|
||
import subprocess, time, re, json
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# File that log() appends its timestamped lines to.
LOG = Path("/var/log/pgz-sport-debug/swarm_monitor.log")

# tmux session names inspected on every check() round.
SESSIONS = ["cc1", "cc2", "cc3", "cc4", "cc5", "cc6", "cc-di"]

# (regex, state label) pairs searched for in the tail of a captured pane.
STUCK_PATTERNS = [
    (r"Could not load extra usage status", "session_expired"),
    (r"You've used \d+% of your weekly limit", "near_limit"),
    (r"Stop and wait for limit to reset", "limit_prompt"),
    (r"Add funds to continue", "funds_prompt"),
    (r"Switch to Team plan", "plan_prompt"),
    (r"^❯\s*$", "idle"),  # prompt line with nothing typed after it
]

# States already reported, keyed "<session>:<state>" -> state.
last_state = {}
|
||
|
||
def log(msg):
    """Append one timestamped line to the monitor log file.

    The line is written as ``[<local ISO-8601 time, seconds precision>] msg``
    to the path held in the module-level ``LOG``.

    Args:
        msg: text to record; it is interpolated into the line as-is.

    Raises:
        OSError: if the log directory cannot be created or the file cannot
            be opened for appending.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    # Create the directory on demand so a fresh host does not crash the
    # monitor on its very first log call.
    LOG.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: messages may carry non-ASCII text (e.g. exception
    # strings), and the locale default under a service manager may be ASCII.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(f"[{ts}] {msg}\n")
|
||
|
||
def capture(session):
    """Return the text currently shown in window 0 of a tmux session.

    Runs ``tmux capture-pane -p -t <session>:0`` with a 3 second timeout.

    Args:
        session: tmux session name.

    Returns:
        The captured pane contents as a string, or ``None`` when tmux could
        not be started, timed out, produced undecodable output, or exited
        with a non-zero status (presumably the case for a session that does
        not exist — confirm against tmux behaviour).
    """
    try:
        result = subprocess.run(
            ['tmux', 'capture-pane', '-p', '-t', f'{session}:0'],
            capture_output=True, text=True, timeout=3,
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # tmux missing, timed out, or output not decodable: nothing to inspect.
        return None
    # A failed capture yields empty stdout; report it as "no data" rather
    # than as an empty (and therefore seemingly healthy) pane.
    if result.returncode != 0:
        return None
    return result.stdout
|
||
|
||
def _send_telegram_alert(session, kind):
    """Send a best-effort Telegram alert for *session* via curl, logging failures."""
    # SECURITY(review): the bot token and chat id are hard-coded here. They
    # should be moved to an environment variable / secret store and the token
    # rotated, since it is committed to the repository.
    # NOTE(review): the original comment said alerts are "rate-limited via
    # auto_triage already"; that mechanism is not visible in this file.
    try:
        result = subprocess.run([
            "curl", "-s", "-X", "POST",
            "https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
            "-d", "chat_id=7969491558",
            "--data-urlencode", f"text=⚠️ CC agent {session} {kind} — needs attention"
        ], capture_output=True, timeout=5)
    except (OSError, subprocess.SubprocessError) as exc:
        # Alerting stays best-effort, but a failure is recorded instead of
        # being silently discarded.
        log(f"telegram alert failed for {session}: {exc}")
        return
    if result.returncode != 0:
        log(f"telegram alert failed for {session}: curl exit {result.returncode}")


def check():
    """Inspect every session once and report newly detected stuck states.

    For each name in ``SESSIONS`` the pane text is captured and its last ten
    lines are searched with each ``STUCK_PATTERNS`` regex in order. The first
    pattern that matches decides the state:

    * the first time a ``<session>:<state>`` pair is seen it is recorded in
      ``last_state`` and a ``DETECT`` line is logged;
    * for ``session_expired`` and ``limit_prompt`` a Telegram alert is sent
      as well.

    When no pattern matches, every ``last_state`` entry of that session is
    removed, so a later recurrence is reported again. Sessions whose capture
    returns ``None`` are skipped without touching their recorded state.
    """
    for s in SESSIONS:
        text = capture(s)
        if text is None:
            continue
        tail = '\n'.join(text.strip().split('\n')[-10:])
        for pat, kind in STUCK_PATTERNS:
            if not re.search(pat, tail, re.M):
                continue
            key = f"{s}:{kind}"
            # Don't spam — only log (and alert) the first time.
            if last_state.get(key) != kind:
                last_state[key] = kind
                log(f"DETECT {s} = {kind}")
                if kind in ('session_expired', 'limit_prompt'):
                    _send_telegram_alert(s, kind)
            break
        else:
            # No stuck pattern matched: the session is active, clear its flags.
            prefix = f"{s}:"
            for stale in [k for k in last_state if k.startswith(prefix)]:
                del last_state[stale]
|
||
|
||
def main(interval=60):
    """Run check() in an endless loop, sleeping between rounds.

    A start-up line is logged first. Any exception raised by a round is
    logged and the loop continues with the next round.

    Args:
        interval: seconds to sleep after each round (default 60, the value
            that was previously hard-coded).

    This function does not return.
    """
    log("swarm_monitor starting")
    while True:
        try:
            check()
        except Exception as e:
            # Top-level boundary: one failed round must not stop the monitor.
            # The exception type is included because str(e) can be empty.
            log(f"check fail: {type(e).__name__}: {e}")
        time.sleep(interval)
|
||
|
||
# Start the monitor loop only when this file is executed as a script.
if __name__ == "__main__":
    main()
|