Files
pgz-sport/scripts/swarm_monitor.py
T
damir aad034a59d PHASE 5: sidebar Debug link + swarm_monitor.py daemon
- static/shared/sidebar.js: '🩺 Debug' link in pgz_admin sidebar
- scripts/swarm_monitor.py: detects stuck/idle CC agents,
  Telegram alerts on session expired or limit prompts
- pgz-swarm-monitor.service running 60s checks

Full debug stack now active:
- pgz-debug-tail: error stream
- pgz-auto-triage: pattern → CC dispatch
- pgz-swarm-monitor: agent health
- /api/debug/* dashboard
2026-05-05 08:48:02 +02:00

79 lines
2.6 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
swarm_monitor.py — Watch CC agent tmux sessions, detect stuck/idle panes, log them and send Telegram alerts.
"""
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
# Append-only log file that log() writes to.
LOG = Path("/var/log/pgz-sport-debug/swarm_monitor.log")

# tmux session names inspected on every check() pass (window 0 of each).
SESSIONS = ['cc1', 'cc2', 'cc3', 'cc4', 'cc5', 'cc6', 'cc-di']

# Stuck patterns: (regex, kind) pairs searched with re.MULTILINE against the
# last lines of a pane. Order matters — the first pattern that matches decides
# the kind reported for the session.
STUCK_PATTERNS = [
    (r"Could not load extra usage status", "session_expired"),
    (r"You've used \d+% of your weekly limit", "near_limit"),
    (r"Stop and wait for limit to reset", "limit_prompt"),
    (r"Add funds to continue", "funds_prompt"),
    (r"Switch to Team plan", "plan_prompt"),
    # Idle: the inspected tail is entirely blank. Anchored with \A/\Z (which
    # re.MULTILINE does not affect) because the previous ^\s*$ matched ANY
    # blank line inside the tail, classifying nearly every pane as idle.
    # NOTE(review): assumes "idle" means a blank pane — confirm against the
    # real CC prompt layout.
    (r"\A\s*\Z", "idle"),
]

# Flags for stuck states already reported, keyed "<session>:<kind>", so the
# same condition is not logged/alerted on every pass.
last_state = {}
def log(msg):
    """Append *msg* to the LOG file, prefixed with a local timestamp.

    The line format is ``[YYYY-MM-DDTHH:MM:SS] msg``. Logging is best-effort:
    the log directory is created when missing, and if the file still cannot
    be written the line goes to stderr instead of raising, so a logging
    problem cannot take down the monitoring loop.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    line = f"[{ts}] {msg}\n"
    try:
        LOG.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG, "a", encoding="utf-8") as f:
            f.write(line)
    except OSError as exc:
        # Fall back to stderr so the message is not lost.
        print(f"swarm_monitor: cannot write {LOG}: {exc}", file=sys.stderr)
        print(line, end="", file=sys.stderr)
def capture(session):
    """Return the text of window 0 of tmux *session*, or None on failure.

    Runs ``tmux capture-pane -p -t <session>:0`` with a 3 second timeout.

    Returns:
        The captured pane contents as a string, or None when tmux cannot be
        run, times out, produces undecodable output, or exits non-zero
        (for example because the session does not exist).
    """
    try:
        result = subprocess.run(
            ['tmux', 'capture-pane', '-p', '-t', f'{session}:0'],
            capture_output=True, text=True, timeout=3,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: tmux missing / not executable; ValueError: bad argument or
        # undecodable output; SubprocessError: timeout.
        return None
    if result.returncode != 0:
        # A failed capture has empty stdout; returning it would make a
        # missing session indistinguishable from a blank pane.
        return None
    return result.stdout
def _clear_session_flags(session):
    """Forget every remembered stuck-state flag belonging to *session*."""
    prefix = f"{session}:"
    for key in [k for k in last_state if k.startswith(prefix)]:
        del last_state[key]


def _send_telegram_alert(session, kind):
    """Best-effort Telegram notification that *session* is in state *kind*.

    The bot token is read from the PGZ_TELEGRAM_BOT_TOKEN environment
    variable rather than being stored in source; the chat id comes from
    PGZ_TELEGRAM_CHAT_ID and defaults to the previously configured chat.
    Failures are logged, never raised.
    """
    token = os.environ.get("PGZ_TELEGRAM_BOT_TOKEN")
    chat_id = os.environ.get("PGZ_TELEGRAM_CHAT_ID", "7969491558")
    if not token:
        log(f"ALERT SKIPPED {session} = {kind}: PGZ_TELEGRAM_BOT_TOKEN not set")
        return
    # NOTE(review): the token is still passed on curl's command line and is
    # therefore visible in the local process list while curl runs.
    try:
        result = subprocess.run([
            "curl", "-s", "--fail", "-X", "POST",
            f"https://api.telegram.org/bot{token}/sendMessage",
            "-d", f"chat_id={chat_id}",
            "--data-urlencode", f"text=⚠️ CC agent {session} {kind} — needs attention",
        ], capture_output=True, timeout=5)
    except (OSError, subprocess.SubprocessError) as exc:
        # Log only the exception type: its message embeds the full command
        # line, which contains the bot token.
        log(f"ALERT FAILED {session} = {kind}: {type(exc).__name__}")
        return
    if result.returncode != 0:
        log(f"ALERT FAILED {session} = {kind}: curl exit {result.returncode}")


def check():
    """Inspect every session in SESSIONS once and report stuck states.

    For each session the last 10 lines of the captured pane are searched
    against STUCK_PATTERNS in order; the first matching pattern decides the
    kind. A kind is logged only when it is newly seen for that session, and
    'session_expired' / 'limit_prompt' additionally trigger a Telegram alert.
    When no pattern matches, the session counts as active and its remembered
    flags are cleared so a later recurrence is reported again. Sessions
    whose pane cannot be captured are skipped and keep their flags.
    """
    for session in SESSIONS:
        text = capture(session)
        if text is None:
            continue
        tail = '\n'.join(text.strip().split('\n')[-10:])
        kind = next(
            (k for pat, k in STUCK_PATTERNS if re.search(pat, tail, re.M)),
            None,
        )
        if kind is None:
            # Active state — clear flags.
            _clear_session_flags(session)
            continue
        key = f"{session}:{kind}"
        if key in last_state:
            # Don't spam — this state was already reported.
            continue
        # The session moved to a different kind: drop flags of the previous
        # kind so that returning to it later is reported again.
        _clear_session_flags(session)
        last_state[key] = kind
        log(f"DETECT {session} = {kind}")
        if kind in ('session_expired', 'limit_prompt'):
            _send_telegram_alert(session, kind)
def main(interval=60):
    """Run the monitoring loop forever.

    Logs a start-up line, then calls check() every *interval* seconds
    (default 60). Any Exception raised by a pass is logged with its type and
    message and the loop continues; the function never returns normally.
    """
    log("swarm_monitor starting")
    while True:
        try:
            check()
        except Exception as exc:
            # Top-level boundary: one failed pass must not stop the daemon.
            # The type name is included because some messages (e.g. a bare
            # KeyError key) are unreadable on their own.
            log(f"check fail: {type(exc).__name__}: {exc}")
        time.sleep(interval)


if __name__ == "__main__":
    main()