#!/usr/bin/env python3
# Origin: commit aad034a59db98f678868184d8521ea5f04f0671a
#   Author: Damir Radulić, Tue, 5 May 2026 08:48:02 +0200
#   Subject: PHASE 5: sidebar Debug link + swarm_monitor.py daemon
#   - static/shared/sidebar.js: '🩺 Debug' link in pgz_admin sidebar
#   - scripts/swarm_monitor.py: detects stuck/idle CC agents, Telegram alerts
#     on session expired or limit prompts
#   - pgz-swarm-monitor.service running 60s checks
#   Full debug stack now active:
#   - pgz-debug-tail: error stream
#   - pgz-auto-triage: pattern → CC dispatch
#   - pgz-swarm-monitor: agent health
#   - /api/debug/* dashboard
"""
swarm_monitor.py — Watch CC agent tmux sessions and report stuck/idle states.

Every CHECK_INTERVAL_S seconds the visible pane of each session in SESSIONS
is captured with ``tmux capture-pane`` and its last TAIL_LINES lines are
matched against STUCK_PATTERNS. Each state change is appended to LOG once;
the kinds listed in ALERT_KINDS additionally trigger a Telegram message.

Telegram credentials are read from the environment variables named by
TELEGRAM_TOKEN_ENV and TELEGRAM_CHAT_ENV. They must never be committed.

SECURITY NOTE: an earlier revision of this script embedded a bot token and a
chat id directly in the source. That token has been exposed through version
control and should be revoked and re-issued.
"""
import http.client
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path

LOG = Path("/var/log/pgz-sport-debug/swarm_monitor.log")
SESSIONS = ['cc1', 'cc2', 'cc3', 'cc4', 'cc5', 'cc6', 'cc-di']

# Number of trailing pane lines that are inspected for stuck patterns.
TAIL_LINES = 10
# Seconds to sleep between two monitoring passes.
CHECK_INTERVAL_S = 60
# Kinds that are escalated to Telegram; every other kind is only logged.
ALERT_KINDS = ('session_expired', 'limit_prompt')
# Names of the environment variables holding the Telegram credentials.
TELEGRAM_TOKEN_ENV = "TELEGRAM_BOT_TOKEN"
TELEGRAM_CHAT_ENV = "TELEGRAM_CHAT_ID"

# Stuck patterns as (regex, kind). Order matters: the first match wins.
STUCK_PATTERNS = [
    (r"Could not load extra usage status", "session_expired"),
    (r"You've used \d+% of your weekly limit", "near_limit"),
    (r"Stop and wait for limit to reset", "limit_prompt"),
    (r"Add funds to continue", "funds_prompt"),
    (r"Switch to Team plan", "plan_prompt"),
    (r"^❯\s*$", "idle"),  # idle prompt with no input
]

# Last reported kind per session name. A session has no entry while none of
# the stuck patterns match its pane.
last_state = {}


def log(msg):
    """Append a timestamped line to LOG.

    If the log file cannot be written (missing or unwritable directory), the
    line is written to stderr instead so that logging never raises into the
    monitoring loop.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    line = f"[{ts}] {msg}\n"
    try:
        LOG.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG, "a", encoding="utf-8") as f:
            f.write(line)
    except OSError as e:
        sys.stderr.write(f"{line.rstrip()} (log write failed: {e})\n")


def capture(session):
    """Return the text of window 0 of tmux *session*, or None if unavailable.

    None is returned when tmux cannot be started, times out after 3 seconds,
    or exits with a non-zero status (for example because the session does not
    exist). Callers therefore never mistake a missing session for an empty,
    healthy pane.
    """
    try:
        r = subprocess.run(
            ['tmux', 'capture-pane', '-p', '-t', f'{session}:0'],
            capture_output=True, text=True, errors="replace", timeout=3,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if r.returncode != 0:
        return None
    return r.stdout


def classify_pane(text):
    """Return the kind of the first stuck pattern found in *text*, else None.

    Only the last TAIL_LINES lines of the stripped text are searched, so
    messages that have scrolled further up are ignored.
    """
    tail = '\n'.join(text.strip().split('\n')[-TAIL_LINES:])
    for pat, kind in STUCK_PATTERNS:
        if re.search(pat, tail, re.M):
            return kind
    return None


def send_alert(session, kind):
    """Send a Telegram message about *session* being in state *kind*.

    Returns True if the HTTP request completed, False if the credentials are
    not configured or the request failed. Failures are logged, never raised:
    alerting is best-effort and must not stop the monitor.
    """
    token = os.environ.get(TELEGRAM_TOKEN_ENV)
    chat_id = os.environ.get(TELEGRAM_CHAT_ENV)
    if not token or not chat_id:
        log(f"ALERT skipped for {session} = {kind}: "
            f"{TELEGRAM_TOKEN_ENV}/{TELEGRAM_CHAT_ENV} not set")
        return False
    payload = urllib.parse.urlencode({
        "chat_id": chat_id,
        "text": f"⚠️ CC agent {session} {kind} — needs attention",
    }).encode("utf-8")
    # The token is part of the URL path; it is deliberately kept out of any
    # log message and out of a child process command line.
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    try:
        with urllib.request.urlopen(url, data=payload, timeout=5) as resp:
            resp.read()
    except (OSError, ValueError, http.client.HTTPException) as e:
        log(f"ALERT failed for {session} = {kind}: {type(e).__name__}: {e}")
        return False
    return True


def check():
    """Run one monitoring pass over every session in SESSIONS.

    A state is logged (and, for ALERT_KINDS, sent to Telegram) only when it
    differs from the state last reported for that session. When no pattern
    matches, the session's entry in last_state is cleared so that the next
    detection is reported again. Sessions whose pane cannot be captured are
    skipped and keep their previous state.
    """
    for s in SESSIONS:
        text = capture(s)
        if text is None:
            continue
        kind = classify_pane(text)
        if kind is None:
            # Active state — clear the flag
            last_state.pop(s, None)
            continue
        if last_state.get(s) == kind:
            # Don't spam — already reported this state
            continue
        last_state[s] = kind
        log(f"DETECT {s} = {kind}")
        if kind in ALERT_KINDS:
            send_alert(s, kind)


def main():
    """Log a start message, then call check() every CHECK_INTERVAL_S seconds forever."""
    log("swarm_monitor starting")
    while True:
        try:
            check()
        except Exception as e:
            # Top-level boundary: a failing pass must not stop the daemon.
            log(f"check fail: {e}")
        time.sleep(CHECK_INTERVAL_S)


if __name__ == "__main__":
    main()