#!/usr/bin/env python3
"""auto_triage.py — active error monitor for the pgz-sport stack.

Tails /var/log/pgz-sport-debug/errors.jsonl, classifies the errors, and
automatically dispatches tasks to the CC agents when a pattern is detected.

Patterns:
- Recurring 5xx → CC4 (backend)
- 401/403 spike → CC2 (auth)
- 4xx on a specific page → CC3 (frontend route)
- DB connection error → CC4 + urgent telegram
- ImportError/AttributeError in pgz-sport → CC4 dispatch + restart attempt
"""
import json
import os
import re
import subprocess
import sys
import time
from collections import defaultdict, deque
from datetime import datetime
from pathlib import Path

LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# SECURITY NOTE(review): a live bot token is committed in source. The
# environment variables below take precedence; the literals remain only as a
# backward-compatible fallback and should be rotated and removed.
TG_TOKEN = os.environ.get(
    "AUTO_TRIAGE_TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
)
TG_CHAT = os.environ.get("AUTO_TRIAGE_TG_CHAT", "7969491558")

# Rate limit: no more than RATE_MAX telegram messages per RATE_WIN seconds.
RATE_WIN = 300  # seconds
RATE_MAX = 6
recent_alerts = deque()

# Pattern counts (sliding window): key -> deque of event timestamps.
PATTERN_WIN = 60  # 60s window
recent_patterns = defaultdict(deque)

# Upper bound, in seconds, for a single tmux invocation.
TMUX_TIMEOUT = 10

# ASCII control characters (including CR/LF). Typed into a terminal these act
# as key presses, so they are removed from anything sent to tmux.
_CONTROL_CHARS = re.compile(r"[\x00-\x1f\x7f]+")

# Critical message patterns, checked in order; the first match wins.
# NOTE(review): in the "db_pool_error" pattern the alternation binds loosest,
# so any message that merely mentions "asyncpg" matches — confirm intended.
_CRIT_PATTERNS = tuple(
    (re.compile(pattern, re.I), kind)
    for pattern, kind in (
        (r"ImportError|ModuleNotFoundError", "import_error"),
        (r"AttributeError", "attribute_error"),
        (r"SyntaxError", "syntax_error"),
        (r"OperationalError.*could not connect", "db_connect_error"),
        (r"asyncpg|psycopg2.*OperationalError", "db_pool_error"),
        (r"FATAL|CRITICAL", "fatal"),
    )
)

# Timestamp of the last full sweep of ``recent_patterns``.
_last_sweep = 0.0


def log(msg):
    """Append a timestamped line to TRIAGE_LOG and echo it to stdout.

    A failure to write the log file is reported on stderr instead of being
    raised, so an unwritable log directory cannot stop the monitor.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    line = f"[{ts}] {msg}"
    try:
        with open(TRIAGE_LOG, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError as e:
        print(f"[{ts}] triage log write failed: {e}", file=sys.stderr, flush=True)
    print(line, flush=True)


def telegram(text):
    """Send ``text`` to the configured Telegram chat via curl.

    At most RATE_MAX messages are attempted per RATE_WIN seconds. Returns
    True only when curl exits successfully; returns False when rate limited
    or when the send fails.
    """
    now = time.time()
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    # The slot is consumed before sending, so failed attempts also count
    # against the limit and an outage cannot trigger a burst of retries.
    recent_alerts.append(now)
    try:
        proc = subprocess.run([
            # -f makes curl exit non-zero on an HTTP error response, so a
            # rejected request is not reported as delivered.
            "curl", "-s", "-f", "-X", "POST",
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={text}",
        ], timeout=10, capture_output=True)
    except (OSError, subprocess.SubprocessError) as e:
        log(f"telegram fail: {e}")
        return False
    if proc.returncode != 0:
        log(f"telegram fail: curl exit {proc.returncode}")
        return False
    return True


def _tmux_send(target, *keys):
    """Run ``tmux send-keys -t target keys...`` and return the completed process."""
    return subprocess.run(
        ["tmux", "send-keys", "-t", target, *keys],
        check=False, capture_output=True, timeout=TMUX_TIMEOUT,
    )


def dispatch_to_cc(session, msg):
    """Send a task to a CC tmux session.

    Types ``msg`` into window 0 of ``session``, waits one second, then sends
    Enter. Returns True when both tmux calls succeed, otherwise False.

    ``msg`` embeds text taken from the monitored log (request paths, error
    messages), so control characters are replaced with spaces and the text is
    sent with ``-l --`` so tmux neither parses it as options nor looks it up
    as key names.
    """
    safe_msg = _CONTROL_CHARS.sub(" ", msg)
    target = f"{session}:0"
    try:
        typed = _tmux_send(target, "-l", "--", safe_msg)
        if typed.returncode == 0:
            time.sleep(1)  # pause between typing the text and submitting it
            typed = _tmux_send(target, "Enter")
    except (OSError, subprocess.SubprocessError) as e:
        log(f"dispatch fail to {session}: {e}")
        return False
    if typed.returncode != 0:
        detail = typed.stderr.decode("utf-8", "replace").strip()
        log(f"dispatch fail to {session}: tmux exit {typed.returncode} {detail}")
        return False
    log(f"dispatched to {session}: {safe_msg[:80]}")
    record_decision({"action": "dispatch", "target": session, "msg": safe_msg[:200]})
    return True


def record_decision(obj):
    """Append ``obj`` as one JSON line to TRIAGE_DECISIONS.

    A ``ts`` key holding the current local time is added to ``obj`` in place.
    Write failures are logged rather than raised.
    """
    obj["ts"] = datetime.now().isoformat(timespec='seconds')
    try:
        with open(TRIAGE_DECISIONS, "a", encoding="utf-8") as f:
            f.write(json.dumps(obj) + "\n")
    except OSError as e:
        log(f"decision record fail: {e}")


def pattern_count(key, since=None):
    """Return how many occurrences of ``key`` fall within the sliding window.

    ``since`` is the cutoff as a ``time.time()`` timestamp and defaults to
    PATTERN_WIN seconds ago. Older entries are discarded; a key left with no
    entries is removed so the table does not accumulate dead keys.
    """
    if since is None:
        since = time.time() - PATTERN_WIN
    # .get() avoids creating an entry just by asking about an unknown key.
    dq = recent_patterns.get(key)
    if dq is None:
        return 0
    while dq and dq[0] < since:
        dq.popleft()
    if not dq:
        del recent_patterns[key]
        return 0
    return len(dq)


def _sweep_patterns(now):
    """Drop keys with no events in the last PATTERN_WIN seconds.

    Runs at most once per PATTERN_WIN seconds. Keys are built from request
    paths, so without this every distinct path would stay in memory forever.
    """
    global _last_sweep
    if now - _last_sweep < PATTERN_WIN:
        return
    _last_sweep = now
    cutoff = now - PATTERN_WIN
    stale = [k for k, dq in recent_patterns.items() if not dq or dq[-1] < cutoff]
    for k in stale:
        del recent_patterns[k]


def add_pattern(key):
    """Record one occurrence of ``key`` at the current time."""
    now = time.time()
    _sweep_patterns(now)
    recent_patterns[key].append(now)


def _as_number(value):
    """Return ``value`` as a float, or None when it is not numeric.

    None and other falsy values count as 0.
    """
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return None


def classify(line):
    """Classify one JSON log line and alert/dispatch when a pattern fires.

    Returns a ``(kind, count, detail)`` tuple when a pattern fired, otherwise
    None. Lines that are not a JSON object are ignored. The ``code``,
    ``path``, ``method`` and ``msg`` fields are coerced to strings, so numeric
    status codes and null fields are accepted.
    """
    try:
        ev = json.loads(line)
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        return None
    if not isinstance(ev, dict):
        return None

    msg = str(ev.get("msg") or "")
    code = str(ev.get("code") or "")
    path = str(ev.get("path") or "")
    method = str(ev.get("method") or "")

    # ─── Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 3:
            telegram(f"⚠️ 5xx spike: {method} {path} → {code} (×{n}/60s)")
            dispatch_to_cc(
                "cc4",
                f"5xx detected: {method} {path} {code} occurring {n}x in 60s. "
                "Investigate /opt/pgz-sport/routers/ for the route handler. "
                "Check DB connection, log traceback. Run smoke test. "
                "Fix + restart pgz-sport + verify resolved.",
            )
            recent_patterns.pop(key, None)  # reset after dispatch
            return ("5xx_spike", n, path)

    # ─── Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc(
                "cc2",
                f"Auth spike: {code} on {path} ×{n} times in 60s. "
                "Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. "
                "Verify role-based access control. "
                "Smoke test 3 demo accounts.",
            )
            recent_patterns.pop(key, None)
            return ("auth_spike", n, path)

    # ─── Pattern C: 4xx on consumer endpoints (frontend bug)
    if (
        code.startswith("4")
        and code not in ("401", "403")
        and path.startswith("/sport/api/")
    ):
        key = f"4xx_api:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
            dispatch_to_cc(
                "cc3",
                f"Frontend bug: {path} returning {code} ×{n}x. "
                "Frontend may call wrong URL or send bad payload. "
                f"Check static/*.html for fetch/api() calls to {path}. "
                "Verify request shape matches backend schema.",
            )
            recent_patterns.pop(key, None)
            return ("4xx_api", n, path)

    # ─── Pattern D: ImportError / AttributeError / SyntaxError in the backend
    # NOTE(review): unlike the spike patterns, every matching line triggers a
    # dispatch; only the telegram alert is rate limited.
    for pattern, kind in _CRIT_PATTERNS:
        if pattern.search(msg):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            dispatch_to_cc(
                "cc4",
                f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. "
                "Identify file:line, fix, py_compile, restart, verify. "
                "If db_connect_error, check Server B (10.10.0.2:6432) connectivity.",
            )
            return (kind, 1, msg[:80])

    # ─── Pattern E: Empty page detection
    # NOTE(review): the guard looks for the text "size_download" anywhere in
    # the event but the value compared is ev["size"] — confirm which field the
    # log producer actually writes.
    if code == "200" and "size_download" in str(ev):
        size = _as_number(ev.get("size", 0))
        if size is not None and size < 100:
            key = f"empty:{path}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 2:
                telegram(f"📄 Empty page: {path}")
                dispatch_to_cc(
                    "cc3",
                    f"Empty page detected: {path} returning <100 bytes. "
                    f"Check static/{path.split('/')[-1]} or backend response.",
                )
                recent_patterns.pop(key, None)
                # Reported like every other pattern so the caller logs it.
                return ("empty_page", n, path)

    return None


def _handle_line(line):
    """Classify one complete log line and log the pattern that fired, if any."""
    try:
        result = classify(line)
    except Exception as e:
        # Daemon boundary: one bad event must not stop the monitor.
        log(f"classify error: {e!r}")
        return
    if result:
        log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")


def _tail_until_rotated(f, path, start_at_end):
    """Feed complete lines from ``f`` to the classifier until the file rotates.

    Returns when ``path`` disappears, refers to a different inode than ``f``,
    or becomes shorter than the current read offset (truncation). ``f`` must
    be opened in binary mode so that ``tell()`` is a byte offset.
    """
    if start_at_end:
        f.seek(0, 2)  # EOF
    inode = os.fstat(f.fileno()).st_ino
    pending = b""
    while True:
        chunk = f.readline()
        if chunk:
            pending += chunk
            if pending.endswith(b"\n"):
                _handle_line(pending.decode("utf-8", errors="replace"))
                pending = b""
            # Otherwise the writer is mid-line; keep the fragment and wait
            # for the rest instead of classifying half a JSON object.
            continue
        try:
            st = os.stat(path)
        except FileNotFoundError:
            return
        if st.st_ino != inode or st.st_size < f.tell():
            return
        time.sleep(0.5)


def follow(path):
    """Tail -F equivalent.

    Waits for ``path`` to exist, starts at its current end, and processes
    each complete line appended afterwards. When the file is rotated or
    truncated it is reopened and read from the beginning. Never returns.
    """
    start_at_end = True
    while True:
        while not path.exists():
            time.sleep(1)
        try:
            with open(path, "rb") as f:
                _tail_until_rotated(f, path, start_at_end)
        except FileNotFoundError:
            # The file vanished between exists() and open(); wait again.
            continue
        # After a rotation the new file is read from its first line.
        start_at_end = False


if __name__ == "__main__":
    log("auto_triage starting")
    log(f"watching {LOG_FILE}")
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        log("shutdown")