DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard
PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug
PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)
PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: 6 telegram/5min
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service
PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML refresh 5s
Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
This commit is contained in:
Binary file not shown.
|
After Width: | Height: | Size: 39 KiB |
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,170 @@
|
|||||||
|
"""Debug observability dashboard endpoint."""
|
||||||
|
import json, os, time
|
||||||
|
from pathlib import Path
|
||||||
|
from fastapi import APIRouter, Query
|
||||||
|
from fastapi.responses import JSONResponse, HTMLResponse, PlainTextResponse
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/debug", tags=["debug"])
|
||||||
|
|
||||||
|
LOGDIR = Path("/var/log/pgz-sport-debug")
|
||||||
|
|
||||||
|
@router.get("/health")
def debug_health():
    """Return a quick health snapshot of the stack.

    The response dict contains:
      - ``ts``: current Unix time.
      - ``services``: ``systemctl is-active`` output per unit, or
        ``"error:<exc>"`` when the call itself failed.
      - ``db``: ``"ok"``, ``"unconfigured"`` (no DSN in the environment) or
        ``"error:<exc>"``.
      - ``total_errors_logged``: number of lines in ``errors.jsonl``
        (0 when the file does not exist).
      - ``log_dir``: the directory the log files are read from.
    """
    import subprocess

    services = ['pgz-sport', 'pgz-debug-tail', 'pgz-auto-triage', 'nginx', 'redis-server']
    status = {}
    for s in services:
        try:
            r = subprocess.run(['systemctl', 'is-active', s], capture_output=True, text=True, timeout=2)
            status[s] = r.stdout.strip()
        except Exception as e:
            status[s] = f"error:{e}"

    # DB — the DSN (including the password) must come from the environment,
    # never from source control.
    dsn = os.environ.get("PGZ_DEBUG_DB_DSN")
    if not dsn:
        db_status = "unconfigured"
    else:
        try:
            import psycopg2
            conn = psycopg2.connect(dsn, connect_timeout=2)
            try:
                with conn.cursor() as cur:
                    cur.execute("SELECT 1")
                db_status = "ok"
            finally:
                # psycopg2's `with conn:` only ends the transaction; close
                # explicitly so each health poll does not hold a connection.
                conn.close()
        except Exception as e:
            db_status = f"error:{e}"

    # Recent errors count — binary mode so undecodable bytes cannot raise.
    err_count = 0
    try:
        with open(LOGDIR / "errors.jsonl", "rb") as f:
            err_count = sum(1 for _ in f)
    except FileNotFoundError:
        pass

    return {
        "ts": time.time(),
        "services": status,
        "db": db_status,
        "total_errors_logged": err_count,
        "log_dir": str(LOGDIR),
    }
|
||||||
|
|
||||||
|
def _tail_jsonl(path, limit):
    """Parse the last *limit* non-blank lines of a JSONL file.

    The file is streamed through a bounded deque, so memory use depends on
    *limit* rather than on the size of the log.

    Returns a list of decoded JSON values (lines that are not valid JSON are
    skipped), or ``None`` when *path* does not exist.
    """
    from collections import deque

    try:
        with open(path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        return None

    parsed = []
    for ln in tail:
        try:
            parsed.append(json.loads(ln))
        except ValueError:  # json.JSONDecodeError subclasses ValueError
            continue
    return parsed


@router.get("/errors")
def recent_errors(limit: int = Query(100, ge=1, le=1000)):
    """Last N errors from errors.jsonl."""
    parsed = _tail_jsonl(LOGDIR / "errors.jsonl", limit)
    if parsed is None:
        return {"errors": [], "note": "errors.jsonl not yet created"}
    return {"errors": parsed, "count": len(parsed)}


@router.get("/decisions")
def triage_decisions(limit: int = Query(50, ge=1, le=500)):
    """Last N auto-triage decisions."""
    parsed = _tail_jsonl(LOGDIR / "triage_decisions.jsonl", limit)
    if parsed is None:
        return {"decisions": [], "note": "no decisions yet"}
    return {"decisions": parsed, "count": len(parsed)}


@router.get("/stream")
def stream_tail(lines: int = Query(200, ge=10, le=2000)):
    """Last N lines of full stream.jsonl."""
    parsed = _tail_jsonl(LOGDIR / "stream.jsonl", lines)
    if parsed is None:
        return {"stream": []}
    return {"stream": parsed}
|
||||||
|
|
||||||
|
@router.get("/dashboard", response_class=HTMLResponse)
def dashboard():
    """Live HTML dashboard.

    Returns a static page whose script polls the health, decisions and
    errors endpoints every 5 seconds. Every value taken from those JSON
    responses is HTML-escaped before it is written to ``innerHTML``, because
    log fields such as request paths and messages can be attacker-controlled.
    """
    return """<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>PGŽ Debug Live</title>
<style>
body{font-family:'JetBrains Mono',monospace;background:#0a0a0c;color:#e0e0e0;margin:0;padding:20px}
h1{color:#FFD700;font-size:18px;margin:0 0 18px}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:20px}
.card{background:#1a1a1e;border:1px solid #2a2a2e;border-radius:6px;padding:16px}
.card h2{color:#FFD700;font-size:13px;margin:0 0 10px;text-transform:uppercase;letter-spacing:.5px}
.kv{font-size:12px;line-height:1.6}
.kv span:first-child{color:#888;display:inline-block;width:160px}
.ok{color:#3a9}
.err{color:#e55}
.warn{color:#fa3}
pre{font-size:11px;background:#0e0e10;padding:8px;border-radius:4px;max-height:400px;overflow:auto;border:1px solid #2a2a2e}
.row{padding:6px 0;border-bottom:1px solid #2a2a2e;font-size:11px}
.row:last-child{border-bottom:0}
.ts{color:#666}
.lvl-ERROR{color:#e55}
.lvl-WARN{color:#fa3}
.lvl-CRITICAL{color:#f00;font-weight:bold}
.refresh{color:#666;font-size:10px;float:right}
</style></head>
<body>
<h1>🩺 PGŽ Sport · Live Debug Dashboard <span class="refresh">refresh: 5s</span></h1>
<div class="grid">
<div class="card">
<h2>Service Health</h2>
<div id="health" class="kv">loading…</div>
</div>
<div class="card">
<h2>Auto-Triage Decisions</h2>
<div id="decisions">loading…</div>
</div>
<div class="card" style="grid-column:1/-1">
<h2>Recent Errors (live)</h2>
<div id="errors"><pre>loading…</pre></div>
</div>
</div>
<script>
// Escape text before it is interpolated into innerHTML.
const esc = s => String(s).replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]));
async function refresh(){
  // Health
  const h = await fetch('/sport/api/debug/health').then(r=>r.json());
  let html = '';
  for (const [k,v] of Object.entries(h.services||{})){
    const cls = v==='active'?'ok':'err';
    html += `<div><span>${esc(k)}</span><span class="${cls}">${esc(v)}</span></div>`;
  }
  html += `<div><span>db</span><span class="${h.db==='ok'?'ok':'err'}">${esc(h.db)}</span></div>`;
  html += `<div><span>total_errors</span><span>${esc(h.total_errors_logged)}</span></div>`;
  document.getElementById('health').innerHTML = html;

  // Decisions
  const d = await fetch('/sport/api/debug/decisions?limit=10').then(r=>r.json());
  let dh = '';
  if (!d.decisions || d.decisions.length===0) dh = '<div class="row" style="color:#666">no auto-fixes triggered yet</div>';
  for (const x of (d.decisions||[]).reverse()){
    dh += `<div class="row"><span class="ts">${esc(String(x.ts||'').substring(11,19))}</span> <b>${esc(x.action)}</b> → ${esc(x.target)}: ${esc(String(x.msg||'').substring(0,120))}</div>`;
  }
  document.getElementById('decisions').innerHTML = dh;

  // Errors
  const e = await fetch('/sport/api/debug/errors?limit=30').then(r=>r.json());
  let eh = '';
  for (const x of (e.errors||[]).reverse()){
    const cls = `lvl-${esc(x.level||'INFO')}`;
    eh += `<div class="row"><span class="ts">${esc(String(x.ts||'').substring(11,19))}</span> <span class="${cls}">[${esc(x.level||'?')}]</span> <span style="color:#aaa">${esc(x.src||'?')}</span> ${esc(x.code||'')} ${esc(x.path||'')} ${esc(String(x.msg||'').substring(0,140))}</div>`;
  }
  document.getElementById('errors').innerHTML = eh || '<div class="row" style="color:#666">No errors</div>';
}
// A failed poll is logged to the console instead of becoming an unhandled rejection.
refresh().catch(console.error);
setInterval(() => refresh().catch(console.error), 5000);
</script>
</body></html>"""
|
||||||
@@ -1322,8 +1322,17 @@ def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: O
|
|||||||
params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
|
params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
|
||||||
|
|
||||||
params.append(eid)
|
params.append(eid)
|
||||||
cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
|
try:
|
||||||
after = dict(cur.fetchone())
|
cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
|
||||||
|
after = dict(cur.fetchone())
|
||||||
|
except psycopg2.errors.UniqueViolation as _uve:
|
||||||
|
# Race condition — fetch existing row instead
|
||||||
|
conn.rollback()
|
||||||
|
cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
|
||||||
|
row = cur.fetchone()
|
||||||
|
after = dict(row) if row else {}
|
||||||
|
import logging as _lg
|
||||||
|
_lg.getLogger("enrich").info(f"UniqueViolation race avoided table={table} id={eid}")
|
||||||
|
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""INSERT INTO pgz_sport.enrichment_log
|
"""INSERT INTO pgz_sport.enrichment_log
|
||||||
|
|||||||
@@ -0,0 +1,31 @@
|
|||||||
|
[
|
||||||
|
"Gradovi_u_Hrvatskoj",
|
||||||
|
"Hrvatski_otoci",
|
||||||
|
"Planine_u_Hrvatskoj",
|
||||||
|
"Rijeke_u_Hrvatskoj",
|
||||||
|
"Primorsko-goranska_županija",
|
||||||
|
"Naselja_u_Primorsko-goranskoj_županiji",
|
||||||
|
"Hrvatski_političari",
|
||||||
|
"Hrvatski_športaši",
|
||||||
|
"Hrvatski_glazbenici",
|
||||||
|
"Hrvatski_književnici",
|
||||||
|
"Hrvatski_glumci",
|
||||||
|
"Hrvatska_povijest",
|
||||||
|
"Hrvatska_kuhinja",
|
||||||
|
"Hrvatska_kultura",
|
||||||
|
"Domovinski_rat",
|
||||||
|
"Gospodarstvo_Hrvatske",
|
||||||
|
"Hrvatski_nogometni_klubovi",
|
||||||
|
"Hrvatski_košarkaški_klubovi",
|
||||||
|
"Hrvatski_rukometni_klubovi",
|
||||||
|
"Hrvatski_odbojkaški_klubovi",
|
||||||
|
"Hrvatske_političke_stranke",
|
||||||
|
"Rijeka",
|
||||||
|
"Krk",
|
||||||
|
"Cres",
|
||||||
|
"Lošinj",
|
||||||
|
"Rab",
|
||||||
|
"Pag",
|
||||||
|
"Učka",
|
||||||
|
"HNK_Rijeka"
|
||||||
|
]
|
||||||
@@ -18,40 +18,35 @@ API = "https://hr.wikipedia.org/w/api.php"
|
|||||||
|
|
||||||
# Kategorije — širok HR knowledge bazu
|
# Kategorije — širok HR knowledge bazu
|
||||||
CATEGORIES = [
|
CATEGORIES = [
|
||||||
"Hrvatski_gradovi",
|
"Gradovi_u_Hrvatskoj",
|
||||||
"Hrvatske_općine",
|
|
||||||
"Hrvatski_otoci",
|
"Hrvatski_otoci",
|
||||||
"Hrvatske_planine",
|
"Planine_u_Hrvatskoj",
|
||||||
"Hrvatske_rijeke",
|
"Rijeke_u_Hrvatskoj",
|
||||||
"Primorsko-goranska_županija",
|
"Primorsko-goranska_županija",
|
||||||
"Naselja_u_Primorsko-goranskoj_županiji",
|
"Naselja_u_Primorsko-goranskoj_županiji",
|
||||||
"Hrvatski_političari",
|
"Hrvatski_političari",
|
||||||
"Hrvatski_sportaši",
|
"Hrvatski_športaši",
|
||||||
"Hrvatski_glazbenici",
|
"Hrvatski_glazbenici",
|
||||||
"Hrvatski_pisci",
|
"Hrvatski_književnici",
|
||||||
"Hrvatski_glumci",
|
"Hrvatski_glumci",
|
||||||
"Hrvatska_povijest",
|
"Hrvatska_povijest",
|
||||||
"Hrvatska_arhitektura",
|
|
||||||
"Hrvatska_kuhinja",
|
"Hrvatska_kuhinja",
|
||||||
"Hrvatska_kultura",
|
"Hrvatska_kultura",
|
||||||
"Hrvatska_znanost",
|
|
||||||
"Domovinski_rat",
|
"Domovinski_rat",
|
||||||
"Hrvatska_ekonomija",
|
"Gospodarstvo_Hrvatske",
|
||||||
"Hrvatski_klubovi",
|
|
||||||
"Hrvatski_nogometni_klubovi",
|
"Hrvatski_nogometni_klubovi",
|
||||||
"Hrvatski_košarkaški_klubovi",
|
"Hrvatski_košarkaški_klubovi",
|
||||||
|
"Hrvatski_rukometni_klubovi",
|
||||||
|
"Hrvatski_odbojkaški_klubovi",
|
||||||
"Hrvatske_političke_stranke",
|
"Hrvatske_političke_stranke",
|
||||||
"Predsjednici_Hrvatske",
|
|
||||||
"Premijeri_Hrvatske",
|
|
||||||
"Rijeka",
|
"Rijeka",
|
||||||
"Kvarner",
|
|
||||||
"Krk",
|
"Krk",
|
||||||
"Cres",
|
"Cres",
|
||||||
"Lošinj",
|
"Lošinj",
|
||||||
"Rab",
|
"Rab",
|
||||||
"Pag",
|
"Pag",
|
||||||
"Učka",
|
"Učka",
|
||||||
"Risnjak",
|
"HNK_Rijeka"
|
||||||
]
|
]
|
||||||
|
|
||||||
def api_get(params):
|
def api_get(params):
|
||||||
|
|||||||
Executable
+185
@@ -0,0 +1,185 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
auto_triage.py — Active error monitor za pgz-sport stack.
|
||||||
|
|
||||||
|
Tail-a /var/log/pgz-sport-debug/errors.jsonl, klasificira greške,
|
||||||
|
i automatski dispatcha tasks na CC agente kad detektira pattern.
|
||||||
|
|
||||||
|
Patterns:
|
||||||
|
- Recurring 5xx → CC4 (backend)
|
||||||
|
- 401/403 spike → CC2 (auth)
|
||||||
|
- 4xx na specifičnoj stranici → CC3 (frontend route)
|
||||||
|
- DB connection error → CC4 + telegram urgent
|
||||||
|
- ImportError/AttributeError u pgz-sport → CC4 dispatch + restart attempt
|
||||||
|
"""
|
||||||
|
import json, os, re, time, subprocess, sys
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# Telegram credentials are read from the environment (e.g. a systemd
# EnvironmentFile) so the bot token is never committed to source control.
# NOTE(review): the token that was previously hard-coded here has been
# published in the repository history and should be revoked and reissued.
TG_TOKEN = os.environ.get("PGZ_TG_TOKEN", "")
TG_CHAT = os.environ.get("PGZ_TG_CHAT", "7969491558")

# Rate limit: at most RATE_MAX Telegram messages per RATE_WIN seconds.
RATE_WIN = 300  # seconds
RATE_MAX = 6
recent_alerts = deque()  # send timestamps inside the current rate window

# Pattern counts (sliding window)
PATTERN_WIN = 60  # 60s window
recent_patterns = defaultdict(deque)  # pattern key -> occurrence timestamps
|
||||||
|
|
||||||
|
def log(msg):
    """Append a timestamped line to TRIAGE_LOG and echo it to stdout.

    The file is opened as UTF-8 explicitly: messages contain non-ASCII
    characters (×, →, emoji), which would raise UnicodeEncodeError under an
    ASCII default locale.
    """
    ts = datetime.now().isoformat(timespec='seconds')
    entry = f"[{ts}] {msg}"
    with open(TRIAGE_LOG, "a", encoding="utf-8") as f:
        f.write(entry + "\n")
    print(entry, flush=True)
|
||||||
|
|
||||||
|
def telegram(text):
    """Send *text* to the configured Telegram chat, subject to the rate limit.

    At most RATE_MAX attempts are made per RATE_WIN seconds; attempts beyond
    that are logged and dropped.

    The request is made in-process rather than through ``curl`` so the bot
    token never appears in the process list, and so an HTTP failure is
    reported instead of being silently treated as success.

    Returns True only when the API answered with a 2xx status; False when
    rate limited or when the request failed.
    """
    import urllib.parse
    import urllib.request

    now = time.time()
    # Drop timestamps that have aged out of the rate window.
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    # Count the attempt before sending so failing sends are throttled too.
    recent_alerts.append(now)
    try:
        payload = urllib.parse.urlencode({"chat_id": TG_CHAT, "text": text}).encode("utf-8")
        req = urllib.request.Request(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data=payload,  # a request body makes this a POST
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            return 200 <= resp.status < 300
    except Exception as e:
        log(f"telegram fail: {e}")
        return False
|
||||||
|
|
||||||
|
def dispatch_to_cc(session, msg):
    """Type *msg* into window 0 of the tmux session *session* and press Enter.

    The text is sent with ``send-keys -l`` (literal) after collapsing control
    characters to spaces, so log-derived content cannot be interpreted as key
    names or inject its own newline. A decision record is written only when
    both tmux calls exit with status 0.

    NOTE(review): *msg* embeds request paths and log messages that may be
    attacker-controlled, and the receiving agent treats it as instructions.
    Consider stricter sanitising upstream.

    Returns True on success, False when tmux failed or raised.
    """
    target = f"{session}:0"
    flat = re.sub(r"[\x00-\x1f\x7f]+", " ", msg)
    try:
        typed = subprocess.run(
            ["tmux", "send-keys", "-t", target, "-l", "--", flat],
            check=False, capture_output=True, timeout=10,
        )
        if typed.returncode != 0:
            detail = typed.stderr.decode("utf-8", errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {typed.returncode}: {detail}")
            return False
        time.sleep(1)  # let the pane take the text before submitting it
        entered = subprocess.run(
            ["tmux", "send-keys", "-t", target, "Enter"],
            check=False, capture_output=True, timeout=10,
        )
        if entered.returncode != 0:
            detail = entered.stderr.decode("utf-8", errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {entered.returncode}: {detail}")
            return False
        log(f"dispatched to {session}: {msg[:80]}")
        record_decision({"action": "dispatch", "target": session, "msg": msg[:200]})
        return True
    except Exception as e:
        log(f"dispatch fail to {session}: {e}")
        return False
|
||||||
|
|
||||||
|
def record_decision(obj):
    """Stamp *obj* with the current local time and append it to TRIAGE_DECISIONS.

    The dict is modified in place (a ``"ts"`` key is set) and then written as
    a single JSON line.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    obj["ts"] = stamp
    with open(TRIAGE_DECISIONS, "a") as sink:
        sink.write(f"{json.dumps(obj)}\n")
|
||||||
|
|
||||||
|
def pattern_count(key, since=None):
    """Return how many occurrences of *key* fall inside the sliding window.

    Timestamps older than *since* are discarded from the front of the key's
    deque. *since* defaults to ``now - PATTERN_WIN``.
    """
    cutoff = time.time() - PATTERN_WIN if since is None else since
    window = recent_patterns[key]
    while window:
        if window[0] >= cutoff:
            break
        window.popleft()
    return len(window)
|
||||||
|
|
||||||
|
def add_pattern(key):
    """Record one occurrence of *key*, timestamped with the current time."""
    occurrences = recent_patterns[key]
    occurrences.append(time.time())
|
||||||
|
|
||||||
|
def classify(line):
    """Classify one JSONL event line and trigger alerts/dispatches on a pattern.

    *line* is expected to be a JSON object; the ``msg``, ``code``, ``path``
    and ``method`` fields are read and coerced to strings, so numeric or null
    values cannot crash the daemon.

    Returns ``(kind, count, detail)`` when a pattern fired, otherwise None
    (including for unparseable or non-object lines).
    """
    try:
        ev = json.loads(line)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        return None
    if not isinstance(ev, dict):
        return None

    msg = str(ev.get("msg") or "")
    code = str(ev.get("code") or "")
    path = str(ev.get("path") or "")
    method = str(ev.get("method") or "")

    # ─── Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 3:
            telegram(f"⚠️ 5xx spike: {method} {path} → {code} (×{n}/60s)")
            dispatch_to_cc("cc4", f"5xx detected: {method} {path} {code} occurring {n}x in 60s. Investigate /opt/pgz-sport/routers/ for the route handler. Check DB connection, log traceback. Run smoke test. Fix + restart pgz-sport + verify resolved.")
            recent_patterns[key].clear()  # reset after dispatch
            return ("5xx_spike", n, path)

    # ─── Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc("cc2", f"Auth spike: {code} on {path} ×{n} times in 60s. Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. Verify role-based access control. Smoke test 3 demo accounts.")
            recent_patterns[key].clear()
            return ("auth_spike", n, path)

    # ─── Pattern C: 4xx on consumer endpoints (frontend bug)
    if code.startswith("4") and code not in ("401", "403"):
        if path.startswith("/sport/api/"):
            key = f"4xx_api:{path[:80]}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 5:
                telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
                dispatch_to_cc("cc3", f"Frontend bug: {path} returning {code} ×{n}x. Frontend may call wrong URL or send bad payload. Check static/*.html for fetch/api() calls to {path}. Verify request shape matches backend schema.")
                recent_patterns[key].clear()
                return ("4xx_api", n, path)

    # ─── Pattern D: ImportError / AttributeError / SyntaxError in the backend
    # NOTE(review): "asyncpg|psycopg2.*OperationalError" matches any message
    # that merely mentions asyncpg, because "|" binds loosest — confirm that
    # is intended.
    crit_patterns = [
        (r"ImportError|ModuleNotFoundError", "import_error"),
        (r"AttributeError", "attribute_error"),
        (r"SyntaxError", "syntax_error"),
        (r"OperationalError.*could not connect", "db_connect_error"),
        (r"asyncpg|psycopg2.*OperationalError", "db_pool_error"),
        (r"FATAL|CRITICAL", "fatal"),
    ]
    for pat, kind in crit_patterns:
        if re.search(pat, msg, re.I):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            # Every critical kind, DB-related or not, goes to cc4.
            dispatch_to_cc("cc4", f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. Identify file:line, fix, py_compile, restart, verify. If db_connect_error, check Server B (10.10.0.2:6432) connectivity.")
            return (kind, 1, msg[:80])

    # ─── Pattern E: Empty page detection
    # Read the size from the field that is actually present; previously the
    # code tested for "size_download" but compared "size".
    raw_size = ev.get("size_download", ev.get("size"))
    try:
        size = float(raw_size)
    except (TypeError, ValueError):
        size = None
    if code == "200" and size is not None and size < 100:
        key = f"empty:{path}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 2:
            telegram(f"📄 Empty page: {path}")
            dispatch_to_cc("cc3", f"Empty page detected: {path} returning <100 bytes. Check static/{path.split('/')[-1]} or backend response.")
            recent_patterns[key].clear()
            return ("empty_page", n, path)

    return None
|
||||||
|
|
||||||
|
def follow(path):
    """Follow *path* like ``tail -F`` and feed each complete line to classify().

    Waits for the file to exist, starts reading at its end, and then loops
    forever:
      - a line is processed only once its trailing newline has arrived, so a
        half-written JSON record is never consumed;
      - when the file is replaced (different inode) or truncated (shorter
        than the read position), it is reopened and read from the start;
      - an exception raised by classify() is logged and the loop continues.
    """
    while not path.exists():
        time.sleep(1)

    # Binary mode keeps tell() a true byte offset for the truncation check.
    f = open(path, "rb")
    f.seek(0, os.SEEK_END)  # EOF
    pending = b""
    try:
        while True:
            chunk = f.readline()
            if not chunk:
                time.sleep(0.5)
                try:
                    st = os.stat(path)
                    rotated = st.st_ino != os.fstat(f.fileno()).st_ino
                    truncated = st.st_size < f.tell()
                    if rotated or truncated:
                        replacement = open(path, "rb")
                        f.close()
                        f = replacement
                        pending = b""
                except OSError:
                    # File briefly missing mid-rotation; retry on the next pass.
                    pass
                continue

            pending += chunk
            if not pending.endswith(b"\n"):
                continue  # writer is mid-line; wait for the rest
            line = pending.decode("utf-8", errors="replace")
            pending = b""

            try:
                result = classify(line)
            except Exception as e:
                log(f"classify fail: {e}")
                continue
            if result:
                log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")
    finally:
        f.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Announce startup, then block in the tail loop until interrupted.
    for banner in ("auto_triage starting", f"watching {LOG_FILE}"):
        log(banner)
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        log("shutdown")
|
||||||
Executable
+59
@@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
# Tail journalctl + nginx logs into a structured JSONL stream.
LOGDIR=/var/log/pgz-sport-debug
mkdir -p "$LOGDIR"

# Print $1 as a JSON string literal (surrounding quotes included).
# Invalid UTF-8 is replaced rather than aborting, so every record stays valid JSON.
json_escape() {
    printf '%s' "$1" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.buffer.read().decode("utf-8", "replace").strip()))'
}

re_error='ERROR|Exception|Traceback|CRITICAL|FATAL'
re_warn='WARNING|WARN'
re_debug='DEBUG'

# Tail journalctl
journalctl -u pgz-sport -f -n 0 --output=cat 2>/dev/null | while IFS= read -r line; do
    ts=$(date -Iseconds)
    level="INFO"

    # Classification. The last matching test wins (DEBUG over WARN over ERROR),
    # which is the existing precedence.
    # NOTE(review): a line containing both ERROR and DEBUG is tagged DEBUG and
    # never reaches errors.jsonl — confirm this is intended.
    if [[ "$line" =~ $re_error ]]; then level="ERROR"; fi
    if [[ "$line" =~ $re_warn ]]; then level="WARN"; fi
    if [[ "$line" =~ $re_debug ]]; then level="DEBUG"; fi

    # JSON-escape
    safe=$(json_escape "$line")
    printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"pgz-sport\",\"level\":\"$level\",\"msg\":$safe}" >> "$LOGDIR/stream.jsonl"
done &
JPID=$!
echo "$JPID" > "$LOGDIR/journalctl_tail.pid"

# Tail nginx error log
tail -F /var/log/nginx/sport.error.log 2>/dev/null | while IFS= read -r line; do
    ts=$(date -Iseconds)
    safe=$(json_escape "$line")
    printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx\",\"level\":\"ERROR\",\"msg\":$safe}" >> "$LOGDIR/stream.jsonl"
done &
NPID=$!
echo "$NPID" > "$LOGDIR/nginx_tail.pid"

# Tail nginx access log for 4xx/5xx
tail -F /var/log/nginx/sport.access.log 2>/dev/null | while IFS= read -r line; do
    # parse: the status code is the 9th field (combined log format)
    code=$(printf '%s\n' "$line" | awk '{print $9}')
    if [[ "$code" =~ ^[45][0-9][0-9]$ ]]; then
        ts=$(date -Iseconds)
        method=$(printf '%s\n' "$line" | awk '{print $6}' | tr -d '"')
        path=$(printf '%s\n' "$line" | awk '{print $7}')
        # method and path come from the client. nginx logs special characters
        # as \xNN, which is not a valid JSON escape, so escape them properly.
        jmethod=$(json_escape "$method")
        jpath=$(json_escape "$path")
        safe=$(json_escape "$line")
        printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx-access\",\"level\":\"WARN\",\"code\":\"$code\",\"method\":$jmethod,\"path\":$jpath,\"raw\":$safe}" >> "$LOGDIR/stream.jsonl"

        # ACTIVE ALERTING: on 5xx or 401/403, also log to the error feed
        if [[ "$code" =~ ^5 ]] || [[ "$code" == "401" ]] || [[ "$code" == "403" ]]; then
            printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx-access\",\"level\":\"ERROR\",\"code\":\"$code\",\"method\":$jmethod,\"path\":$jpath}" >> "$LOGDIR/errors.jsonl"
        fi
    fi
done &
APID=$!
echo "$APID" > "$LOGDIR/access_tail.pid"

# Copy ERROR-level records into a separate error file (the agents watch this one).
# --line-buffered: without it grep block-buffers when writing to a file, and
# errors would reach errors.jsonl only after several KB had accumulated.
tail -F "$LOGDIR/stream.jsonl" 2>/dev/null | grep --line-buffered -E "\"level\":\"(ERROR|CRITICAL|FATAL)\"" >> "$LOGDIR/errors.jsonl" &
EPID=$!
echo "$EPID" > "$LOGDIR/error_filter.pid"

echo "Debug tail running. PIDs: journalctl=$JPID nginx=$NPID access=$APID error_filter=$EPID"
echo "  stream.jsonl + errors.jsonl in $LOGDIR"
wait
|
||||||
Reference in New Issue
Block a user