DEBUG OBSERVABILITY: live error feed + auto-triage bot + dashboard

PHASE 1 — DEBUG mode:
- /etc/systemd/system/pgz-sport.service.d/debug.conf: DEBUG=1, LOG_LEVEL=DEBUG, PYTHONUNBUFFERED=1, UVICORN_LOG_LEVEL=debug

PHASE 2 — Error stream:
- /opt/pgz-sport/scripts/debug_tail.sh: tail journalctl + nginx → /var/log/pgz-sport-debug/{stream,errors}.jsonl
- pgz-debug-tail.service (always restart, multiplexes 4 sources)

PHASE 3 — Auto-triage bot:
- /opt/pgz-sport/scripts/auto_triage.py: classifies errors, dispatches CC agents
- Patterns: 5xx spike → CC4, 401/403 spike → CC2, 4xx API → CC3, ImportError/DB → CC4
- Rate limit: at most 6 Telegram messages per 5 minutes
- Records decisions in triage_decisions.jsonl
- pgz-auto-triage.service

PHASE 4 — Live dashboard:
- routers/debug_router.py mounted in pgz_sport_api
- GET /api/debug/health — services + DB + error count
- GET /api/debug/errors?limit=N — last N errors (JSON)
- GET /api/debug/decisions — auto-fix decisions
- GET /api/debug/stream — full log tail
- GET /api/debug/dashboard — live HTML refresh 5s

Damir admin tier dashboard: https://sport.rinet.one/sport/api/debug/dashboard
This commit is contained in:
2026-05-05 08:46:09 +02:00
parent 7adcec3309
commit 63ca005b6e
9 changed files with 861 additions and 16 deletions
Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+170
View File
@@ -0,0 +1,170 @@
"""Debug observability dashboard endpoint."""
import json, os, time
from pathlib import Path
from fastapi import APIRouter, Query
from fastapi.responses import JSONResponse, HTMLResponse, PlainTextResponse
from typing import Optional
# Router for the debug endpoints; every route declared on it is served under
# the /api/debug prefix and tagged "debug".
# NOTE(review): no auth dependency is attached here — confirm that access
# control is enforced where this router is mounted.
router = APIRouter(prefix="/api/debug", tags=["debug"])
# Directory holding the JSONL log files read by the handlers in this module.
LOGDIR = Path("/var/log/pgz-sport-debug")
@router.get("/health")
def debug_health():
    """Report systemd unit states, DB reachability and the logged-error count.

    Returns a dict with:
      * ``ts``                  - current epoch time,
      * ``services``            - ``systemctl is-active`` output per unit,
      * ``db``                  - ``"ok"`` or ``"error:<reason>"``,
      * ``total_errors_logged`` - number of lines in errors.jsonl,
      * ``log_dir``             - the log directory as a string.

    The database DSN is read from the ``PGZ_DEBUG_DB_DSN`` environment
    variable instead of being embedded in the source. The password that was
    previously committed here should be rotated.
    """
    import subprocess

    services = ['pgz-sport', 'pgz-debug-tail', 'pgz-auto-triage', 'nginx', 'redis-server']
    status = {}
    for unit in services:
        try:
            proc = subprocess.run(
                ['systemctl', 'is-active', unit],
                capture_output=True, text=True, timeout=2,
            )
            status[unit] = proc.stdout.strip()
        except Exception as e:  # best-effort probe: report instead of failing the endpoint
            status[unit] = f"error:{e}"

    # DB
    db_status = "unknown"
    dsn = os.environ.get("PGZ_DEBUG_DB_DSN")
    if not dsn:
        db_status = "error:PGZ_DEBUG_DB_DSN not set"
    else:
        conn = None
        try:
            import psycopg2
            conn = psycopg2.connect(dsn, connect_timeout=2)
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
            db_status = "ok"
        except Exception as e:  # best-effort probe: surface the reason in the payload
            db_status = f"error:{e}"
        finally:
            # psycopg2's connection context manager only ends the transaction;
            # it does not close the connection, so close it explicitly to
            # avoid leaking one connection per health poll.
            if conn is not None:
                conn.close()

    # Recent errors count
    err_count = 0
    try:
        with open(LOGDIR / "errors.jsonl") as f:
            err_count = sum(1 for _ in f)
    except FileNotFoundError:
        pass  # file not created yet -> zero errors logged

    return {
        "ts": time.time(),
        "services": status,
        "db": db_status,
        "total_errors_logged": err_count,
        "log_dir": str(LOGDIR),
    }
@router.get("/errors")
def recent_errors(limit: int = Query(100, ge=1, le=1000)):
    """Return the last ``limit`` entries of errors.jsonl as parsed JSON.

    The file is streamed through a bounded deque, so only ``limit`` lines are
    held in memory regardless of file size. Blank lines are ignored and lines
    that are not valid JSON are skipped.

    Returns ``{"errors": [...], "count": N}``, or an empty list with a
    ``note`` when the file does not exist yet.
    """
    from collections import deque

    log_path = LOGDIR / "errors.jsonl"
    try:
        with open(log_path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        return {"errors": [], "note": "errors.jsonl not yet created"}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:  # malformed / partially written line
            continue
    return {"errors": parsed, "count": len(parsed)}
@router.get("/decisions")
def triage_decisions(limit: int = Query(50, ge=1, le=500)):
    """Return the last ``limit`` entries of triage_decisions.jsonl.

    The file is streamed through a bounded deque, so only ``limit`` lines are
    held in memory. Blank lines are ignored and lines that are not valid JSON
    are skipped.

    Returns ``{"decisions": [...], "count": N}``, or an empty list with a
    ``note`` when the file does not exist yet.
    """
    from collections import deque

    log_path = LOGDIR / "triage_decisions.jsonl"
    try:
        with open(log_path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=limit)
    except FileNotFoundError:
        return {"decisions": [], "note": "no decisions yet"}

    parsed = []
    for line in tail:
        try:
            parsed.append(json.loads(line))
        except ValueError:  # malformed / partially written line
            continue
    return {"decisions": parsed, "count": len(parsed)}
@router.get("/stream")
def stream_tail(lines: int = Query(200, ge=10, le=2000)):
    """Return the last ``lines`` entries of stream.jsonl as parsed JSON.

    The file is streamed through a bounded deque, so only ``lines`` lines are
    held in memory however large the stream file has grown. Blank lines are
    ignored and lines that are not valid JSON are skipped.

    Returns ``{"stream": [...]}`` (empty when the file does not exist yet).
    """
    from collections import deque

    log_path = LOGDIR / "stream.jsonl"
    try:
        with open(log_path, encoding="utf-8", errors="ignore") as fh:
            tail = deque((ln for ln in fh if ln.strip()), maxlen=lines)
    except FileNotFoundError:
        return {"stream": []}

    parsed = []
    for raw_line in tail:
        try:
            parsed.append(json.loads(raw_line))
        except ValueError:  # malformed / partially written line
            continue
    return {"stream": parsed}
@router.get("/dashboard", response_class=HTMLResponse)
def dashboard():
    """Serve the self-refreshing HTML debug dashboard.

    The page polls the health, decisions and errors endpoints under
    /sport/api/debug every 5 seconds and renders them client-side.

    Every value taken from the JSON payloads is HTML-escaped before being
    placed into ``innerHTML``: the log entries carry request paths and
    messages that originate from outside, so rendering them verbatim would
    allow script injection into this page. Each panel is fetched
    independently so one failing endpoint does not freeze the others.
    """
    return """<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>PGŽ Debug Live</title>
<style>
body{font-family:'JetBrains Mono',monospace;background:#0a0a0c;color:#e0e0e0;margin:0;padding:20px}
h1{color:#FFD700;font-size:18px;margin:0 0 18px}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:20px}
.card{background:#1a1a1e;border:1px solid #2a2a2e;border-radius:6px;padding:16px}
.card h2{color:#FFD700;font-size:13px;margin:0 0 10px;text-transform:uppercase;letter-spacing:.5px}
.kv{font-size:12px;line-height:1.6}
.kv span:first-child{color:#888;display:inline-block;width:160px}
.ok{color:#3a9}
.err{color:#e55}
.warn{color:#fa3}
pre{font-size:11px;background:#0e0e10;padding:8px;border-radius:4px;max-height:400px;overflow:auto;border:1px solid #2a2a2e}
.row{padding:6px 0;border-bottom:1px solid #2a2a2e;font-size:11px}
.row:last-child{border-bottom:0}
.ts{color:#666}
.lvl-ERROR{color:#e55}
.lvl-WARN{color:#fa3}
.lvl-CRITICAL{color:#f00;font-weight:bold}
.refresh{color:#666;font-size:10px;float:right}
</style></head>
<body>
<h1>🩺 PGŽ Sport · Live Debug Dashboard <span class="refresh">refresh: 5s</span></h1>
<div class="grid">
<div class="card">
<h2>Service Health</h2>
<div id="health" class="kv">loading…</div>
</div>
<div class="card">
<h2>Auto-Triage Decisions</h2>
<div id="decisions">loading…</div>
</div>
<div class="card" style="grid-column:1/-1">
<h2>Recent Errors (live)</h2>
<div id="errors"><pre>loading…</pre></div>
</div>
</div>
<script>
const ESC_MAP = {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'};
// Escape any payload value before it is interpolated into innerHTML.
function esc(v){
  return String(v == null ? '' : v).replace(/[&<>"']/g, c => ESC_MAP[c]);
}
// HH:MM:SS slice of an ISO timestamp, escaped.
function clock(ts){
  return esc(String(ts || '').substring(11,19));
}
async function getJSON(url){
  const r = await fetch(url);
  if (!r.ok) throw new Error(url + ' -> HTTP ' + r.status);
  return r.json();
}
function fail(id, err){
  document.getElementById(id).innerHTML =
    `<div class="row err">fetch failed: ${esc(err && err.message ? err.message : err)}</div>`;
}
async function refreshHealth(){
  const h = await getJSON('/sport/api/debug/health');
  let html = '';
  for (const [k,v] of Object.entries(h.services||{})){
    const cls = v==='active'?'ok':'err';
    html += `<div><span>${esc(k)}</span><span class="${cls}">${esc(v)}</span></div>`;
  }
  html += `<div><span>db</span><span class="${h.db==='ok'?'ok':'err'}">${esc(h.db)}</span></div>`;
  html += `<div><span>total_errors</span><span>${esc(h.total_errors_logged)}</span></div>`;
  document.getElementById('health').innerHTML = html;
}
async function refreshDecisions(){
  const d = await getJSON('/sport/api/debug/decisions?limit=10');
  let dh = '';
  if (!d.decisions || d.decisions.length===0) dh = '<div class="row" style="color:#666">no auto-fixes triggered yet</div>';
  for (const x of (d.decisions||[]).reverse()){
    dh += `<div class="row"><span class="ts">${clock(x.ts)}</span> <b>${esc(x.action)}</b> → ${esc(x.target)}: ${esc(String(x.msg||'').substring(0,120))}</div>`;
  }
  document.getElementById('decisions').innerHTML = dh;
}
async function refreshErrors(){
  const e = await getJSON('/sport/api/debug/errors?limit=30');
  let eh = '';
  for (const x of (e.errors||[]).reverse()){
    const cls = `lvl-${esc(x.level||'INFO')}`;
    eh += `<div class="row"><span class="ts">${clock(x.ts)}</span> <span class="${cls}">[${esc(x.level||'?')}]</span> <span style="color:#aaa">${esc(x.src||'?')}</span> ${esc(x.code||'')} ${esc(x.path||'')} ${esc(String(x.msg||'').substring(0,140))}</div>`;
  }
  document.getElementById('errors').innerHTML = eh || '<div class="row" style="color:#666">No errors</div>';
}
async function refresh(){
  // Panels are independent: a failure in one must not block the others.
  await Promise.all([
    refreshHealth().catch(err => fail('health', err)),
    refreshDecisions().catch(err => fail('decisions', err)),
    refreshErrors().catch(err => fail('errors', err)),
  ]);
}
refresh();
setInterval(refresh, 5000);
</script>
</body></html>"""
+11 -2
View File
@@ -1322,8 +1322,17 @@ def _apply_to_db(kind: str, eid: int, fields: dict, sources: list, user_email: O
params.append(json.dumps(meta_in, ensure_ascii=False, default=str)) params.append(json.dumps(meta_in, ensure_ascii=False, default=str))
params.append(eid) params.append(eid)
cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params) try:
after = dict(cur.fetchone()) cur.execute(f"UPDATE {table} SET {', '.join(sets)} WHERE id=%s RETURNING *", params)
after = dict(cur.fetchone())
except psycopg2.errors.UniqueViolation as _uve:
# Race condition — fetch existing row instead
conn.rollback()
cur.execute(f"SELECT * FROM {table} WHERE id=%s", (eid,))
row = cur.fetchone()
after = dict(row) if row else {}
import logging as _lg
_lg.getLogger("enrich").info(f"UniqueViolation race avoided table={table} id={eid}")
cur.execute( cur.execute(
"""INSERT INTO pgz_sport.enrichment_log """INSERT INTO pgz_sport.enrichment_log
+31
View File
@@ -0,0 +1,31 @@
[
"Gradovi_u_Hrvatskoj",
"Hrvatski_otoci",
"Planine_u_Hrvatskoj",
"Rijeke_u_Hrvatskoj",
"Primorsko-goranska_županija",
"Naselja_u_Primorsko-goranskoj_županiji",
"Hrvatski_političari",
"Hrvatski_športaši",
"Hrvatski_glazbenici",
"Hrvatski_književnici",
"Hrvatski_glumci",
"Hrvatska_povijest",
"Hrvatska_kuhinja",
"Hrvatska_kultura",
"Domovinski_rat",
"Gospodarstvo_Hrvatske",
"Hrvatski_nogometni_klubovi",
"Hrvatski_košarkaški_klubovi",
"Hrvatski_rukometni_klubovi",
"Hrvatski_odbojkaški_klubovi",
"Hrvatske_političke_stranke",
"Rijeka",
"Krk",
"Cres",
"Lošinj",
"Rab",
"Pag",
"Učka",
"HNK_Rijeka"
]
+9 -14
View File
@@ -18,40 +18,35 @@ API = "https://hr.wikipedia.org/w/api.php"
# Kategorije — širok HR knowledge bazu # Kategorije — širok HR knowledge bazu
CATEGORIES = [ CATEGORIES = [
"Hrvatski_gradovi", "Gradovi_u_Hrvatskoj",
"Hrvatske_općine",
"Hrvatski_otoci", "Hrvatski_otoci",
"Hrvatske_planine", "Planine_u_Hrvatskoj",
"Hrvatske_rijeke", "Rijeke_u_Hrvatskoj",
"Primorsko-goranska_županija", "Primorsko-goranska_županija",
"Naselja_u_Primorsko-goranskoj_županiji", "Naselja_u_Primorsko-goranskoj_županiji",
"Hrvatski_političari", "Hrvatski_političari",
"Hrvatski_sportaši", "Hrvatski_športaši",
"Hrvatski_glazbenici", "Hrvatski_glazbenici",
"Hrvatski_pisci", "Hrvatski_književnici",
"Hrvatski_glumci", "Hrvatski_glumci",
"Hrvatska_povijest", "Hrvatska_povijest",
"Hrvatska_arhitektura",
"Hrvatska_kuhinja", "Hrvatska_kuhinja",
"Hrvatska_kultura", "Hrvatska_kultura",
"Hrvatska_znanost",
"Domovinski_rat", "Domovinski_rat",
"Hrvatska_ekonomija", "Gospodarstvo_Hrvatske",
"Hrvatski_klubovi",
"Hrvatski_nogometni_klubovi", "Hrvatski_nogometni_klubovi",
"Hrvatski_košarkaški_klubovi", "Hrvatski_košarkaški_klubovi",
"Hrvatski_rukometni_klubovi",
"Hrvatski_odbojkaški_klubovi",
"Hrvatske_političke_stranke", "Hrvatske_političke_stranke",
"Predsjednici_Hrvatske",
"Premijeri_Hrvatske",
"Rijeka", "Rijeka",
"Kvarner",
"Krk", "Krk",
"Cres", "Cres",
"Lošinj", "Lošinj",
"Rab", "Rab",
"Pag", "Pag",
"Učka", "Učka",
"Risnjak", "HNK_Rijeka"
] ]
def api_get(params): def api_get(params):
+185
View File
@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""
auto_triage.py — Active error monitor for the pgz-sport stack.
Tails /var/log/pgz-sport-debug/errors.jsonl, classifies the errors,
and automatically dispatches tasks to CC agents when it detects a pattern.
Patterns:
- Recurring 5xx → CC4 (backend)
- 401/403 spike → CC2 (auth)
- 4xx on a specific page → CC3 (frontend route)
- DB connection error → CC4 + urgent telegram
- ImportError/AttributeError in pgz-sport → CC4 dispatch + restart attempt
"""
import json, os, re, time, subprocess, sys
from collections import defaultdict, deque
from pathlib import Path
from datetime import datetime
# Input feed and the two output files written by this monitor.
LOG_FILE = Path("/var/log/pgz-sport-debug/errors.jsonl")
TRIAGE_LOG = Path("/var/log/pgz-sport-debug/triage.log")
TRIAGE_DECISIONS = Path("/var/log/pgz-sport-debug/triage_decisions.jsonl")

# Telegram credentials are read from the environment rather than committed to
# the repository. The bot token that used to be hard-coded here has been
# published with the source and must be revoked and reissued.
TG_TOKEN = os.environ.get("PGZ_TG_TOKEN", "")
TG_CHAT = os.environ.get("PGZ_TG_CHAT", "7969491558")
if not TG_TOKEN:
    print("auto_triage: PGZ_TG_TOKEN is not set; telegram alerts will not be delivered",
          file=sys.stderr, flush=True)

# Rate limit: no more than RATE_MAX telegram messages per RATE_WIN seconds.
RATE_WIN = 300  # seconds
RATE_MAX = 6
recent_alerts = deque()  # send timestamps inside the current rate window

# Pattern counts (sliding window)
PATTERN_WIN = 60  # 60s window
recent_patterns = defaultdict(deque)  # pattern key -> occurrence timestamps
def log(msg):
    """Append a timestamped line to the triage log and echo it to stdout."""
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    with open(TRIAGE_LOG, "a") as sink:
        sink.write(entry + "\n")
    # flush so the line shows up immediately when stdout is captured
    print(entry, flush=True)
def telegram(text):
    """Send ``text`` to the configured Telegram chat, subject to rate limiting.

    At most RATE_MAX messages are attempted per RATE_WIN seconds; anything
    beyond that is logged locally and dropped.

    The request is made with urllib rather than by spawning curl, so the bot
    token never appears in a process argument list and a failed delivery
    (network error or non-2xx response) is actually detected.

    Returns True only when Telegram accepted the message, False otherwise.
    """
    import urllib.error
    import urllib.parse
    import urllib.request

    now = time.time()
    # Drop timestamps that have aged out of the rate window.
    while recent_alerts and now - recent_alerts[0] > RATE_WIN:
        recent_alerts.popleft()
    if len(recent_alerts) >= RATE_MAX:
        log(f"RATE LIMITED telegram: {text[:80]}")
        return False
    if not TG_TOKEN or not TG_CHAT:
        log(f"telegram fail: bot token/chat id not configured: {text[:80]}")
        return False

    # The slot is consumed before sending so a failing endpoint is not hammered.
    recent_alerts.append(now)
    payload = urllib.parse.urlencode({"chat_id": TG_CHAT, "text": text}).encode("utf-8")
    url = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    try:
        # urlopen raises HTTPError (a URLError) on 4xx/5xx responses.
        with urllib.request.urlopen(url, data=payload, timeout=10):
            return True
    except (urllib.error.URLError, OSError, ValueError) as e:
        # Log only the exception text; it does not include the request URL.
        log(f"telegram fail: {e}")
        return False
def dispatch_to_cc(session, msg):
    """Type ``msg`` into window 0 of tmux session ``session`` and press Enter.

    The text is sent with ``send-keys -l`` so tmux treats it as literal
    characters instead of looking it up as a key name, and ``--`` stops option
    parsing so a message starting with ``-`` cannot be read as a flag.

    The tmux exit status is checked: a missing session or tmux failure is
    logged and reported as False instead of being recorded as a successful
    dispatch.

    NOTE(review): callers in this file build ``msg`` from logged request data
    (method/path); that text ends up typed into an agent session — confirm
    this is acceptable.

    Returns True when both tmux calls succeeded and the decision was recorded.
    """
    target = f"{session}:0"
    try:
        typed = subprocess.run(
            ["tmux", "send-keys", "-t", target, "-l", "--", msg],
            check=False, capture_output=True, timeout=10,
        )
        if typed.returncode != 0:
            detail = typed.stderr.decode("utf-8", errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {typed.returncode} {detail}")
            return False
        time.sleep(1)  # give the pane a moment before submitting the line
        entered = subprocess.run(
            ["tmux", "send-keys", "-t", target, "Enter"],
            check=False, capture_output=True, timeout=10,
        )
        if entered.returncode != 0:
            detail = entered.stderr.decode("utf-8", errors="replace").strip()
            log(f"dispatch fail to {session}: tmux exit {entered.returncode} {detail}")
            return False
        log(f"dispatched to {session}: {msg[:80]}")
        record_decision({"action": "dispatch", "target": session, "msg": msg[:200]})
        return True
    except (OSError, subprocess.SubprocessError) as e:
        # tmux missing, timeout, or the decision file could not be written
        log(f"dispatch fail to {session}: {e}")
        return False
def record_decision(obj):
    """Stamp ``obj`` with the current time and append it to the decisions file.

    The dict is modified in place (a ``ts`` key is set) and written as a
    single JSON line.
    """
    obj["ts"] = datetime.now().isoformat(timespec='seconds')
    with open(TRIAGE_DECISIONS, "a") as sink:
        sink.write(f"{json.dumps(obj)}\n")
def pattern_count(key, since=None):
    """Return how many occurrences of ``key`` lie inside the sliding window.

    ``since`` is the cutoff timestamp; when omitted it is PATTERN_WIN seconds
    before now. Older timestamps are discarded from the stored deque.
    """
    cutoff = (time.time() - PATTERN_WIN) if since is None else since
    window = recent_patterns[key]
    # timestamps were appended in order, so expired ones sit at the left end
    while window and window[0] < cutoff:
        window.popleft()
    return len(window)
def add_pattern(key):
    """Record one occurrence of ``key`` at the current time."""
    window = recent_patterns[key]
    window.append(time.time())
# Critical message patterns, compiled once; checked in order, first match wins.
# NOTE(review): "asyncpg|psycopg2.*OperationalError" matches any mention of
# "asyncpg" on its own — confirm whether "(asyncpg|psycopg2).*OperationalError"
# was intended.
_CRIT_PATTERNS = [
    (re.compile(r"ImportError|ModuleNotFoundError", re.I), "import_error"),
    (re.compile(r"AttributeError", re.I), "attribute_error"),
    (re.compile(r"SyntaxError", re.I), "syntax_error"),
    (re.compile(r"OperationalError.*could not connect", re.I), "db_connect_error"),
    (re.compile(r"asyncpg|psycopg2.*OperationalError", re.I), "db_pool_error"),
    (re.compile(r"FATAL|CRITICAL", re.I), "fatal"),
]


def classify(line):
    """Classify one JSONL error line and trigger alerts/dispatches on patterns.

    ``line`` is expected to be a JSON object with optional ``msg``, ``code``,
    ``path``, ``method`` and ``size`` fields. Lines that are not valid JSON, or
    whose top-level value is not an object, are ignored.

    Returns ``(kind, count, detail)`` when a pattern fired, otherwise None.
    Side effects when a pattern fires: a telegram alert and a task dispatched
    to a CC tmux session; spike counters are reset after dispatch.
    """
    try:
        ev = json.loads(line)
    except (ValueError, TypeError):
        return None
    if not isinstance(ev, dict):
        # valid JSON but not an object (string, number, list): nothing to read
        return None

    # Coerce to str so a numeric "code" or a null field cannot crash the monitor.
    msg = str(ev.get("msg") or "")
    code = str(ev.get("code") or "")
    path = str(ev.get("path") or "")
    method = str(ev.get("method") or "")

    # ─── Pattern A: HTTP 5xx
    if code.startswith("5"):
        key = f"5xx:{path[:100]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 3:
            telegram(f"⚠️ 5xx spike: {method} {path} {code} (×{n}/60s)")
            dispatch_to_cc("cc4", f"5xx detected: {method} {path} {code} occurring {n}x in 60s. Investigate /opt/pgz-sport/routers/ for the route handler. Check DB connection, log traceback. Run smoke test. Fix + restart pgz-sport + verify resolved.")
            recent_patterns[key].clear()  # reset after dispatch
            return ("5xx_spike", n, path)

    # ─── Pattern B: 401/403 spike (auth issue)
    if code in ("401", "403"):
        key = f"auth:{path[:80]}"
        add_pattern(key)
        n = pattern_count(key)
        if n >= 5:
            telegram(f"🔒 Auth spike: {code} on {path} (×{n}/60s)")
            dispatch_to_cc("cc2", f"Auth spike: {code} on {path} ×{n} times in 60s. Check JWT middleware in pgz_sport_api.py + auth/auth_v2.py. Verify role-based access control. Smoke test 3 demo accounts.")
            recent_patterns[key].clear()
            return ("auth_spike", n, path)

    # ─── Pattern C: 4xx on consumer endpoints (frontend bug)
    if code.startswith("4") and code not in ("401", "403"):
        if path.startswith("/sport/api/"):
            key = f"4xx_api:{path[:80]}"
            add_pattern(key)
            n = pattern_count(key)
            if n >= 5:
                telegram(f"⚠️ 4xx API: {path} ×{n}/60s")
                dispatch_to_cc("cc3", f"Frontend bug: {path} returning {code} ×{n}x. Frontend may call wrong URL or send bad payload. Check static/*.html for fetch/api() calls to {path}. Verify request shape matches backend schema.")
                recent_patterns[key].clear()
                return ("4xx_api", n, path)

    # ─── Pattern D: ImportError / AttributeError / SyntaxError in the backend
    for pat, kind in _CRIT_PATTERNS:
        if pat.search(msg):
            telegram(f"🚨 {kind.upper()}: {msg[:200]}")
            # every critical kind, DB-related or not, is routed to cc4
            dispatch_to_cc("cc4", f"CRITICAL {kind} detected u pgz-sport: {msg[:300]}. Identify file:line, fix, py_compile, restart, verify. If db_connect_error, check Server B (10.10.0.2:6432) connectivity.")
            return (kind, 1, msg[:80])

    # ─── Pattern E: Empty page detection
    # NOTE(review): the guard looks for the text "size_download" anywhere in the
    # event but then reads the "size" field — confirm which key the producer
    # writes. This branch alerts but, as before, does not return a result tuple.
    if code == "200" and "size_download" in str(ev) and ev.get("size", 0) < 100:
        key = f"empty:{path}"
        add_pattern(key)
        if pattern_count(key) >= 2:
            telegram(f"📄 Empty page: {path}")
            dispatch_to_cc("cc3", f"Empty page detected: {path} returning <100 bytes. Check static/{path.split('/')[-1]} or backend response.")
            recent_patterns[key].clear()
    return None
def follow(path):
    """Follow ``path`` like ``tail -F`` and feed each complete line to classify().

    Waits for the file to appear, starts at its current end, and then runs
    forever. Compared with a plain seek-and-readline loop this version:

      * closes the file handle (``with``) whenever it stops reading a file,
      * reopens the file when it is rotated (inode changed), truncated
        (size shrank below the read offset) or removed, reading the new
        file from its beginning,
      * buffers a partially written line until its newline arrives, so a
        JSON record is never parsed in two halves,
      * reads bytes and decodes as UTF-8 with replacement, so one undecodable
        byte cannot stop the monitor.
    """
    start_at_end = True  # only the very first open skips existing content
    while True:
        while not path.exists():
            time.sleep(1)
        try:
            fh = open(path, "rb")
        except FileNotFoundError:
            continue  # removed between exists() and open(); wait again
        with fh:
            inode = os.fstat(fh.fileno()).st_ino
            if start_at_end:
                fh.seek(0, os.SEEK_END)
                start_at_end = False
            pending = b""
            while True:
                chunk = fh.readline()
                if chunk:
                    pending += chunk
                    if not pending.endswith(b"\n"):
                        continue  # writer has not finished this line yet
                    line, pending = pending.decode("utf-8", errors="replace"), b""
                    result = classify(line)
                    if result:
                        log(f"PATTERN {result[0]} ×{result[1]}: {result[2]}")
                    continue
                # At EOF: check whether the path still refers to this file.
                try:
                    st = os.stat(path)
                except FileNotFoundError:
                    break
                if st.st_ino != inode or st.st_size < fh.tell():
                    break
                time.sleep(0.5)
# Script entry point: announce startup, then follow the error feed until Ctrl-C.
if __name__ == "__main__":
    for banner in ("auto_triage starting", f"watching {LOG_FILE}"):
        log(banner)
    try:
        follow(LOG_FILE)
    except KeyboardInterrupt:
        log("shutdown")
+59
View File
@@ -0,0 +1,59 @@
#!/bin/bash
# Tail journalctl + nginx logs into a structured JSONL stream.
#
# Writes every event to $LOGDIR/stream.jsonl and error-level events to
# $LOGDIR/errors.jsonl (the file the triage agents watch).
LOGDIR=/var/log/pgz-sport-debug
mkdir -p "$LOGDIR"

# Print $1 (whitespace-trimmed) as a JSON string literal, quotes included.
# Passing the text as an argument keeps it out of the shell's own parsing.
json_str() {
    python3 -c 'import json,sys; print(json.dumps(sys.argv[1].strip()))' "$1"
}

# Level classification patterns (case-sensitive, as before).
RE_DEBUG='DEBUG'
RE_WARN='WARNING|WARN'
RE_ERROR='ERROR|Exception|Traceback|CRITICAL|FATAL'

# Tail journalctl
# `IFS= read -r` keeps leading whitespace and backslashes intact.
journalctl -u pgz-sport -f -n 0 --output=cat 2>/dev/null | while IFS= read -r line; do
    ts=$(date -Iseconds)
    level="INFO"
    # Classification: checked in ascending severity so the most severe match
    # wins (a line mentioning both ERROR and DEBUG must stay ERROR).
    if [[ "$line" =~ $RE_DEBUG ]]; then level="DEBUG"; fi
    if [[ "$line" =~ $RE_WARN ]]; then level="WARN"; fi
    if [[ "$line" =~ $RE_ERROR ]]; then level="ERROR"; fi
    # JSON-escape
    safe=$(json_str "$line")
    printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"pgz-sport\",\"level\":\"$level\",\"msg\":$safe}" >> "$LOGDIR/stream.jsonl"
done &
# $! is the PID of the last process of the background pipeline (the loop).
JPID=$!
echo $JPID > "$LOGDIR/journalctl_tail.pid"

# Tail nginx error log
tail -F /var/log/nginx/sport.error.log 2>/dev/null | while IFS= read -r line; do
    ts=$(date -Iseconds)
    safe=$(json_str "$line")
    printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx\",\"level\":\"ERROR\",\"msg\":$safe}" >> "$LOGDIR/stream.jsonl"
done &
NPID=$!
echo $NPID > "$LOGDIR/nginx_tail.pid"

# Tail nginx access log for 4xx/5xx
tail -F /var/log/nginx/sport.access.log 2>/dev/null | while IFS= read -r line; do
    # parse: the status code is the 9th field (combined log format)
    code=$(awk '{print $9}' <<< "$line")
    if [[ "$code" =~ ^[45][0-9][0-9]$ ]]; then
        ts=$(date -Iseconds)
        # method and path come from the request line, so they are JSON-encoded
        # rather than pasted between quotes.
        method=$(json_str "$(awk '{print $6}' <<< "$line" | tr -d '"')")
        path=$(json_str "$(awk '{print $7}' <<< "$line")")
        safe=$(json_str "$line")
        printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx-access\",\"level\":\"WARN\",\"code\":\"$code\",\"method\":$method,\"path\":$path,\"raw\":$safe}" >> "$LOGDIR/stream.jsonl"
        # ACTIVE ALERTING: for 5xx or 401/403, also log to the error feed
        if [[ "$code" =~ ^5 ]] || [[ "$code" == "401" ]] || [[ "$code" == "403" ]]; then
            printf '%s\n' "{\"ts\":\"$ts\",\"src\":\"nginx-access\",\"level\":\"ERROR\",\"code\":\"$code\",\"method\":$method,\"path\":$path}" >> "$LOGDIR/errors.jsonl"
        fi
    fi
done &
APID=$!
echo $APID > "$LOGDIR/access_tail.pid"

# Copy ERROR-level entries into a separate error file (the agents watch this one).
# --line-buffered: without it grep block-buffers its output when writing to a
# file, so errors would reach errors.jsonl only after several KB accumulate.
tail -F "$LOGDIR/stream.jsonl" 2>/dev/null | grep --line-buffered -E "\"level\":\"(ERROR|CRITICAL|FATAL)\"" >> "$LOGDIR/errors.jsonl" &
EPID=$!
echo $EPID > "$LOGDIR/error_filter.pid"

echo "Debug tail running. PIDs: journalctl=$JPID nginx=$NPID access=$APID error_filter=$EPID"
echo " stream.jsonl + errors.jsonl in $LOGDIR"
wait