SOAK-03: daily-reboot-test.sh deployed on both nodes via cron (4 AM). Systemd oneshot verifies recovery on boot, logs to reboot-test.csv. SOAK-04: generate-stability-report.sh compiles metrics from uptime-monitor, reboot-test, sync-check CSVs. Initial .228 report: 99.847% uptime, 0 OOM kills, 32/32 containers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
125 lines
5.1 KiB
Bash
Executable File
125 lines
5.1 KiB
Bash
Executable File
#!/bin/bash
# generate-stability-report.sh — Compile stability report from monitoring data
#
# Run after 30-day soak test period.
#
# Usage: ./scripts/generate-stability-report.sh [TARGET_IP]
#   TARGET_IP  Address of the node to report on (default: 192.168.1.228).
#
# Every report section opens its own SSH session as archipelago@TARGET_IP,
# authenticating with the deploy key at ~/.ssh/archipelago-deploy.

# Node to query; the first positional argument overrides the default.
TARGET="${1:-192.168.1.228}"

# Private key handed to ssh via -i.
SSH_KEY="${HOME}/.ssh/archipelago-deploy"

# Deliberately a plain string rather than an array: the report sections expand
# it unquoted (ssh $SSH_OPTS ...) and rely on word splitting.
# NOTE(review): StrictHostKeyChecking=no turns off host-key verification, so a
# spoofed host would be accepted silently — confirm this is acceptable for the
# network the soak test runs on.
SSH_OPTS="-i ${SSH_KEY} -o StrictHostKeyChecking=no -o ConnectTimeout=10"

# Report banner: title box, the node being inspected, and a UTC timestamp in
# ISO-8601 form so reports from different machines sort consistently.
echo "╔════════════════════════════════════════════════════════════════╗"
echo "║ Archipelago Stability Report ║"
echo "╚════════════════════════════════════════════════════════════════╝"
echo ""
echo "Node: ${TARGET}"
echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo ""

# Uptime metrics
# Reads /var/lib/archipelago/uptime-monitor/metrics.csv on the node and prints
# the number of checks, how many rows contain ",200,", the resulting
# percentage, and the first/last value of column 1 as the covered period.
echo "═══ Uptime Metrics ═══"

# The remote script is captured from a quoted here-doc so it reaches the node
# verbatim (no \$ escaping). read returns non-zero at end of input by design,
# hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
csv=/var/lib/archipelago/uptime-monitor/metrics.csv
if [ -f "$csv" ]; then
  # Line 1 is treated as a header row and excluded from the total.
  TOTAL=$(tail -n +2 "$csv" | wc -l)
  # grep -c already prints "0" (and exits 1) when nothing matches, so a
  # "|| echo 0" fallback would yield "0<newline>0". The default below only
  # covers the case where grep printed nothing at all (e.g. unreadable file).
  OK=$(grep -c ',200,' "$csv" 2>/dev/null)
  OK=${OK:-0}
  if [ "$TOTAL" -gt 0 ]; then
    # Counts go in as arguments rather than being spliced into the code.
    PCT=$(python3 -c 'import sys; print(round(int(sys.argv[1]) / int(sys.argv[2]) * 100, 3))' "$OK" "$TOTAL" 2>/dev/null || echo '?')
    echo " Total checks: $TOTAL"
    echo " Healthy: $OK"
    echo " Uptime: ${PCT}%"
    FIRST=$(head -2 "$csv" | tail -1 | cut -d, -f1)
    LAST=$(tail -1 "$csv" | cut -d, -f1)
    echo " Period: $FIRST to $LAST"
  else
    # Header-only file: say so instead of printing an empty section.
    echo ' No uptime data found'
  fi
else
  echo ' No uptime data found'
fi
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (uptime metrics unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# Reboot test results
# Reads /var/lib/archipelago/monitoring/reboot-test.csv on the node and prints
# how many ",reboot," rows and ",verify," rows exist, how many verify rows also
# contain ",OK,", and the integer mean of column 7 over the verify rows
# (reported as the average recovery time in seconds).
echo "═══ Daily Reboot Tests ═══"

# Quoted here-doc: the script is sent to the node verbatim. read returns
# non-zero at end of input by design, hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
csv=/var/lib/archipelago/monitoring/reboot-test.csv
if [ -f "$csv" ]; then
  # grep -c prints "0" itself when nothing matches; the ${VAR:-0} defaults only
  # cover grep printing nothing at all.
  REBOOTS=$(grep -c ',reboot,' "$csv" 2>/dev/null)
  REBOOTS=${REBOOTS:-0}
  VERIFIED=$(grep -c ',verify,' "$csv" 2>/dev/null)
  VERIFIED=${VERIFIED:-0}
  OK=$(grep -c ',verify,.*,OK,' "$csv" 2>/dev/null)
  OK=${OK:-0}

  # Always print the counts: reboots with zero verified recoveries is exactly
  # the situation the report must not hide.
  echo " Total reboots: $REBOOTS"
  echo " Verified recoveries: $VERIFIED"
  echo " Successful: $OK"

  if [ "$VERIFIED" -gt 0 ]; then
    AVG=$(awk -F, '/,verify,/ {sum+=$7; n++} END {if(n>0) print int(sum/n); else print 0}' "$csv")
    echo " Avg recovery time: ${AVG}s"
  fi
else
  echo ' No reboot test data (starts at 4 AM daily)'
fi
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (reboot test data unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# Federation sync
# Reads /var/lib/archipelago/monitoring/sync-check.csv on the node and prints
# the number of data rows, how many of them have column 2 greater than zero,
# and the resulting success percentage.
echo "═══ Federation Sync ═══"

# Quoted here-doc: the script is sent to the node verbatim. read returns
# non-zero at end of input by design, hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
csv=/var/lib/archipelago/monitoring/sync-check.csv
if [ -f "$csv" ]; then
  # Line 1 is treated as a header row and excluded from the total.
  TOTAL=$(tail -n +2 "$csv" | wc -l)
  # Skip the header here as well: a non-numeric header field is compared as a
  # string, and text such as "peers" sorts above "0", which would add a phantom
  # success and push the rate past 100%.
  # NOTE(review): assumes column 2 is numeric on data rows — confirm against
  # the writer of sync-check.csv.
  OK=$(awk -F, 'NR > 1 && $2 > 0 { n++ } END { print n + 0 }' "$csv")
  if [ "$TOTAL" -gt 0 ]; then
    # Counts go in as arguments rather than being spliced into the code.
    PCT=$(python3 -c 'import sys; print(round(int(sys.argv[1]) / int(sys.argv[2]) * 100, 1))' "$OK" "$TOTAL" 2>/dev/null || echo '?')
    echo " Total syncs: $TOTAL"
    echo " Successful: $OK"
    echo " Success rate: ${PCT}%"
  else
    # Header-only file: say so instead of printing an empty section.
    echo ' No sync data yet'
  fi
else
  echo ' No sync data yet'
fi
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (sync data unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# Memory trend
# Prints columns 1, 7 and 8 of the first data row and of the last row of
# /var/lib/archipelago/uptime-monitor/metrics.csv, labelled as
# "<col1>: <col7>/<col8> MB used".
echo "═══ Memory Trend ═══"

# Quoted here-doc: the script is sent to the node verbatim, so the awk program
# needs no backslash escaping. read returns non-zero at end of input by design,
# hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
csv=/var/lib/archipelago/uptime-monitor/metrics.csv
# Formats one CSV row read from stdin.
fmt_row() { awk -F, '{printf " %s: %s/%s MB used\n", $1, $7, $8}'; }
if [ -f "$csv" ]; then
  echo ' First reading:'
  # Second line of the file = first row after the header.
  head -2 "$csv" | tail -1 | fmt_row
  echo ' Latest reading:'
  tail -1 "$csv" | fmt_row
else
  # Same wording the uptime section uses for this missing file.
  echo ' No uptime data found'
fi
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (memory trend unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# Disk trend
# Prints columns 1, 9 and 10 of the first data row and of the last row of
# /var/lib/archipelago/uptime-monitor/metrics.csv, labelled as
# "<col1>: <col9>/<col10> GB".
echo "═══ Disk Trend ═══"

# Quoted here-doc: the script is sent to the node verbatim, so the awk program
# needs no backslash escaping. read returns non-zero at end of input by design,
# hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
csv=/var/lib/archipelago/uptime-monitor/metrics.csv
# Formats one CSV row read from stdin.
fmt_row() { awk -F, '{printf " %s: %s/%s GB\n", $1, $9, $10}'; }
if [ -f "$csv" ]; then
  echo ' First reading:'
  # Second line of the file = first row after the header.
  head -2 "$csv" | tail -1 | fmt_row
  echo ' Latest reading:'
  tail -1 "$csv" | fmt_row
else
  # Same wording the uptime section uses for this missing file.
  echo ' No uptime data found'
fi
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (disk trend unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# Container health
# Counts running and exited containers on the node (podman when it is on the
# PATH, docker otherwise) and lists name and status of every exited one.
echo "═══ Container Health ═══"

# Quoted here-doc: the script is sent to the node verbatim, so the Go-template
# braces and $ signs need no escaping. read returns non-zero at end of input by
# design, hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
# Prefer podman; fall back to docker when podman is not installed.
DOCKER=podman
command -v podman >/dev/null 2>&1 || DOCKER=docker

RUNNING=$(sudo "$DOCKER" ps --format '{{.Names}}' 2>/dev/null | wc -l)
EXITED=$(sudo "$DOCKER" ps -a --filter status=exited --format '{{.Names}}' 2>/dev/null | wc -l)
echo " Running: $RUNNING"
echo " Exited: $EXITED"

if [ "$EXITED" -gt 0 ]; then
  echo ' Exited containers:'
  sudo "$DOCKER" ps -a --filter status=exited --format ' {{.Names}}: {{.Status}}' 2>/dev/null
fi
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (container health unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# OOM kills
# Counts out-of-memory kills recorded in the node's kernel ring buffer
# (dmesg), i.e. since the last boot.
echo "═══ OOM Kills ═══"

# Quoted here-doc: the script is sent to the node verbatim. read returns
# non-zero at end of input by design, hence the "|| true".
IFS= read -r -d '' remote_script <<'REMOTE' || true
# One "Killed process <pid>" line is logged per OOM kill.
# NOTE(review): the previous pattern 'oom-kill' appears to match only the
# "oom-kill:constraint=..." summary, which is believed to be logged at info
# level and therefore removed by --level=err,crit, so the count was always 0.
# Confirm the message text and level on the target kernel.
# grep -c prints "0" itself when nothing matches, so no "|| echo 0" fallback
# (that produced "0<newline>0").
OOM=$(sudo dmesg --level=err,crit 2>/dev/null | grep -c 'Killed process [0-9]')
echo " OOM kills since boot: ${OOM:-0}"
REMOTE

# SSH_OPTS is an option string and must word-split here.
# shellcheck disable=SC2086
ssh $SSH_OPTS "archipelago@${TARGET}" "$remote_script" 2>/dev/null \
  || echo " (OOM data unavailable: ssh to ${TARGET} exited with status $?)"
echo ""

# Closing marker so a truncated report is easy to spot.
echo "═══ Report Complete ═══"