feat: deploy daily reboot test + stability report generator (SOAK-03/04)
SOAK-03: daily-reboot-test.sh deployed on both nodes via cron (4 AM). Systemd oneshot verifies recovery on boot, logs to reboot-test.csv. SOAK-04: generate-stability-report.sh compiles metrics from uptime-monitor, reboot-test, sync-check CSVs. Initial .228 report: 99.847% uptime, 0 OOM kills, 32/32 containers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
86
scripts/daily-reboot-test.sh
Executable file
86
scripts/daily-reboot-test.sh
Executable file
@@ -0,0 +1,86 @@
|
||||
#!/bin/bash
|
||||
# daily-reboot-test.sh — Automated daily reboot with recovery verification
|
||||
# Run via cron: 0 4 * * * /opt/archipelago/scripts/daily-reboot-test.sh
|
||||
#
|
||||
# 1. Records pre-reboot state
|
||||
# 2. Reboots the node
|
||||
# 3. After reboot, systemd runs this again via a oneshot service
|
||||
# that verifies recovery and logs the result
|
||||
#
|
||||
# Logs to /var/lib/archipelago/monitoring/reboot-test.csv
|
||||
|
||||
LOG_DIR="/var/lib/archipelago/monitoring"
|
||||
LOG_FILE="${LOG_DIR}/reboot-test.csv"
|
||||
STATE_FILE="${LOG_DIR}/reboot-test-state.json"
|
||||
HEALTH_URL="http://localhost:5678/health"
|
||||
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
# Create CSV header if needed
|
||||
if [ ! -f "$LOG_FILE" ]; then
|
||||
echo "timestamp,phase,containers_pre,containers_post,exited,health,recovery_secs" > "$LOG_FILE"
|
||||
fi
|
||||
|
||||
TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
|
||||
# Check if we're in the verification phase (state file exists from pre-reboot)
|
||||
if [ -f "$STATE_FILE" ]; then
|
||||
# POST-REBOOT VERIFICATION
|
||||
PRE_COUNT=$(python3 -c "import json; print(json.load(open('${STATE_FILE}')).get('containers',0))" 2>/dev/null || echo 0)
|
||||
REBOOT_TIME=$(python3 -c "import json; print(json.load(open('${STATE_FILE}')).get('timestamp',''))" 2>/dev/null || echo "")
|
||||
|
||||
# Wait for backend health (max 5 min)
|
||||
HEALTH="fail"
|
||||
START_WAIT=$(date +%s)
|
||||
for i in $(seq 1 60); do
|
||||
sleep 5
|
||||
HEALTH=$(curl -s --max-time 5 "$HEALTH_URL" 2>/dev/null || echo "fail")
|
||||
if [ "$HEALTH" = "OK" ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
WAIT_SECS=$(( $(date +%s) - START_WAIT ))
|
||||
|
||||
# Wait another 60s for containers to stabilize
|
||||
sleep 60
|
||||
|
||||
# Count containers
|
||||
DOCKER=podman; command -v podman >/dev/null 2>&1 || DOCKER=docker
|
||||
POST_COUNT=$(sudo $DOCKER ps --format '{{.Names}}' 2>/dev/null | wc -l)
|
||||
EXITED=$(sudo $DOCKER ps -a --format '{{.State}}' 2>/dev/null | grep -ci exited || echo 0)
|
||||
|
||||
RECOVERY_SECS=$((WAIT_SECS + 60))
|
||||
|
||||
echo "${TIMESTAMP},verify,${PRE_COUNT},${POST_COUNT},${EXITED},${HEALTH},${RECOVERY_SECS}" >> "$LOG_FILE"
|
||||
|
||||
# Clean up state file
|
||||
rm -f "$STATE_FILE"
|
||||
|
||||
# Update summary
|
||||
TOTAL=$(grep -c ",verify," "$LOG_FILE" 2>/dev/null || echo 0)
|
||||
OK=$(grep ",verify,.*,OK," "$LOG_FILE" 2>/dev/null | wc -l || echo 0)
|
||||
cat > "${LOG_DIR}/reboot-test-summary.json" << EOF
|
||||
{
|
||||
"total_reboots": ${TOTAL},
|
||||
"successful": ${OK},
|
||||
"last_test": "${TIMESTAMP}",
|
||||
"last_recovery_secs": ${RECOVERY_SECS},
|
||||
"last_health": "${HEALTH}",
|
||||
"last_containers": "${POST_COUNT}/${PRE_COUNT}"
|
||||
}
|
||||
EOF
|
||||
else
|
||||
# PRE-REBOOT: Record state and schedule reboot
|
||||
DOCKER=podman; command -v podman >/dev/null 2>&1 || DOCKER=docker
|
||||
CONTAINERS=$(sudo $DOCKER ps --format '{{.Names}}' 2>/dev/null | wc -l)
|
||||
|
||||
# Save state for post-reboot verification
|
||||
cat > "$STATE_FILE" << EOF
|
||||
{"timestamp": "${TIMESTAMP}", "containers": ${CONTAINERS}}
|
||||
EOF
|
||||
|
||||
echo "${TIMESTAMP},reboot,${CONTAINERS},,,," >> "$LOG_FILE"
|
||||
|
||||
# Reboot in 30 seconds (allows cron to finish cleanly)
|
||||
(sleep 30 && sudo reboot) &
|
||||
fi
|
||||
124
scripts/generate-stability-report.sh
Executable file
124
scripts/generate-stability-report.sh
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
|
||||
# generate-stability-report.sh — Compile stability report from monitoring data
|
||||
# Run after 30-day soak test period
|
||||
# Usage: ./scripts/generate-stability-report.sh [TARGET_IP]
|
||||
|
||||
TARGET="${1:-192.168.1.228}"
|
||||
SSH_KEY="${HOME}/.ssh/archipelago-deploy"
|
||||
SSH_OPTS="-i ${SSH_KEY} -o StrictHostKeyChecking=no -o ConnectTimeout=10"
|
||||
|
||||
echo "╔════════════════════════════════════════════════════════════════╗"
|
||||
echo "║ Archipelago Stability Report ║"
|
||||
echo "╚════════════════════════════════════════════════════════════════╝"
|
||||
echo ""
|
||||
echo "Node: ${TARGET}"
|
||||
echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
echo ""
|
||||
|
||||
# Uptime metrics
|
||||
echo "═══ Uptime Metrics ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
if [ -f /var/lib/archipelago/uptime-monitor/metrics.csv ]; then
|
||||
TOTAL=\$(tail -n +2 /var/lib/archipelago/uptime-monitor/metrics.csv | wc -l)
|
||||
OK=\$(grep -c ',200,' /var/lib/archipelago/uptime-monitor/metrics.csv 2>/dev/null || echo 0)
|
||||
if [ \$TOTAL -gt 0 ]; then
|
||||
PCT=\$(python3 -c \"print(round(\$OK / \$TOTAL * 100, 3))\" 2>/dev/null || echo '?')
|
||||
echo \" Total checks: \$TOTAL\"
|
||||
echo \" Healthy: \$OK\"
|
||||
echo \" Uptime: \${PCT}%\"
|
||||
FIRST=\$(head -2 /var/lib/archipelago/uptime-monitor/metrics.csv | tail -1 | cut -d, -f1)
|
||||
LAST=\$(tail -1 /var/lib/archipelago/uptime-monitor/metrics.csv | cut -d, -f1)
|
||||
echo \" Period: \$FIRST to \$LAST\"
|
||||
fi
|
||||
else
|
||||
echo ' No uptime data found'
|
||||
fi
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# Reboot test results
|
||||
echo "═══ Daily Reboot Tests ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
if [ -f /var/lib/archipelago/monitoring/reboot-test.csv ]; then
|
||||
REBOOTS=\$(grep -c ',reboot,' /var/lib/archipelago/monitoring/reboot-test.csv 2>/dev/null || echo 0)
|
||||
VERIFIED=\$(grep -c ',verify,' /var/lib/archipelago/monitoring/reboot-test.csv 2>/dev/null || echo 0)
|
||||
OK=\$(grep ',verify,.*,OK,' /var/lib/archipelago/monitoring/reboot-test.csv 2>/dev/null | wc -l || echo 0)
|
||||
if [ \$VERIFIED -gt 0 ]; then
|
||||
AVG=\$(grep ',verify,' /var/lib/archipelago/monitoring/reboot-test.csv | awk -F, '{sum+=\$7; n++} END {if(n>0) print int(sum/n); else print 0}')
|
||||
echo \" Total reboots: \$REBOOTS\"
|
||||
echo \" Verified recoveries: \$VERIFIED\"
|
||||
echo \" Successful: \$OK\"
|
||||
echo \" Avg recovery time: \${AVG}s\"
|
||||
fi
|
||||
else
|
||||
echo ' No reboot test data (starts at 4 AM daily)'
|
||||
fi
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# Federation sync
|
||||
echo "═══ Federation Sync ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
if [ -f /var/lib/archipelago/monitoring/sync-check.csv ]; then
|
||||
TOTAL=\$(tail -n +2 /var/lib/archipelago/monitoring/sync-check.csv | wc -l)
|
||||
OK=\$(awk -F, '\$2 > 0' /var/lib/archipelago/monitoring/sync-check.csv | wc -l)
|
||||
if [ \$TOTAL -gt 0 ]; then
|
||||
PCT=\$(python3 -c \"print(round(\$OK / \$TOTAL * 100, 1))\" 2>/dev/null || echo '?')
|
||||
echo \" Total syncs: \$TOTAL\"
|
||||
echo \" Successful: \$OK\"
|
||||
echo \" Success rate: \${PCT}%\"
|
||||
fi
|
||||
else
|
||||
echo ' No sync data yet'
|
||||
fi
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# Memory trend
|
||||
echo "═══ Memory Trend ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
if [ -f /var/lib/archipelago/uptime-monitor/metrics.csv ]; then
|
||||
echo ' First reading:'
|
||||
head -2 /var/lib/archipelago/uptime-monitor/metrics.csv | tail -1 | awk -F, '{printf \" %s: %s/%s MB used\\n\", \$1, \$7, \$8}'
|
||||
echo ' Latest reading:'
|
||||
tail -1 /var/lib/archipelago/uptime-monitor/metrics.csv | awk -F, '{printf \" %s: %s/%s MB used\\n\", \$1, \$7, \$8}'
|
||||
fi
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# Disk trend
|
||||
echo "═══ Disk Trend ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
if [ -f /var/lib/archipelago/uptime-monitor/metrics.csv ]; then
|
||||
echo ' First reading:'
|
||||
head -2 /var/lib/archipelago/uptime-monitor/metrics.csv | tail -1 | awk -F, '{printf \" %s: %s/%s GB\\n\", \$1, \$9, \$10}'
|
||||
echo ' Latest reading:'
|
||||
tail -1 /var/lib/archipelago/uptime-monitor/metrics.csv | awk -F, '{printf \" %s: %s/%s GB\\n\", \$1, \$9, \$10}'
|
||||
fi
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# Container health
|
||||
echo "═══ Container Health ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
DOCKER=podman; command -v podman >/dev/null 2>&1 || DOCKER=docker
|
||||
RUNNING=\$(sudo \$DOCKER ps --format '{{.Names}}' 2>/dev/null | wc -l)
|
||||
EXITED=\$(sudo \$DOCKER ps -a --filter status=exited --format '{{.Names}}' 2>/dev/null | wc -l)
|
||||
echo \" Running: \$RUNNING\"
|
||||
echo \" Exited: \$EXITED\"
|
||||
if [ \$EXITED -gt 0 ]; then
|
||||
echo ' Exited containers:'
|
||||
sudo \$DOCKER ps -a --filter status=exited --format ' {{.Names}}: {{.Status}}' 2>/dev/null
|
||||
fi
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# OOM kills
|
||||
echo "═══ OOM Kills ═══"
|
||||
ssh $SSH_OPTS "archipelago@${TARGET}" "
|
||||
OOM=\$(sudo dmesg --level=err,crit 2>/dev/null | grep -c 'oom-kill' || echo 0)
|
||||
echo \" OOM kills since boot: \$OOM\"
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
echo "═══ Report Complete ═══"
|
||||
Reference in New Issue
Block a user