test: US-15 boot recovery tests — .228 passes 9/9, .198 needs CONT-02
- Add US-15 boot recovery test to test-cross-node.sh (--skip-reboot flag)
- .228: 32/32 containers survive all 3 reboots, 0 exited
- .198: sequential crash recovery blocks health for 260s
- Add federation rate limits (federation.join 5/60, peer RPCs 10/60)
- Add DWN message data size limit (10MB max)
- Known: .228 unreachable after reboot tests, needs physical access

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -766,6 +766,93 @@ for node in "$NODE_A" "$NODE_B"; do
|
||||
done
|
||||
done
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
# US-15: Boot Recovery
# ═══════════════════════════════════════════════════════════════════════════
# Runs only when SKIP_REBOOT is exactly "false". Reboots NODE_A and NODE_B
# REBOOT_ITERATIONS times each and, after every reboot, emits TAP results for:
#   - SSH returning within 180s (reported only on failure)
#   - the backend /health endpoint answering OK within 120s
#   - the running-container count being within 3 of the pre-reboot count
#   - no container being in the exited state
# Uses ssh_sudo, tap_ok, tap_fail, SSH_OPTS, NODE_A, NODE_B and SKIP_REBOOT,
# all of which are defined outside this section.

# Succeeds when $1 is a non-empty string of decimal digits.
_us15_is_uint() {
  [[ "${1:-}" =~ ^[0-9]+$ ]]
}

# Succeeds when the post-reboot container count ($2) is no more than $3
# (default 3) below the pre-reboot count ($1). Fails if either count is
# missing or non-numeric, so a broken SSH read can never look like a pass.
_us15_count_recovered() {
  local pre="${1:-}" post="${2:-}" slack="${3:-3}"
  _us15_is_uint "$pre" || return 1
  _us15_is_uint "$post" || return 1
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  (( 10#$post >= 10#$pre - slack ))
}

# Succeeds when the current boot id ($2) is known and differs from the
# pre-reboot boot id ($1), i.e. the node is answering from a fresh boot.
_us15_rebooted() {
  local before="${1:-}" now="${2:-}"
  [[ -n "$now" && "$now" != "$before" ]]
}

# Prints the kernel boot id of node $1, or nothing when it cannot be read.
_us15_boot_id() {
  # SSH_OPTS is intentionally unquoted: it is a space-separated option list.
  # shellcheck disable=SC2086
  ssh ${SSH_OPTS} "archipelago@${1}" "cat /proc/sys/kernel/random/boot_id" 2>/dev/null \
    | tail -1 | tr -d '[:space:]' || true
}

echo ""
echo "# --- US-15: Boot Recovery ---"

if [[ "$SKIP_REBOOT" == "false" ]]; then
  REBOOT_ITERATIONS=3

  for node in "$NODE_A" "$NODE_B"; do
    if [[ "$node" == "$NODE_A" ]]; then
      node_label="A(.228)"
    else
      node_label="B(.198)"
    fi

    for ((ri = 1; ri <= REBOOT_ITERATIONS; ri++)); do
      echo "# [$(date +%H:%M:%S)] Reboot test ${ri}/${REBOOT_ITERATIONS} on ${node_label}"

      # Record container count before reboot. Only the last output line is
      # kept, in case ssh_sudo prints anything ahead of the command output.
      pre_count=$(ssh_sudo "$node" "podman ps --format '{{.Names}}' | wc -l" 2>/dev/null | tail -1 | tr -d '[:space:]') || true
      echo "# Pre-reboot containers: ${pre_count}"

      # Remember which boot we are on so that "SSH is back" can be told apart
      # from "the old system has not finished shutting down yet".
      pre_boot_id=$(_us15_boot_id "$node")

      # Reboot the node (the SSH session is expected to drop, so the exit
      # status is deliberately ignored).
      ssh_sudo "$node" "reboot" 2>/dev/null || true

      # Wait for SSH to come back (poll every 10s, max 180s)
      echo "# Waiting for SSH..."
      ssh_back=false
      still_old_boot=false
      for ((poll = 1; poll <= 18; poll++)); do
        sleep 10
        if [[ -n "$pre_boot_id" ]]; then
          cur_boot_id=$(_us15_boot_id "$node")
          if _us15_rebooted "$pre_boot_id" "$cur_boot_id"; then
            ssh_back=true
          elif [[ -n "$cur_boot_id" ]]; then
            # Reachable, but still the pre-reboot system.
            still_old_boot=true
          else
            still_old_boot=false
          fi
        else
          # Boot id could not be read before the reboot: fall back to a plain
          # reachability probe.
          # shellcheck disable=SC2086
          if ssh ${SSH_OPTS} "archipelago@${node}" "echo ok" 2>/dev/null | grep -q ok; then
            ssh_back=true
          fi
        fi
        if [[ "$ssh_back" == "true" ]]; then
          echo "# SSH back after $((poll * 10))s"
          break
        fi
      done

      if [[ "$ssh_back" != "true" ]]; then
        if [[ "$still_old_boot" == "true" ]]; then
          tap_fail "US15-${node_label}-ssh-back-${ri}" "Node still on pre-reboot boot id after 180s (reboot did not happen)"
        else
          tap_fail "US15-${node_label}-ssh-back-${ri}" "SSH not available after 180s"
        fi
        continue
      fi

      # Wait for backend health (poll every 5s, max 120s)
      echo "# Waiting for backend health..."
      health_ok=false
      for ((poll = 1; poll <= 24; poll++)); do
        sleep 5
        if curl -s --max-time 5 "http://${node}/health" 2>/dev/null | grep -q OK; then
          health_ok=true
          echo "# Health OK after $((poll * 5))s"
          break
        fi
      done

      if [[ "$health_ok" == "true" ]]; then
        tap_ok "US15-${node_label}-health-${ri}"
      else
        tap_fail "US15-${node_label}-health-${ri}" "Backend not healthy after 120s"
        continue
      fi

      # Wait an additional 30s for containers to finish starting
      sleep 30

      # Verify containers recovered
      post_count=$(ssh_sudo "$node" "podman ps --format '{{.Names}}' | wc -l" 2>/dev/null | tail -1 | tr -d '[:space:]') || true
      # grep -c exits 1 when it counts zero matches; "|| true" on the remote
      # side keeps that from looking like a failure, while a failed SSH call
      # leaves the value empty instead of silently becoming "0".
      exited=$(ssh_sudo "$node" "podman ps -a --format '{{.State}}' | grep -c -i exited || true" 2>/dev/null | tail -1 | tr -d '[:space:]') || true

      echo "# Post-reboot containers: ${post_count} (was ${pre_count}), exited: ${exited:-?}"

      # Check: container count recovered (within 3 of pre-reboot)
      if _us15_count_recovered "$pre_count" "$post_count" 3; then
        tap_ok "US15-${node_label}-containers-recovered-${ri} # ${post_count}/${pre_count}"
      else
        tap_fail "US15-${node_label}-containers-recovered-${ri}" "Only ${post_count:-0}/${pre_count:-?} containers"
      fi

      # Check: no containers exited
      if ! _us15_is_uint "$exited"; then
        tap_fail "US15-${node_label}-no-exited-${ri}" "Could not read exited-container count"
      elif [[ "$exited" == "0" ]]; then
        tap_ok "US15-${node_label}-no-exited-${ri}"
      else
        tap_fail "US15-${node_label}-no-exited-${ri}" "${exited} containers exited"
      fi
    done
  done
else
  echo "# SKIPPED (--skip-reboot flag set)"
fi
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Summary
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
Reference in New Issue
Block a user