test: US-15 boot recovery tests — .228 passes 9/9, .198 needs CONT-02

- Add US-15 boot recovery test to test-cross-node.sh (--skip-reboot flag)
- .228: 32/32 containers survive all 3 reboots, 0 exited
- .198: sequential crash recovery blocks health for 260s
- Add federation rate limits (federation.join 5/60, peer RPCs 10/60)
- Add DWN message data size limit (10MB max)
- Known: .228 unreachable after reboot tests, needs physical access

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-14 02:54:16 +00:00
parent e9849be311
commit 91cad8a9ab
2 changed files with 89 additions and 2 deletions

View File

@@ -766,6 +766,93 @@ for node in "$NODE_A" "$NODE_B"; do
done
done
# ═══════════════════════════════════════════════════════════════════════════
# US-15: Boot Recovery
# ═══════════════════════════════════════════════════════════════════════════
# Reboots each node REBOOT_ITERATIONS times. After every reboot it checks,
# in order:
#   1. SSH answers again, from a *new* boot   (poll every 10s, max 180s)
#   2. http://<node>/health contains "OK"     (poll every 5s,  max 120s)
#   3. running-container count is within 3 of the pre-reboot count
#   4. no container is in the "exited" state
# If step 1 or 2 fails, the remaining checks of that iteration are skipped.
# The whole section only runs when SKIP_REBOOT is "false".
echo ""
echo "# --- US-15: Boot Recovery ---"
if [[ "$SKIP_REBOOT" == "false" ]]; then
  REBOOT_ITERATIONS=3
  # The kernel regenerates this ID on every boot. Comparing it before and
  # after tells a genuinely rebooted node apart from one that is still
  # shutting down and happens to accept one more SSH login.
  boot_id_cmd="cat /proc/sys/kernel/random/boot_id"
  for node in "$NODE_A" "$NODE_B"; do
    if [[ "$node" == "$NODE_A" ]]; then
      node_label="A(.228)"
    else
      node_label="B(.198)"
    fi
    for ((ri = 1; ri <= REBOOT_ITERATIONS; ri++)); do
      echo "# [$(date +%H:%M:%S)] Reboot test ${ri}/${REBOOT_ITERATIONS} on ${node_label}"
      # Record container count before reboot
      pre_count=$(ssh_sudo "$node" "podman ps --format '{{.Names}}' | wc -l" 2>/dev/null | tail -1 | tr -d '[:space:]')
      echo "# Pre-reboot containers: ${pre_count}"
      # Record the current boot ID. Empty means "unknown", in which case the
      # SSH poll below accepts any successful login.
      # shellcheck disable=SC2086 # SSH_OPTS is deliberately word-split
      pre_boot_id=$(ssh ${SSH_OPTS} "archipelago@${node}" "$boot_id_cmd" 2>/dev/null | tail -n 1 | tr -d '[:space:]') || pre_boot_id=""
      # Reboot the node (the connection usually drops, so ignore the status)
      ssh_sudo "$node" "reboot" 2>/dev/null || true
      # Wait for SSH to come back (poll every 10s, max 180s)
      echo "# Waiting for SSH..."
      ssh_back=false
      ssh_fail_reason="SSH not available after 180s"
      for ((poll = 1; poll <= 18; poll++)); do
        sleep 10
        # shellcheck disable=SC2086 # SSH_OPTS is deliberately word-split
        probe=$(ssh ${SSH_OPTS} "archipelago@${node}" "echo ok; ${boot_id_cmd}" 2>/dev/null) || true
        if [[ "$probe" != *ok* ]]; then
          ssh_fail_reason="SSH not available after 180s"
          continue
        fi
        cur_boot_id=$(tail -n 1 <<<"$probe" | tr -d '[:space:]')
        if [[ -n "$pre_boot_id" && "$cur_boot_id" == "$pre_boot_id" ]]; then
          # Reachable, but still the old boot: the node has not gone down yet.
          ssh_fail_reason="Node still on pre-reboot boot after 180s"
          continue
        fi
        ssh_back=true
        echo "# SSH back after $((poll * 10))s"
        break
      done
      if [[ "$ssh_back" != "true" ]]; then
        tap_fail "US15-${node_label}-ssh-back-${ri}" "$ssh_fail_reason"
        continue
      fi
      # Wait for backend health (poll every 5s, max 120s)
      echo "# Waiting for backend health..."
      health_ok=false
      for ((poll = 1; poll <= 24; poll++)); do
        sleep 5
        if curl -s --max-time 5 "http://${node}/health" 2>/dev/null | grep -q OK; then
          health_ok=true
          echo "# Health OK after $((poll * 5))s"
          break
        fi
      done
      if [[ "$health_ok" == "true" ]]; then
        tap_ok "US15-${node_label}-health-${ri}"
      else
        tap_fail "US15-${node_label}-health-${ri}" "Backend not healthy after 120s"
        continue
      fi
      # Wait an additional 30s for containers to finish starting
      sleep 30
      # Verify containers recovered
      post_count=$(ssh_sudo "$node" "podman ps --format '{{.Names}}' | wc -l" 2>/dev/null | tail -1 | tr -d '[:space:]')
      # grep -c exits 1 when it counts zero matches; "|| true" on the remote
      # side keeps that from looking like a failure. No local default is
      # applied, so a failed query stays empty and is reported as a failure
      # below instead of being mistaken for "0 exited".
      exited=$(ssh_sudo "$node" "podman ps -a --format '{{.State}}' | grep -c -i exited || true" 2>/dev/null | tail -1 | tr -d '[:space:]') || exited=""
      echo "# Post-reboot containers: ${post_count} (was ${pre_count}), exited: ${exited:-unknown}"
      # Check: container count recovered (within 3 of pre-reboot).
      # Both counts must be plain integers before they are used in arithmetic.
      if [[ "$pre_count" =~ ^[0-9]+$ && "$post_count" =~ ^[0-9]+$ ]] \
        && ((post_count >= pre_count - 3)); then
        tap_ok "US15-${node_label}-containers-recovered-${ri} # ${post_count}/${pre_count}"
      else
        tap_fail "US15-${node_label}-containers-recovered-${ri}" "Only ${post_count:-0}/${pre_count:-?} containers"
      fi
      # Check: no containers exited
      if [[ ! "$exited" =~ ^[0-9]+$ ]]; then
        tap_fail "US15-${node_label}-no-exited-${ri}" "Could not determine exited container count"
      elif [[ "$exited" == "0" ]]; then
        tap_ok "US15-${node_label}-no-exited-${ri}"
      else
        tap_fail "US15-${node_label}-no-exited-${ri}" "${exited} containers exited"
      fi
    done
  done
else
  echo "# SKIPPED (--skip-reboot flag set)"
fi
# ═══════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════