From ebffe3ace5ffd5f650c54602b105458dac1e4d62 Mon Sep 17 00:00:00 2001
From: archipelago
Date: Fri, 1 May 2026 16:17:27 -0400
Subject: [PATCH] test(lifecycle): regression gate for FM3 cgroup-cascade SIGKILL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sister suite to companion-survives-archipelago-restart.bats. That one
tests the same property for UI companions, which already ship via
Quadlet (commit 6e716f68) and so already pass.

This new suite tests the property for backend containers (bitcoin-knots
/ bitcoin-core / lnd / electrumx). Until v1.7.52 Phase 3 ships these
under Quadlet too, the suite is EXPECTED TO FAIL on fleet boxes — it's
the executable definition of "FM3 fixed".

Observed live on .198 on 2026-05-01: `sudo systemctl stop archipelago`
killed every container in archipelago.service's cgroup. The dedicated
"backends survive archipelago restart" test catches exactly that, and
also verifies the SAME container instance survives (compares pre/post
.Id), so an orchestrator that recreates a fresh container after the
SIGKILL doesn't read as a pass.

Three @test cases:
  * destructive gate (skip-marker for the suite)
  * baseline: at least one backend installed + running
  * backends survive: same .Id pre + post archipelago restart

Don't gate releases on this passing until Phase 3 lands; before then
treat it as an "expected to fail / shows progress" indicator.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../backend-survives-archipelago-restart.bats | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 tests/lifecycle/bats/backend-survives-archipelago-restart.bats

diff --git a/tests/lifecycle/bats/backend-survives-archipelago-restart.bats b/tests/lifecycle/bats/backend-survives-archipelago-restart.bats
new file mode 100644
index 00000000..bb962eaa
--- /dev/null
+++ b/tests/lifecycle/bats/backend-survives-archipelago-restart.bats
@@ -0,0 +1,139 @@
+#!/usr/bin/env bats
+# tests/lifecycle/bats/backend-survives-archipelago-restart.bats
+#
+# Quadlet-everywhere promise (Phase 3 of v1.7.52): backend containers
+# (bitcoin-knots / bitcoin-core / lnd / electrumx) are managed by
+# systemd via Quadlet units, NOT parented under archipelago.service's
+# cgroup. Restarting the archipelago service must NOT take them down.
+#
+# This is the regression gate for FM3 (cgroup cascade SIGKILL — observed
+# live on .198 on 2026-05-01: stopping archipelago.service killed every
+# container in its cgroup, leaving the box in a multi-hour recovery
+# loop). Until v1.7.52 Phase 3 ships, this suite is EXPECTED TO FAIL on
+# fleet boxes — it serves as the executable definition of "Phase 3
+# complete". Do not gate the release on it passing pre-Phase-3.
+#
+# Sister to companion-survives-archipelago-restart.bats which tests the
+# same property for UI companions (already shipping via Quadlet since
+# commit 6e716f68).
+#
+# Gated by ARCHY_ALLOW_DESTRUCTIVE=1 because it bounces archipelago.
+
+# Container names this suite probes; names that are not running on the
+# node are simply left out of the before/after comparison.
+backend_units=(
+  "bitcoin-knots"
+  "bitcoin-core"
+  "lnd"
+  "electrumx"
+)
+
+# bats-core does not ship `fail` (it comes from bats-support). Provide a
+# minimal fallback so the messages below are printed instead of a
+# "fail: command not found" error when no helper library is loaded.
+if ! declare -F fail >/dev/null; then
+  fail() {
+    printf '%s\n' "$*" >&2
+    return 1
+  }
+fi
+
+# Succeeds iff podman reports container $1 as running. Inspect errors
+# (for example an unknown container name) count as "not running".
+container_running() {
+  local name="$1"
+  [[ "$(podman inspect --format '{{.State.Running}}' "$name" 2>/dev/null)" == "true" ]]
+}
+
+# Prints the field of container $1 selected by Go template $2, or an
+# empty line when the container cannot be inspected. Always returns 0.
+container_field() {
+  local name="$1" format="$2"
+  podman inspect --format "$format" "$name" 2>/dev/null || echo ""
+}
+
+# Polls the local health endpoint until it answers successfully.
+# Arguments: $1 - timeout in seconds (default 60)
+# Returns:   0 once the endpoint responds, 1 if the deadline passes
+wait_archipelago_back() {
+  local timeout="${1:-60}"
+  local deadline=$(( $(date +%s) + timeout ))
+  while (( $(date +%s) < deadline )); do
+    # --max-time stops a single hung request from outliving the deadline.
+    if curl -fsS --max-time 5 -o /dev/null "http://127.0.0.1:5678/health" 2>/dev/null; then
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+@test "destructive gate enabled" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+}
+
+@test "at least one backend container is running before restart" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+  local c up=0
+  for c in "${backend_units[@]}"; do
+    if container_running "$c"; then
+      up=$(( up + 1 ))
+    fi
+  done
+  (( up > 0 )) || skip "No backends installed on this node"
+}
+
+@test "backends survive archipelago restart" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+
+  # Snapshot: which backends were up before we touched anything.
+  local c
+  local before=()
+  for c in "${backend_units[@]}"; do
+    if container_running "$c"; then
+      before+=("$c")
+    fi
+  done
+  (( ${#before[@]} > 0 )) || skip "No backends installed on this node"
+
+  # Capture the pre-restart identity of each backend so we can verify the
+  # SAME running instance survives. .Id catches "the orchestrator created
+  # a fresh container after the cascade SIGKILL'd the original"; a
+  # container that was killed and then started again keeps its .Id, so
+  # .State.StartedAt is recorded as well. Either change is an FM3 fail,
+  # even if the backend is back up minutes later.
+  declare -A pre_id pre_started
+  for c in "${before[@]}"; do
+    pre_id["$c"]=$(container_field "$c" '{{.Id}}')
+    pre_started["$c"]=$(container_field "$c" '{{.State.StartedAt}}')
+  done
+
+  # Bounce archipelago. Same approach as companion-survives-* for parity.
+  if systemctl --user list-units --no-legend archipelago.service | grep -q archipelago; then
+    systemctl --user restart archipelago.service
+  else
+    sudo systemctl restart archipelago.service
+  fi
+
+  run wait_archipelago_back 60
+  [ "$status" -eq 0 ] || fail "archipelago did not report healthy within 60s of restart"
+
+  # Every backend that was up before must still be up after, AND it must
+  # be the SAME instance: same .Id (not recreated) and same
+  # .State.StartedAt (not restarted). A difference in either means the
+  # original was killed — that's the FM3 failure we're catching.
+  local post_id post_started
+  for c in "${before[@]}"; do
+    run container_running "$c"
+    [ "$status" -eq 0 ] || fail "backend $c died across archipelago restart (FM3 cgroup cascade)"
+
+    post_id=$(container_field "$c" '{{.Id}}')
+    [[ -n "$post_id" ]] || fail "backend $c has no container id after restart"
+    [[ "$post_id" == "${pre_id[$c]}" ]] \
+      || fail "backend $c was recreated across archipelago restart (FM3): pre=${pre_id[$c]:0:12} post=${post_id:0:12}"
+
+    post_started=$(container_field "$c" '{{.State.StartedAt}}')
+    [[ "$post_started" == "${pre_started[$c]}" ]] \
+      || fail "backend $c was restarted across archipelago restart (FM3): started pre=${pre_started[$c]} post=${post_started}"
+  done
+}