From 4f503df6f182f1ed223ccc7a44bd4609c9e5e74e Mon Sep 17 00:00:00 2001
From: archipelago
Date: Fri, 1 May 2026 16:32:59 -0400
Subject: [PATCH] test(bootstrap): regression gate for the heal_podman_state socket bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extracted the heal_podman_state cleanup list as a module-level
HEAL_RUNTIME_SUBDIRS const so a unit test can structurally enforce
the invariant: the list must contain "containers" + "libpod" but must
NOT contain "podman" (which holds systemd's podman.sock listener and
was the bug fixed in commit bb421803).

If anyone re-adds "podman" — accidentally, by reverting, or by
copy-paste from old plan memory — this test fires before we ship,
not on the next deploy when it nukes the orchestrator's HTTP path.

Total tests: 614 → 615.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 core/archipelago/src/bootstrap.rs | 38 ++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/core/archipelago/src/bootstrap.rs b/core/archipelago/src/bootstrap.rs
index 5666644d..685b62a6 100644
--- a/core/archipelago/src/bootstrap.rs
+++ b/core/archipelago/src/bootstrap.rs
@@ -122,15 +122,23 @@ enum PodmanHealOutcome {
 /// - Networking corruption (netavark cache). Currently `podman info`
 /// doesn't diagnose that; if cleanup doesn't fix it, the operator
 /// will see the warning in the journal.
+/// Subdirectories of `$XDG_RUNTIME_DIR` that hold podman's transient
+/// state and are safe to remove when `podman info` is wedged. The
+/// `podman/` subdir is **deliberately absent** — that's where systemd's
+/// socket-activated `podman.sock` listener lives. Removing it would
+/// silently break every libpod HTTP call from the orchestrator until
+/// `systemctl --user restart podman.socket`. See
+/// `heal_podman_state` docstring for the full rationale and the
+/// `heal_podman_state_does_not_clean_socket_dir` regression test.
+const HEAL_RUNTIME_SUBDIRS: &[&str] = &["containers", "libpod"];
+
 async fn heal_podman_state() -> Result {
     if probe_podman_ok().await {
         return Ok(PodmanHealOutcome::Healthy);
     }
-    // Wedged. Clean runtime state and try again. Note: `podman/` is
-    // intentionally absent from this list — see fn docstring.
     let xdg = std::env::var("XDG_RUNTIME_DIR")
         .context("XDG_RUNTIME_DIR not set; can't locate podman runtime state to clean")?;
-    for sub in &["containers", "libpod"] {
+    for sub in HEAL_RUNTIME_SUBDIRS {
         let path = PathBuf::from(&xdg).join(sub);
         match fs::remove_dir_all(&path).await {
             Ok(()) => debug!(path = %path.display(), "removed podman runtime state dir"),
@@ -602,3 +610,27 @@ async fn run_nginx() -> Result {
     let _ = host_sudo(&["rm", "-f", &backup]).await;
     Ok(true)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Regression gate for the 2026-05-01 bootstrap bug: heal_podman_state
+    /// was removing $XDG_RUNTIME_DIR/podman/ alongside containers/ and
+    /// libpod/, which silently broke the systemd-bound podman.sock and
+    /// every libpod HTTP call from the orchestrator. If anyone re-adds
+    /// "podman" to HEAL_RUNTIME_SUBDIRS this test fires before we ship.
+    #[test]
+    fn heal_podman_state_does_not_clean_socket_dir() {
+        assert!(
+            !HEAL_RUNTIME_SUBDIRS.contains(&"podman"),
+            "HEAL_RUNTIME_SUBDIRS must not include 'podman' — that dir holds \
+             systemd's podman.sock listener; removing it breaks every libpod \
+             HTTP call from the orchestrator. See bootstrap.rs commit bb421803."
+        );
+        // Sanity: the actually-runtime-state dirs are still in the list so
+        // we don't accidentally turn the heal into a no-op.
+        assert!(HEAL_RUNTIME_SUBDIRS.contains(&"containers"));
+        assert!(HEAL_RUNTIME_SUBDIRS.contains(&"libpod"));
+    }
+}