From 44f275eda43c240eda288de00a2e760f54383bb4 Mon Sep 17 00:00:00 2001 From: archipelago Date: Sat, 2 May 2026 07:14:48 -0400 Subject: [PATCH] fix(quadlet): TimeoutStartSec=600 when Notify=healthy is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug surfaced live on .228 2026-05-02 — every backend Quadlet unit (lnd, electrumx, fedimint, btcpay-server, mempool-api, bitcoin-knots) hit systemd's default 90s start timeout because Notify=healthy makes systemctl wait for the first green health probe, but HealthInterval=30s × HealthRetries=3 = 90s minimum even on a healthy service. Race: timeout fires the moment the third probe MIGHT succeed. Result was three different post-states (inactive+running, failed+missing, inactive+stopped) depending on whether systemd's ExecStopPost ran podman rm before the orchestrator's adoption logic re-grabbed the container. Fix: when health is set, render TimeoutStartSec=600 (10 minutes) into [Service]. Long enough for slow-starting backends (electrumx index replay, lnd wallet unlock) without being so long that a truly stuck unit hangs forever. Companions stay unchanged (no health → no override, default 90s applies). Co-Authored-By: Claude Opus 4.7 (1M context) --- core/archipelago/src/container/quadlet.rs | 25 ++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/core/archipelago/src/container/quadlet.rs b/core/archipelago/src/container/quadlet.rs index ecd5b07a..1ec54d56 100644 --- a/core/archipelago/src/container/quadlet.rs +++ b/core/archipelago/src/container/quadlet.rs @@ -257,6 +257,20 @@ impl QuadletUnit { // OnFailure (clean stops stay stopped). let _ = writeln!(s, "Restart={}", self.restart_policy.as_systemd()); let _ = writeln!(s, "RestartSec=10"); + if self.health.is_some() { + // Notify=healthy makes systemd block the unit's "started" + // state on the first green health probe. systemd's default + // TimeoutStartSec is 90s — but `HealthInterval=30s` × + // `HealthRetries=3` is itself 90s, so the timeout fires the + // moment the third probe MIGHT succeed. On .228 every backend + // (lnd, electrumx, fedimint, btcpay-server, mempool-api, + // bitcoin-knots) timed out at 90s and systemd terminated the + // container while it was still warming up. Bump to 600s — long + // enough for slow-starting backends (electrumx replays its + // index, lnd unlocks its wallet) without being so long that a + // truly stuck unit hangs forever. + let _ = writeln!(s, "TimeoutStartSec=600"); + } let _ = writeln!(s); let _ = writeln!(s, "[Install]"); let _ = writeln!(s, "WantedBy=default.target"); @@ -876,17 +890,22 @@ app: assert!(s.contains("HealthTimeout=5s")); assert!(s.contains("HealthRetries=3")); assert!(s.contains("Notify=healthy")); + // Notify=healthy needs a long-enough TimeoutStartSec or systemd + // kills the unit before the first probe can pass — observed live + // on .228 2026-05-02 across all six backends. + assert!(s.contains("TimeoutStartSec=600"), "got: {s}"); } #[test] fn render_skips_health_directives_when_absent() { - // No health spec → no Notify=healthy and no HealthCmd, so companion - // units (which never set health) keep their existing behavior: the - // unit is "started" the moment the process spawns. + // No health spec → no Notify=healthy, no HealthCmd, no + // TimeoutStartSec override (default 90s applies). Companions rely + // on this so their rendered bytes stay unchanged. let s = sample_unit().render(); assert!(!s.contains("HealthCmd=")); assert!(!s.contains("Notify=healthy")); assert!(!s.contains("HealthRetries=")); + assert!(!s.contains("TimeoutStartSec=")); } #[test]