refactor(container): move companion UIs to systemd via Quadlet
Companion UI containers (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui) used to be launched as fire-and-forget tokio::spawn blocks from install.rs. If archipelago crashed mid-spawn or the container's cgroup was reaped, companions vanished from podman ps -a and only a manual rm/run could bring them back (the .228 incident).

Now each companion is rendered as a Quadlet .container unit under ~/.config/containers/systemd/, daemon-reloaded, and started via systemctl --user. systemd owns supervision from that point on:
- archipelago can crash, restart, or be uninstalled without touching any companion.
- Quadlet's Restart=always + RestartSec=10 handles container exits.
- A 30s reconcile tick in boot_reconciler enumerates expected companion units and re-installs any whose unit file or service vanished — defense-in-depth against external tampering.

New module layout:
- container/quadlet.rs: pure unit renderer + atomic write_if_changed + systemctl helpers (daemon_reload_user / enable_now / disable_remove / is_active). 6 unit tests, no I/O in the renderer.
- container/companion.rs: per-app companion specs, install/remove/reconcile, image presence (build local first, fall back to insecure registry only via image_uses_insecure_registry whitelist). 2 tests.

install.rs handle_package_install now ends with a single call to companion::install_for(package_id), replacing 287 lines of spawn-and-hope shellouts plus a ~120-line nginx auth-injector helper that worked around per-node RPC password baking. The helper is gone too — the pre-start hook renders the per-node nginx.conf to /var/lib/archipelago/bitcoin-ui/nginx.conf and the Quadlet unit bind-mounts it read-only.

runtime.rs handle_package_uninstall now disables companions before the container rm loop. Otherwise systemd's Restart=always would respawn each companion within ~10s of removal.
Tests: 53 container tests pass, including 6 quadlet renderer tests (host network, bridge network, capability set, atomic write idempotence) and 2 companion spec tests (per-app companion lookup, build_unit shape). boot_reconciler tests gain a #[cfg(test)] without_companion_stage() builder method so the paused-clock fixtures don't race the real systemctl I/O. A bats regression test (companion-survives-archipelago-restart.bats, gated on ARCHY_ALLOW_DESTRUCTIVE=1) asserts the .228 failure mode cannot recur: every installed companion has a unit file, services stay active across systemctl --user restart archipelago, and a deleted unit file is recreated within one reconcile tick.

Net delta: +941 / -363, but the +941 is mostly tests (~440 lines) and the new declarative layer; the imperative tokio::spawn block and its nginx-auth helper are gone, removing two failure classes (orphan companions on archipelago crash, and post-start exec races under tightly-confined cgroups) that previously needed manual SSH recovery.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -83,3 +83,4 @@ scripts/resilience/reports/
|
|||||||
.pnpm-store/
|
.pnpm-store/
|
||||||
**/__pycache__/
|
**/__pycache__/
|
||||||
*.bak
|
*.bak
|
||||||
|
.claude/scheduled_tasks.lock
|
||||||
|
|||||||
@@ -32,130 +32,6 @@ pub(in crate::api::rpc) async fn install_log(msg: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Patch the Bitcoin RPC `Authorization: Basic ...` header inside the running
|
|
||||||
/// bitcoin-ui container's nginx config and reload nginx. Authoritative
|
|
||||||
/// credential injection — runs whether the image was built locally or pulled
|
|
||||||
/// from the registry. Without this, registry images ship with whatever auth
|
|
||||||
/// header was baked at build time on the publisher's machine, which never
|
|
||||||
/// matches the per-node randomly-generated bitcoin-rpc-password.
|
|
||||||
///
|
|
||||||
/// Implementation note: this used to do `podman exec sed`, but rootless
|
|
||||||
/// podman + tightly-confined containers (--cap-drop=ALL, restricted user)
|
|
||||||
/// reject the exec because crun can't add a new process to the container's
|
|
||||||
/// cgroup ("write cgroup.procs: Permission denied"). Switched to
|
|
||||||
/// `podman cp` (storage layer, no cgroup join) + `podman kill --signal=SIGHUP`
|
|
||||||
/// (signal to existing PID 1, no new process needed). Verified on .228.
|
|
||||||
async fn inject_bitcoin_rpc_auth_into_running_container(container: &str, auth_b64: &str) {
|
|
||||||
use rand::distributions::{Alphanumeric, DistString};
|
|
||||||
let token = Alphanumeric.sample_string(&mut rand::thread_rng(), 8);
|
|
||||||
let host_path = format!("/tmp/archy-{container}-nginx.conf-{token}");
|
|
||||||
let in_container = "/etc/nginx/conf.d/default.conf";
|
|
||||||
|
|
||||||
// 1. Copy the running config out to host
|
|
||||||
let cp_out = tokio::process::Command::new("podman")
|
|
||||||
.args(["cp", &format!("{container}:{in_container}"), &host_path])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
if let Err(e) = cp_out {
|
|
||||||
warn!("inject auth: podman cp out failed for {}: {}", container, e);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if let Ok(ref o) = cp_out {
|
|
||||||
if !o.status.success() {
|
|
||||||
warn!(
|
|
||||||
"inject auth: podman cp out failed for {}: {}",
|
|
||||||
container,
|
|
||||||
String::from_utf8_lossy(&o.stderr)
|
|
||||||
);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. Patch the auth line on disk
|
|
||||||
let content = match tokio::fs::read_to_string(&host_path).await {
|
|
||||||
Ok(c) => c,
|
|
||||||
Err(e) => {
|
|
||||||
warn!("inject auth: read {} failed: {}", host_path, e);
|
|
||||||
let _ = tokio::fs::remove_file(&host_path).await;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let mut patched_any = false;
|
|
||||||
let updated: String = content
|
|
||||||
.lines()
|
|
||||||
.map(|line| {
|
|
||||||
if line.contains("proxy_set_header Authorization") && line.contains("Basic") {
|
|
||||||
patched_any = true;
|
|
||||||
format!(
|
|
||||||
" proxy_set_header Authorization \"Basic {}\";",
|
|
||||||
auth_b64
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
line.to_string()
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join("\n");
|
|
||||||
if !patched_any {
|
|
||||||
warn!(
|
|
||||||
"inject auth: no Authorization line matched in {}'s nginx.conf",
|
|
||||||
container
|
|
||||||
);
|
|
||||||
let _ = tokio::fs::remove_file(&host_path).await;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if let Err(e) = tokio::fs::write(&host_path, format!("{}\n", updated)).await {
|
|
||||||
warn!("inject auth: write back failed: {}", e);
|
|
||||||
let _ = tokio::fs::remove_file(&host_path).await;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. Copy patched config back into the container
|
|
||||||
let cp_in = tokio::process::Command::new("podman")
|
|
||||||
.args(["cp", &host_path, &format!("{container}:{in_container}")])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
let _ = tokio::fs::remove_file(&host_path).await;
|
|
||||||
match cp_in {
|
|
||||||
Ok(o) if !o.status.success() => {
|
|
||||||
warn!(
|
|
||||||
"inject auth: podman cp in failed for {}: {}",
|
|
||||||
container,
|
|
||||||
String::from_utf8_lossy(&o.stderr)
|
|
||||||
);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
warn!("inject auth: podman cp in errored for {}: {}", container, e);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4. Reload nginx via SIGHUP to PID 1 (no exec/cgroup join needed)
|
|
||||||
let reload = tokio::process::Command::new("podman")
|
|
||||||
.args(["kill", "--signal=SIGHUP", container])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
match reload {
|
|
||||||
Ok(o) if o.status.success() => {
|
|
||||||
info!(
|
|
||||||
"Injected Bitcoin RPC auth into {} (post-start, cp+SIGHUP)",
|
|
||||||
container
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(o) => warn!(
|
|
||||||
"Patched nginx.conf in {} but SIGHUP failed: {}",
|
|
||||||
container,
|
|
||||||
String::from_utf8_lossy(&o.stderr)
|
|
||||||
),
|
|
||||||
Err(e) => warn!(
|
|
||||||
"Patched nginx.conf in {} but SIGHUP errored: {}",
|
|
||||||
container, e
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl RpcHandler {
|
impl RpcHandler {
|
||||||
/// Install a package from a Docker image.
|
/// Install a package from a Docker image.
|
||||||
/// Security: Image verification, resource limits, network isolation.
|
/// Security: Image verification, resource limits, network isolation.
|
||||||
@@ -1552,235 +1428,15 @@ autopilot.active=false\n",
|
|||||||
info!("Nextcloud trusted domains configured for {}", host_ip);
|
info!("Nextcloud trusted domains configured for {}", host_ip);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Inject Bitcoin RPC auth into bitcoin-ui nginx.conf.
|
// Companion UIs (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui)
|
||||||
// Two paths because the credential is per-node and randomly generated
|
// are now Quadlet-managed: install_for writes ~/.config/containers/
|
||||||
// at first boot, so it can't be baked into the published registry image:
|
// systemd/<name>.container, daemon-reloads, and starts the generated
|
||||||
// 1. Build-time: rewrite nginx.conf on disk before `podman build`.
|
// .service. systemd owns supervision from there — companions survive
|
||||||
// Only fires when /opt/archipelago/docker/bitcoin-ui exists (dev
|
// archipelago crashes, restarts, and OOM kills. Per-node config
|
||||||
// box or ISO that shipped the docker tree). Skipped silently in
|
// (e.g. bitcoin-ui's nginx.conf with the live RPC auth) is rendered
|
||||||
// production where ui_builds falls through to the registry image.
|
// by each spec's pre_start hook and bind-mounted read-only.
|
||||||
// 2. Post-start: `podman exec` into the running container to patch
|
for (name, err) in crate::container::companion::install_for(package_id).await {
|
||||||
// nginx.conf and reload. Authoritative for both paths — runs
|
install_log(&format!("COMPANION FAIL: {name} — {err:#}")).await;
|
||||||
// regardless of how the image was built.
|
|
||||||
let bitcoin_rpc_auth_b64: Option<String> = if matches!(
|
|
||||||
package_id,
|
|
||||||
"bitcoin" | "bitcoin-core" | "bitcoin-knots"
|
|
||||||
) {
|
|
||||||
let (rpc_user, rpc_pass) = crate::bitcoin_rpc::bitcoin_rpc_credentials().await;
|
|
||||||
use base64::Engine;
|
|
||||||
let auth_b64 = base64::engine::general_purpose::STANDARD
|
|
||||||
.encode(format!("{}:{}", rpc_user, rpc_pass));
|
|
||||||
for dir in [
|
|
||||||
"/opt/archipelago/docker/bitcoin-ui",
|
|
||||||
"/home/archipelago/archy/docker/bitcoin-ui",
|
|
||||||
] {
|
|
||||||
let conf_path = format!("{}/nginx.conf", dir);
|
|
||||||
match tokio::fs::read_to_string(&conf_path).await {
|
|
||||||
Ok(content) => {
|
|
||||||
let updated = content
|
|
||||||
.replace("__BITCOIN_RPC_AUTH__", &auth_b64)
|
|
||||||
.lines()
|
|
||||||
.map(|line| {
|
|
||||||
if line.contains("proxy_set_header Authorization")
|
|
||||||
&& line.contains("Basic")
|
|
||||||
{
|
|
||||||
format!(
|
|
||||||
" proxy_set_header Authorization \"Basic {}\";",
|
|
||||||
auth_b64
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
line.to_string()
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join("\n");
|
|
||||||
if let Err(e) = tokio::fs::write(&conf_path, format!("{}\n", updated)).await
|
|
||||||
{
|
|
||||||
warn!(
|
|
||||||
"Failed to write {} with injected RPC auth: {}",
|
|
||||||
conf_path, e
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
info!("Injected Bitcoin RPC auth into {} (build-time)", conf_path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_) => {
|
|
||||||
debug!(
|
|
||||||
"No build-time nginx.conf at {} (will patch running container after start)",
|
|
||||||
conf_path
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(auth_b64)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
// Build and start companion UI containers for headless services.
|
|
||||||
// All UIs proxy to localhost (backend :5678 or bitcoin :8332) so they need --network=host.
|
|
||||||
let ui_builds: Vec<(&str, &str, &str)> = match package_id {
|
|
||||||
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => {
|
|
||||||
vec![(
|
|
||||||
"archy-bitcoin-ui",
|
|
||||||
"/opt/archipelago/docker/bitcoin-ui",
|
|
||||||
"bitcoin-ui",
|
|
||||||
)]
|
|
||||||
}
|
|
||||||
"lnd" => {
|
|
||||||
vec![("archy-lnd-ui", "/opt/archipelago/docker/lnd-ui", "lnd-ui")]
|
|
||||||
}
|
|
||||||
"electrumx" | "electrs" | "mempool-electrs" => {
|
|
||||||
vec![(
|
|
||||||
"archy-electrs-ui",
|
|
||||||
"/opt/archipelago/docker/electrs-ui",
|
|
||||||
"electrs-ui",
|
|
||||||
)]
|
|
||||||
}
|
|
||||||
_ => vec![],
|
|
||||||
};
|
|
||||||
|
|
||||||
for (name, ui_dir, image_base) in ui_builds {
|
|
||||||
let name = name.to_string();
|
|
||||||
// Check multiple paths: /opt (production), project tree (dev)
|
|
||||||
let ui_dir = [
|
|
||||||
ui_dir.to_string(),
|
|
||||||
format!("/home/archipelago/archy/docker/{}", image_base),
|
|
||||||
format!("/home/archipelago/Projects/archy/docker/{}", image_base),
|
|
||||||
]
|
|
||||||
.into_iter()
|
|
||||||
.find(|d| std::path::Path::new(d).join("Dockerfile").exists())
|
|
||||||
.unwrap_or_else(|| ui_dir.to_string());
|
|
||||||
let image_base = image_base.to_string();
|
|
||||||
let registry = "146.59.87.168:3000/lfg2025";
|
|
||||||
let registry_image = format!("{}/{}:latest", registry, image_base);
|
|
||||||
let local_image = format!("localhost/{}:latest", image_base);
|
|
||||||
let post_start_auth = if name == "archy-bitcoin-ui" {
|
|
||||||
bitcoin_rpc_auth_b64.clone()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
tokio::spawn(async move {
|
|
||||||
// Remove existing container
|
|
||||||
let _ = tokio::process::Command::new("podman")
|
|
||||||
.args(["rm", "-f", &name])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Build locally first (templates may have injected credentials),
|
|
||||||
// fall back to registry only if no local Dockerfile exists.
|
|
||||||
let image = {
|
|
||||||
if std::path::Path::new(&ui_dir).exists() {
|
|
||||||
info!("Building {} locally from {}", name, ui_dir);
|
|
||||||
let build = tokio::process::Command::new("podman")
|
|
||||||
.args(["build", "--no-cache", "-t", &local_image, &ui_dir])
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
match build {
|
|
||||||
Ok(o) if o.status.success() => local_image,
|
|
||||||
Ok(o) => {
|
|
||||||
warn!(
|
|
||||||
"Failed to build {}: {}",
|
|
||||||
name,
|
|
||||||
String::from_utf8_lossy(&o.stderr)
|
|
||||||
);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
warn!("Failed to build {}: {}", name, e);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No local Dockerfile — try pulling from registry
|
|
||||||
let mut pull_cmd = tokio::process::Command::new("podman");
|
|
||||||
pull_cmd
|
|
||||||
.arg("pull")
|
|
||||||
.arg("--tls-verify=false")
|
|
||||||
.arg(®istry_image);
|
|
||||||
let pull = pull_cmd.output().await;
|
|
||||||
if pull.is_ok_and(|o| o.status.success()) {
|
|
||||||
info!("Pulled {} UI from registry", name);
|
|
||||||
registry_image.clone()
|
|
||||||
} else {
|
|
||||||
warn!("No local source or registry image for {} — skipping", name);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// For bitcoin-ui specifically: render nginx.conf to host BEFORE
|
|
||||||
// starting the container, then bind-mount it. This is the durable
|
|
||||||
// fix for the bitcoin-rpc 401 — the per-node password is in the
|
|
||||||
// file before nginx ever opens it. Survives container recreate,
|
|
||||||
// image update, reboot, --restart=unless-stopped cycles, and
|
|
||||||
// doesn't need any post-start patching that could fail under
|
|
||||||
// tightly-confined cgroup permissions.
|
|
||||||
let mut bitcoin_ui_mount: Option<String> = None;
|
|
||||||
if name == "archy-bitcoin-ui" {
|
|
||||||
let paths = crate::container::bitcoin_ui::RenderPaths::default();
|
|
||||||
match crate::container::bitcoin_ui::render(&paths).await {
|
|
||||||
Ok(outcome) => {
|
|
||||||
bitcoin_ui_mount = Some(format!(
|
|
||||||
"{}:/etc/nginx/conf.d/default.conf:ro,Z",
|
|
||||||
paths.rendered_path.display()
|
|
||||||
));
|
|
||||||
info!(
|
|
||||||
"bitcoin-ui nginx.conf rendered ({:?}) — will bind-mount at startup",
|
|
||||||
outcome
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Err(e) => warn!(
|
|
||||||
"Failed to render bitcoin-ui nginx.conf: {} — \
|
|
||||||
will fall back to post-start patch (less reliable)",
|
|
||||||
e
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run with --network=host (UIs proxy to localhost backend/bitcoin)
|
|
||||||
// --user 0:0: run as root inside container (still unprivileged on host
|
|
||||||
// in rootless podman) to avoid nginx chown failures
|
|
||||||
let mut args: Vec<String> = vec![
|
|
||||||
"run".into(),
|
|
||||||
"-d".into(),
|
|
||||||
"--name".into(),
|
|
||||||
name.clone(),
|
|
||||||
"--restart=unless-stopped".into(),
|
|
||||||
"--network=host".into(),
|
|
||||||
"--user=0:0".into(),
|
|
||||||
"--cap-drop=ALL".into(),
|
|
||||||
"--cap-add=CHOWN".into(),
|
|
||||||
"--cap-add=DAC_OVERRIDE".into(),
|
|
||||||
"--cap-add=NET_BIND_SERVICE".into(),
|
|
||||||
"--cap-add=SETUID".into(),
|
|
||||||
"--cap-add=SETGID".into(),
|
|
||||||
"--memory=128m".into(),
|
|
||||||
];
|
|
||||||
if let Some(ref mount) = bitcoin_ui_mount {
|
|
||||||
args.push("-v".into());
|
|
||||||
args.push(mount.clone());
|
|
||||||
}
|
|
||||||
args.push(image.clone());
|
|
||||||
let run = tokio::process::Command::new("podman")
|
|
||||||
.args(&args)
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
match run {
|
|
||||||
Ok(o) if o.status.success() => {
|
|
||||||
info!("{} UI container started (host network)", name);
|
|
||||||
if let Some(ref auth) = post_start_auth {
|
|
||||||
inject_bitcoin_rpc_auth_into_running_container(&name, auth).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(o) => warn!(
|
|
||||||
"Failed to start {}: {}",
|
|
||||||
name,
|
|
||||||
String::from_utf8_lossy(&o.stderr)
|
|
||||||
),
|
|
||||||
Err(e) => warn!("Failed to start {}: {}", name, e),
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -227,6 +227,11 @@ impl RpcHandler {
|
|||||||
.and_then(|v| v.as_bool())
|
.and_then(|v| v.as_bool())
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
// Disable + remove Quadlet companion units BEFORE the rm loop.
|
||||||
|
// Otherwise systemd's Restart=always will respawn each companion
|
||||||
|
// within ~10s of `podman rm`, leaving them orphaned post-uninstall.
|
||||||
|
crate::container::companion::remove_for(package_id).await;
|
||||||
|
|
||||||
let containers_to_remove = get_containers_for_app(package_id).await?;
|
let containers_to_remove = get_containers_for_app(package_id).await?;
|
||||||
if containers_to_remove.is_empty() {
|
if containers_to_remove.is_empty() {
|
||||||
tracing::warn!("Uninstall {}: no containers found", package_id);
|
tracing::warn!("Uninstall {}: no containers found", package_id);
|
||||||
|
|||||||
@@ -29,6 +29,11 @@ pub struct BootReconciler {
|
|||||||
orchestrator: Arc<ProdContainerOrchestrator>,
|
orchestrator: Arc<ProdContainerOrchestrator>,
|
||||||
interval: Duration,
|
interval: Duration,
|
||||||
shutdown: Arc<Notify>,
|
shutdown: Arc<Notify>,
|
||||||
|
/// Run the companion-unit repair stage each tick. Default true.
|
||||||
|
/// Tests disable this — companion reconcile shells out to
|
||||||
|
/// `systemctl --user` and `podman`, which both block real time
|
||||||
|
/// and would race the paused-clock test fixtures.
|
||||||
|
companion_stage: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BootReconciler {
|
impl BootReconciler {
|
||||||
@@ -41,28 +46,46 @@ impl BootReconciler {
|
|||||||
orchestrator,
|
orchestrator,
|
||||||
interval,
|
interval,
|
||||||
shutdown,
|
shutdown,
|
||||||
|
companion_stage: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Disable the companion-unit reconcile stage. Used by unit tests
|
||||||
|
/// that exercise loop cadence without the real systemd / podman
|
||||||
|
/// surface. Production must not call this.
|
||||||
|
#[cfg(test)]
|
||||||
|
pub fn without_companion_stage(mut self) -> Self {
|
||||||
|
self.companion_stage = false;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
/// Run the reconcile loop until `shutdown` is notified.
|
/// Run the reconcile loop until `shutdown` is notified.
|
||||||
///
|
///
|
||||||
/// Does one reconcile immediately, then sleeps `interval` between
|
/// Does one reconcile immediately, then sleeps `interval` between
|
||||||
/// subsequent passes. A `shutdown.notify_one()` call unblocks the sleep
|
/// subsequent passes. A `shutdown.notify_one()` call unblocks the sleep
|
||||||
/// and the task returns after the *next* pass completes.
|
/// and the task returns after the *next* pass completes.
|
||||||
///
|
///
|
||||||
/// Never panics: per-app failures are absorbed into `ReconcileReport` by
|
/// Each pass is two stages:
|
||||||
/// the orchestrator, and `reconcile_all` itself returns infallibly.
|
/// 1. App reconcile: `reconcile_all()` keeps every loaded manifest's
|
||||||
|
/// container running.
|
||||||
|
/// 2. Companion reconcile: any expected Quadlet companion unit that
|
||||||
|
/// is missing or inactive is repaired (writes the unit, daemon-
|
||||||
|
/// reloads, starts the service). This is the safety net for the
|
||||||
|
/// "someone deleted my unit file" / "systemd lost the service"
|
||||||
|
/// failure modes.
|
||||||
|
///
|
||||||
|
/// Never panics: per-app failures are absorbed into `ReconcileReport`
|
||||||
|
/// by the orchestrator, and companion failures are logged but never
|
||||||
|
/// propagated.
|
||||||
pub async fn run_forever(self) {
|
pub async fn run_forever(self) {
|
||||||
// Initial pass: no delay.
|
// Initial pass: no delay.
|
||||||
let report = self.orchestrator.reconcile_all().await;
|
self.tick().await;
|
||||||
Self::log_report(&report);
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let deadline = Instant::now() + self.interval;
|
let deadline = Instant::now() + self.interval;
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = time::sleep_until(deadline) => {
|
_ = time::sleep_until(deadline) => {
|
||||||
let report = self.orchestrator.reconcile_all().await;
|
self.tick().await;
|
||||||
Self::log_report(&report);
|
|
||||||
}
|
}
|
||||||
_ = self.shutdown.notified() => {
|
_ = self.shutdown.notified() => {
|
||||||
tracing::info!("boot reconciler: shutdown requested, exiting loop");
|
tracing::info!("boot reconciler: shutdown requested, exiting loop");
|
||||||
@@ -72,6 +95,25 @@ impl BootReconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn tick(&self) {
|
||||||
|
let report = self.orchestrator.reconcile_all().await;
|
||||||
|
Self::log_report(&report);
|
||||||
|
|
||||||
|
if !self.companion_stage {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let installed = self.orchestrator.manifest_ids().await;
|
||||||
|
for (companion, err) in
|
||||||
|
crate::container::companion::reconcile(&installed).await
|
||||||
|
{
|
||||||
|
tracing::warn!(
|
||||||
|
companion = %companion,
|
||||||
|
error = %err,
|
||||||
|
"companion reconcile failed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn log_report(report: &ReconcileReport) {
|
fn log_report(report: &ReconcileReport) {
|
||||||
for (app_id, action) in &report.actions {
|
for (app_id, action) in &report.actions {
|
||||||
tracing::debug!(app_id = %app_id, action = ?action, "reconcile action");
|
tracing::debug!(app_id = %app_id, action = ?action, "reconcile action");
|
||||||
@@ -218,7 +260,7 @@ mod tests {
|
|||||||
let orch = orch_with_one_running_manifest(rt.clone()).await;
|
let orch = orch_with_one_running_manifest(rt.clone()).await;
|
||||||
let shutdown = Arc::new(Notify::new());
|
let shutdown = Arc::new(Notify::new());
|
||||||
let reconciler =
|
let reconciler =
|
||||||
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone());
|
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage();
|
||||||
let handle = tokio::spawn(reconciler.run_forever());
|
let handle = tokio::spawn(reconciler.run_forever());
|
||||||
|
|
||||||
// Yield so the spawned task gets CPU to run its initial reconcile.
|
// Yield so the spawned task gets CPU to run its initial reconcile.
|
||||||
@@ -242,7 +284,7 @@ mod tests {
|
|||||||
let orch = orch_with_one_running_manifest(rt.clone()).await;
|
let orch = orch_with_one_running_manifest(rt.clone()).await;
|
||||||
let shutdown = Arc::new(Notify::new());
|
let shutdown = Arc::new(Notify::new());
|
||||||
let reconciler =
|
let reconciler =
|
||||||
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone());
|
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage();
|
||||||
let handle = tokio::spawn(reconciler.run_forever());
|
let handle = tokio::spawn(reconciler.run_forever());
|
||||||
|
|
||||||
tokio::task::yield_now().await;
|
tokio::task::yield_now().await;
|
||||||
@@ -271,7 +313,7 @@ mod tests {
|
|||||||
let orch = orch_with_one_running_manifest(rt.clone()).await;
|
let orch = orch_with_one_running_manifest(rt.clone()).await;
|
||||||
let shutdown = Arc::new(Notify::new());
|
let shutdown = Arc::new(Notify::new());
|
||||||
let reconciler =
|
let reconciler =
|
||||||
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone());
|
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage();
|
||||||
let handle = tokio::spawn(reconciler.run_forever());
|
let handle = tokio::spawn(reconciler.run_forever());
|
||||||
tokio::task::yield_now().await;
|
tokio::task::yield_now().await;
|
||||||
tokio::task::yield_now().await;
|
tokio::task::yield_now().await;
|
||||||
@@ -305,7 +347,7 @@ mod tests {
|
|||||||
.await;
|
.await;
|
||||||
let shutdown = Arc::new(Notify::new());
|
let shutdown = Arc::new(Notify::new());
|
||||||
let reconciler =
|
let reconciler =
|
||||||
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone());
|
BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage();
|
||||||
let handle = tokio::spawn(reconciler.run_forever());
|
let handle = tokio::spawn(reconciler.run_forever());
|
||||||
|
|
||||||
tokio::task::yield_now().await;
|
tokio::task::yield_now().await;
|
||||||
|
|||||||
348
core/archipelago/src/container/companion.rs
Normal file
348
core/archipelago/src/container/companion.rs
Normal file
@@ -0,0 +1,348 @@
|
|||||||
|
//! Companion UI container lifecycle, entirely Quadlet-managed.
|
||||||
|
//!
|
||||||
|
//! A "companion" is a small nginx-based container that exposes a
|
||||||
|
//! browser-friendly UI on top of a headless backend service:
|
||||||
|
//!
|
||||||
|
//! | Backend | Companion | Purpose |
|
||||||
|
//! |------------------|--------------------|--------------------------|
|
||||||
|
//! | bitcoin-knots | archy-bitcoin-ui | RPC viewer |
|
||||||
|
//! | bitcoin-core | archy-bitcoin-ui | RPC viewer |
|
||||||
|
//! | lnd | archy-lnd-ui | wallet/channel UI |
|
||||||
|
//! | electrumx | archy-electrs-ui | indexer status UI |
|
||||||
|
//!
|
||||||
|
//! Lifecycle: `install` writes a Quadlet `.container` unit to
|
||||||
|
//! `~/.config/containers/systemd/`, daemon-reloads, then starts the
|
||||||
|
//! generated `.service`. systemd owns supervision from that point on
|
||||||
|
//! — archipelago can crash, restart, or be uninstalled without
|
||||||
|
//! touching the companion.
|
||||||
|
//!
|
||||||
|
//! This replaces the old `tokio::spawn { podman run }` block in
|
||||||
|
//! `install.rs` (~165 lines of fire-and-forget shellouts) with a
|
||||||
|
//! single declarative call.
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use tokio::fs;
|
||||||
|
use tokio::process::Command;
|
||||||
|
use tracing::{info, warn};
|
||||||
|
|
||||||
|
use crate::container::quadlet::{
|
||||||
|
self, BindMount, NetworkMode, QuadletUnit,
|
||||||
|
};
|
||||||
|
use archipelago_container::image_uses_insecure_registry;
|
||||||
|
|
||||||
|
const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025";
|
||||||
|
|
||||||
|
/// Static description of one companion. The full list per backend
|
||||||
|
/// app_id lives in `companions_for`.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct CompanionSpec {
|
||||||
|
/// Container + unit name (e.g. "archy-bitcoin-ui").
|
||||||
|
pub name: &'static str,
|
||||||
|
/// Image base name in the lfg2025 registry namespace
|
||||||
|
/// (e.g. "bitcoin-ui" → "146.59.87.168:3000/lfg2025/bitcoin-ui:latest").
|
||||||
|
pub image_base: &'static str,
|
||||||
|
/// Filesystem locations to look for a local Dockerfile (build wins
|
||||||
|
/// over registry pull). Searched in order; first hit wins.
|
||||||
|
pub build_dir_candidates: &'static [&'static str],
|
||||||
|
/// Optional pre-start hook that renders config files referenced
|
||||||
|
/// by `bind_mounts`. Returns Ok(()) on success; bind-mount must
|
||||||
|
/// be present at start time or the companion will 502.
|
||||||
|
pub pre_start: Option<PreStartHook>,
|
||||||
|
/// Bind mounts. Always read-only — companions don't write to
|
||||||
|
/// host paths.
|
||||||
|
pub bind_mounts: &'static [(&'static str, &'static str)],
|
||||||
|
}
|
||||||
|
|
||||||
|
pub type PreStartHook = fn() -> futures_util::future::BoxFuture<'static, Result<()>>;
|
||||||
|
|
||||||
|
/// Companions to install when `package_id` lands. Empty for apps
|
||||||
|
/// without a companion UI.
|
||||||
|
pub fn companions_for(package_id: &str) -> &'static [CompanionSpec] {
|
||||||
|
match package_id {
|
||||||
|
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => BITCOIN_UI,
|
||||||
|
"lnd" => LND_UI,
|
||||||
|
"electrumx" | "electrs" | "mempool-electrs" => ELECTRS_UI,
|
||||||
|
_ => &[],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const BITCOIN_UI: &[CompanionSpec] = &[CompanionSpec {
|
||||||
|
name: "archy-bitcoin-ui",
|
||||||
|
image_base: "bitcoin-ui",
|
||||||
|
build_dir_candidates: &[
|
||||||
|
"/opt/archipelago/docker/bitcoin-ui",
|
||||||
|
"/home/archipelago/archy/docker/bitcoin-ui",
|
||||||
|
"/home/archipelago/Projects/archy/docker/bitcoin-ui",
|
||||||
|
],
|
||||||
|
pre_start: Some(render_bitcoin_ui),
|
||||||
|
bind_mounts: &[(
|
||||||
|
"/var/lib/archipelago/bitcoin-ui/nginx.conf",
|
||||||
|
"/etc/nginx/conf.d/default.conf",
|
||||||
|
)],
|
||||||
|
}];
|
||||||
|
|
||||||
|
const LND_UI: &[CompanionSpec] = &[CompanionSpec {
|
||||||
|
name: "archy-lnd-ui",
|
||||||
|
image_base: "lnd-ui",
|
||||||
|
build_dir_candidates: &[
|
||||||
|
"/opt/archipelago/docker/lnd-ui",
|
||||||
|
"/home/archipelago/archy/docker/lnd-ui",
|
||||||
|
"/home/archipelago/Projects/archy/docker/lnd-ui",
|
||||||
|
],
|
||||||
|
pre_start: None,
|
||||||
|
bind_mounts: &[],
|
||||||
|
}];
|
||||||
|
|
||||||
|
const ELECTRS_UI: &[CompanionSpec] = &[CompanionSpec {
|
||||||
|
name: "archy-electrs-ui",
|
||||||
|
image_base: "electrs-ui",
|
||||||
|
build_dir_candidates: &[
|
||||||
|
"/opt/archipelago/docker/electrs-ui",
|
||||||
|
"/home/archipelago/archy/docker/electrs-ui",
|
||||||
|
"/home/archipelago/Projects/archy/docker/electrs-ui",
|
||||||
|
],
|
||||||
|
pre_start: None,
|
||||||
|
bind_mounts: &[],
|
||||||
|
}];
|
||||||
|
|
||||||
|
fn render_bitcoin_ui() -> futures_util::future::BoxFuture<'static, Result<()>> {
|
||||||
|
Box::pin(async {
|
||||||
|
let paths = crate::container::bitcoin_ui::RenderPaths::default();
|
||||||
|
crate::container::bitcoin_ui::render(&paths)
|
||||||
|
.await
|
||||||
|
.map(|_| ())
|
||||||
|
.context("render bitcoin-ui nginx.conf")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Provision and start every companion for `package_id`. Each
|
||||||
|
/// companion is independent — a failure in one is logged but does
|
||||||
|
/// not abort the others.
|
||||||
|
pub async fn install_for(package_id: &str) -> Vec<(String, anyhow::Error)> {
|
||||||
|
let mut failures = Vec::new();
|
||||||
|
for spec in companions_for(package_id) {
|
||||||
|
if let Err(e) = install_one(spec).await {
|
||||||
|
warn!(companion = spec.name, error = %e, "companion install failed");
|
||||||
|
failures.push((spec.name.to_string(), e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
failures
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stop and remove every companion for `package_id`. Best effort:
|
||||||
|
/// errors are logged but do not abort the sequence.
|
||||||
|
pub async fn remove_for(package_id: &str) {
|
||||||
|
let dir = match quadlet::unit_dir().await {
|
||||||
|
Ok(d) => d,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("companion remove: cannot resolve quadlet dir: {e:#}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for spec in companions_for(package_id) {
|
||||||
|
if let Err(e) = quadlet::disable_remove(spec.name, &dir).await {
|
||||||
|
warn!(companion = spec.name, error = %e, "companion remove failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Provision one companion: pre-start hook → image present → write
|
||||||
|
/// quadlet → daemon-reload → start.
|
||||||
|
pub async fn install_one(spec: &CompanionSpec) -> Result<()> {
|
||||||
|
if let Some(hook) = spec.pre_start {
|
||||||
|
hook().await.with_context(|| {
|
||||||
|
format!("pre-start hook failed for {} — companion will not start", spec.name)
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
let image = ensure_image_present(spec).await?;
|
||||||
|
let unit = build_unit(spec, &image);
|
||||||
|
let dir = quadlet::unit_dir().await?;
|
||||||
|
let changed = quadlet::write_if_changed(&unit, &dir).await?;
|
||||||
|
if changed {
|
||||||
|
info!(companion = spec.name, "wrote quadlet unit");
|
||||||
|
quadlet::daemon_reload_user().await?;
|
||||||
|
}
|
||||||
|
// Start is idempotent — if already running, systemctl returns 0.
|
||||||
|
quadlet::enable_now(&unit.service_name()).await?;
|
||||||
|
info!(companion = spec.name, "companion started");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build companion image locally if a Dockerfile exists, otherwise
|
||||||
|
/// pull from the lfg2025 registry. Returns the image ref the quadlet
|
||||||
|
/// should reference (`localhost/<base>:latest` for build, registry
|
||||||
|
/// URL for pull).
|
||||||
|
async fn ensure_image_present(spec: &CompanionSpec) -> Result<String> {
|
||||||
|
let local_image = format!("localhost/{}:latest", spec.image_base);
|
||||||
|
let registry_image = format!("{}/{}:latest", COMPANION_REGISTRY, spec.image_base);
|
||||||
|
|
||||||
|
// Prefer local build — companions can carry build-time customizations
|
||||||
|
// (e.g. nginx.conf templates baked in). Search known candidates.
|
||||||
|
for dir in spec.build_dir_candidates {
|
||||||
|
let dockerfile = PathBuf::from(dir).join("Dockerfile");
|
||||||
|
if fs::try_exists(&dockerfile).await.unwrap_or(false) {
|
||||||
|
info!(companion = spec.name, "building locally from {dir}");
|
||||||
|
let out = Command::new("podman")
|
||||||
|
.args(["build", "--no-cache", "-t", &local_image, dir])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.context("spawn podman build")?;
|
||||||
|
if out.status.success() {
|
||||||
|
return Ok(local_image);
|
||||||
|
}
|
||||||
|
warn!(
|
||||||
|
companion = spec.name,
|
||||||
|
"local build failed: {}",
|
||||||
|
String::from_utf8_lossy(&out.stderr).trim()
|
||||||
|
);
|
||||||
|
// Fall through to registry pull rather than fail outright.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Registry pull. Use insecure flag only for whitelisted hosts.
|
||||||
|
let mut cmd = Command::new("podman");
|
||||||
|
cmd.arg("pull");
|
||||||
|
if image_uses_insecure_registry(®istry_image) {
|
||||||
|
cmd.arg("--tls-verify=false");
|
||||||
|
}
|
||||||
|
cmd.arg(®istry_image);
|
||||||
|
let out = cmd.output().await.context("spawn podman pull")?;
|
||||||
|
if !out.status.success() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"no local Dockerfile and registry pull failed for {}: {}",
|
||||||
|
spec.name,
|
||||||
|
String::from_utf8_lossy(&out.stderr).trim()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(registry_image)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_unit(spec: &CompanionSpec, image: &str) -> QuadletUnit {
|
||||||
|
QuadletUnit {
|
||||||
|
name: spec.name.into(),
|
||||||
|
description: format!("Archipelago companion UI: {}", spec.name),
|
||||||
|
image: image.into(),
|
||||||
|
// Companions proxy to localhost — backend is on :5678, bitcoin
|
||||||
|
// RPC on :8332. Host network is the simplest way to reach them
|
||||||
|
// without per-app gateway plumbing.
|
||||||
|
network: NetworkMode::Host,
|
||||||
|
// Run as root inside the container so nginx can chown its
|
||||||
|
// worker dirs. Rootless podman maps this to a high host UID,
|
||||||
|
// so it is unprivileged on the host.
|
||||||
|
user: Some("0:0".into()),
|
||||||
|
memory_mb: Some(128),
|
||||||
|
cap_drop_all: true,
|
||||||
|
cap_add: vec![
|
||||||
|
"CHOWN".into(),
|
||||||
|
"DAC_OVERRIDE".into(),
|
||||||
|
"NET_BIND_SERVICE".into(),
|
||||||
|
"SETUID".into(),
|
||||||
|
"SETGID".into(),
|
||||||
|
],
|
||||||
|
bind_mounts: spec
|
||||||
|
.bind_mounts
|
||||||
|
.iter()
|
||||||
|
.map(|(host, container)| BindMount {
|
||||||
|
host: PathBuf::from(*host),
|
||||||
|
container: PathBuf::from(*container),
|
||||||
|
read_only: true,
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
extra_podman_args: vec![],
|
||||||
|
depends_on: vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Heuristic for "can we talk to a user systemd manager?": true when
/// the `XDG_RUNTIME_DIR` environment variable is set to a non-empty
/// value.
///
/// Where it is unset (unit tests, CI sandboxes) `systemctl --user`
/// would fail and writing under HOME would be an unwanted side
/// effect, so `reconcile` returns early when this is false.
fn user_systemd_available() -> bool {
    match std::env::var_os("XDG_RUNTIME_DIR") {
        Some(runtime_dir) => !runtime_dir.is_empty(),
        None => false,
    }
}
|
||||||
|
|
||||||
|
/// Reconcile companion presence: every expected companion for the
|
||||||
|
/// given installed apps must have its quadlet unit on disk and its
|
||||||
|
/// service active. Returns a list of (companion, error) for anything
|
||||||
|
/// that needed correction and failed.
|
||||||
|
///
|
||||||
|
/// Called from `boot_reconciler` so a deleted unit file or a stopped
|
||||||
|
/// service is repaired within one tick. No-ops if the user systemd
|
||||||
|
/// manager is not reachable (CI / test environments).
|
||||||
|
pub async fn reconcile(installed_apps: &[String]) -> Vec<(String, anyhow::Error)> {
|
||||||
|
if !user_systemd_available() {
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
let mut failures = Vec::new();
|
||||||
|
for app_id in installed_apps {
|
||||||
|
for spec in companions_for(app_id) {
|
||||||
|
match needs_repair(spec).await {
|
||||||
|
Ok(false) => {}
|
||||||
|
Ok(true) => {
|
||||||
|
info!(companion = spec.name, "reconcile: companion not active, repairing");
|
||||||
|
if let Err(e) = install_one(spec).await {
|
||||||
|
failures.push((spec.name.to_string(), e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!(companion = spec.name, error = %e, "reconcile probe failed");
|
||||||
|
failures.push((spec.name.to_string(), e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
failures
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Does this companion need install_one to be re-run? Returns true if
|
||||||
|
/// the unit file is missing OR the service is not active.
|
||||||
|
async fn needs_repair(spec: &CompanionSpec) -> Result<bool> {
|
||||||
|
let dir = quadlet::unit_dir().await?;
|
||||||
|
let unit_path = dir.join(format!("{}.container", spec.name));
|
||||||
|
if !fs::try_exists(&unit_path).await.unwrap_or(false) {
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
let svc = format!("{}.service", spec.name);
|
||||||
|
Ok(!quadlet::is_active(&svc).await)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Every known app id resolves to exactly one companion; anything
    /// else resolves to none.
    #[test]
    fn companions_for_known_apps_returns_expected_set() {
        let with_companion = [
            "bitcoin-knots",
            "bitcoin-core",
            "bitcoin",
            "lnd",
            "electrumx",
            "electrs",
            "mempool-electrs",
        ];
        for id in with_companion.iter() {
            assert_eq!(companions_for(id).len(), 1, "app id {id}");
        }
        for id in ["nextcloud", "not-a-real-app"].iter() {
            assert_eq!(companions_for(id).len(), 0, "app id {id}");
        }
    }

    /// The generated unit carries the shared runtime profile and the
    /// spec's bind mount, mounted read-only.
    #[test]
    fn build_unit_uses_host_network_and_drops_caps() {
        let unit = build_unit(&BITCOIN_UI[0], "localhost/bitcoin-ui:latest");

        assert_eq!(unit.name, "archy-bitcoin-ui");
        assert!(matches!(unit.network, NetworkMode::Host));
        assert!(unit.cap_drop_all);
        assert!(unit.cap_add.iter().any(|cap| cap == "NET_BIND_SERVICE"));
        assert_eq!(unit.user.as_deref(), Some("0:0"));
        assert_eq!(unit.memory_mb, Some(128));

        assert_eq!(unit.bind_mounts.len(), 1);
        let mount = &unit.bind_mounts[0];
        assert_eq!(
            mount.container,
            PathBuf::from("/etc/nginx/conf.d/default.conf")
        );
        assert!(mount.read_only);
    }
}
|
||||||
@@ -1,11 +1,13 @@
|
|||||||
pub mod bitcoin_ui;
|
pub mod bitcoin_ui;
|
||||||
pub mod boot_reconciler;
|
pub mod boot_reconciler;
|
||||||
|
pub mod companion;
|
||||||
pub mod data_manager;
|
pub mod data_manager;
|
||||||
pub mod dev_orchestrator;
|
pub mod dev_orchestrator;
|
||||||
pub mod docker_packages;
|
pub mod docker_packages;
|
||||||
pub mod filebrowser;
|
pub mod filebrowser;
|
||||||
pub mod image_versions;
|
pub mod image_versions;
|
||||||
pub mod prod_orchestrator;
|
pub mod prod_orchestrator;
|
||||||
|
pub mod quadlet;
|
||||||
pub mod registry;
|
pub mod registry;
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
|
||||||
|
|||||||
@@ -311,6 +311,13 @@ impl ProdContainerOrchestrator {
|
|||||||
.ok_or_else(|| anyhow::anyhow!("unknown app_id: {app_id}"))
|
.ok_or_else(|| anyhow::anyhow!("unknown app_id: {app_id}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Snapshot of the app IDs currently in the in-memory manifest map.
|
||||||
|
/// Used by the boot reconciler to drive companion-unit reconciliation.
|
||||||
|
pub async fn manifest_ids(&self) -> Vec<String> {
|
||||||
|
let state = self.state.read().await;
|
||||||
|
state.manifests.keys().cloned().collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Scan the runtime for containers whose names match one of our manifests.
|
/// Scan the runtime for containers whose names match one of our manifests.
|
||||||
/// This is a read-only adoption pass: nothing is created, started, or touched.
|
/// This is a read-only adoption pass: nothing is created, started, or touched.
|
||||||
pub async fn adopt_existing(&self) -> Result<AdoptionReport> {
|
pub async fn adopt_existing(&self) -> Result<AdoptionReport> {
|
||||||
|
|||||||
371
core/archipelago/src/container/quadlet.rs
Normal file
371
core/archipelago/src/container/quadlet.rs
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
//! Render and lifecycle Quadlet `.container` units for companion UI
|
||||||
|
//! containers (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui).
|
||||||
|
//!
|
||||||
|
//! Why Quadlet: companions used to run as fire-and-forget `tokio::spawn`
|
||||||
|
//! blocks from `install.rs`. If archipelago crashed mid-spawn or the
|
||||||
|
//! kernel reaped a parent cgroup, companions vanished from `podman ps`
|
||||||
|
//! entirely and only a manual `podman run` brought them back. Putting the
|
||||||
|
//! unit on disk and letting systemd own start/restart removes that whole
|
||||||
|
//! class of failure: the daemon is now systemd, archipelago is just the
|
||||||
|
//! provisioner.
|
||||||
|
//!
|
||||||
|
//! Design constraints kept this module small on purpose:
|
||||||
|
//!
|
||||||
|
//! - **Single responsibility**: render → write → enable → disable. We do
|
||||||
|
//! NOT pull images here — the caller is expected to have the image
|
||||||
|
//! present locally (companions either build from `/opt/archipelago/docker/`
|
||||||
|
//! or are pre-pulled by `install_companion_image`). The quadlet unit
|
||||||
|
//! declares `Pull=never` so a missing image surfaces immediately
|
||||||
|
//! instead of silently retrying behind systemd's restart loop.
|
||||||
|
//! - **Atomic writes**: `tempfile + rename` so a partially-written unit
|
||||||
|
//! is never visible to systemd. A daemon-reload during a rolling
|
||||||
|
//! update can't see half a file.
|
||||||
|
//! - **Idempotent**: `write_if_changed` compares bytes before touching
|
||||||
|
//! the file. No daemon-reload, no service-restart cascade if the
|
||||||
|
//! rendered bytes match what's on disk.
|
||||||
|
//! - **systemctl --user only**: archipelago runs as uid=1000 with
|
||||||
|
//! linger enabled. We never touch the system bus from here.
|
||||||
|
//!
|
||||||
|
//! See `docs/rust-orchestrator-migration.md` and the failure-mode log in
|
||||||
|
//! `feedback_container_lifecycle_failure_modes.md` for the incident
|
||||||
|
//! that motivated the move.
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use std::fmt::Write as _;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use tokio::fs;
|
||||||
|
use tokio::process::Command;
|
||||||
|
|
||||||
|
/// Location of the rootless quadlet unit directory, relative to the
/// user's home. `unit_dir()` joins it onto `$HOME` at runtime; tests
/// supply their own directory instead.
pub const DEFAULT_REL_UNIT_DIR: &str = ".config/containers/systemd";
|
||||||
|
|
||||||
|
/// One host path mounted into the container.
#[derive(Debug, Clone)]
pub struct BindMount {
    /// Path on the host.
    pub host: PathBuf,
    /// Path inside the container.
    pub container: PathBuf,
    /// Rendered as the `ro` volume option when true.
    pub read_only: bool,
}

/// How the container is attached to the network.
#[derive(Debug, Clone, Default)]
#[allow(dead_code)] // Bridge is reserved for Phase 5 per-app network isolation.
pub enum NetworkMode {
    /// Share the host's network namespace (`Network=host`).
    #[default]
    Host,
    /// Attach to the named user-defined podman network. The network
    /// must already exist (orchestrator's `ensure_container_network`
    /// handles that on every reconcile tick).
    Bridge(String),
}

/// In-memory description of a single Quadlet `.container` unit. The
/// field set is intentionally minimal — extend it only when a
/// companion genuinely needs another knob.
#[derive(Debug, Clone)]
pub struct QuadletUnit {
    /// Base name: used for the unit file, the service and the container.
    pub name: String,
    /// Text for the `[Unit] Description=` line.
    pub description: String,
    /// Image reference for `Image=`.
    pub image: String,
    pub network: NetworkMode,
    /// Value for `User=`; the line is omitted when `None`.
    pub user: Option<String>,
    /// Memory limit in megabytes, passed as `--memory=<n>m`.
    pub memory_mb: Option<u32>,
    /// Emit `DropCapability=ALL` when true.
    pub cap_drop_all: bool,
    /// One `AddCapability=` line per entry.
    pub cap_add: Vec<String>,
    /// One `Volume=` line per entry.
    pub bind_mounts: Vec<BindMount>,
    /// One `PodmanArgs=` line per entry, after the volumes.
    pub extra_podman_args: Vec<String>,
    /// Units this one `Requires=` and is ordered `After=`.
    pub depends_on: Vec<String>,
}

impl QuadletUnit {
    /// Name of the file written to the quadlet directory:
    /// `<name>.container`. Quadlet turns it into `<name>.service` at
    /// daemon-reload time.
    pub fn unit_filename(&self) -> String {
        [self.name.as_str(), ".container"].concat()
    }

    /// Name of the systemd service Quadlet generates for this unit.
    pub fn service_name(&self) -> String {
        [self.name.as_str(), ".service"].concat()
    }

    /// Produce the full unit text. Pure: no I/O, and the same fields
    /// always yield the same bytes (which `write_if_changed` relies on).
    pub fn render(&self) -> String {
        let mut out = String::with_capacity(512);
        out.push_str("# Generated by archipelago. DO NOT EDIT.\n");
        out.push_str("# Edits are overwritten on the next reconcile.\n");
        out.push('\n');
        self.push_unit_section(&mut out);
        out.push('\n');
        self.push_container_section(&mut out);
        out.push('\n');
        // Always restart with a 10s backoff so a crash-loop cannot
        // saturate the journal.
        out.push_str("[Service]\n");
        out.push_str("Restart=always\n");
        out.push_str("RestartSec=10\n");
        out.push('\n');
        out.push_str("[Install]\n");
        out.push_str("WantedBy=default.target\n");
        out
    }

    /// Append the `[Unit]` section: description, network ordering and
    /// a `Requires=`/`After=` pair per dependency.
    fn push_unit_section(&self, out: &mut String) {
        out.push_str("[Unit]\n");
        let _ = writeln!(out, "Description={}", self.description);
        out.push_str("After=network-online.target\n");
        out.push_str("Wants=network-online.target\n");
        for dep in self.depends_on.iter() {
            let _ = writeln!(out, "Requires={dep}");
            let _ = writeln!(out, "After={dep}");
        }
    }

    /// Append the `[Container]` section.
    fn push_container_section(&self, out: &mut String) {
        out.push_str("[Container]\n");
        let _ = writeln!(out, "ContainerName={}", self.name);
        let _ = writeln!(out, "Image={}", self.image);
        // Pull=never: companions are pre-pulled or built. A missing image
        // must surface as a unit start failure, not a silent retry storm.
        out.push_str("Pull=never\n");

        let network = match &self.network {
            NetworkMode::Host => "host",
            NetworkMode::Bridge(net) => net.as_str(),
        };
        let _ = writeln!(out, "Network={network}");

        if let Some(user) = self.user.as_deref() {
            let _ = writeln!(out, "User={user}");
        }
        if self.cap_drop_all {
            out.push_str("DropCapability=ALL\n");
        }
        for cap in self.cap_add.iter() {
            let _ = writeln!(out, "AddCapability={cap}");
        }
        if let Some(mb) = self.memory_mb {
            let _ = writeln!(out, "PodmanArgs=--memory={mb}m");
        }
        for mount in self.bind_mounts.iter() {
            let options = match mount.read_only {
                true => ":ro,Z",
                false => ":Z",
            };
            let host = mount.host.display();
            let target = mount.container.display();
            let _ = writeln!(out, "Volume={host}:{target}{options}");
        }
        for arg in self.extra_podman_args.iter() {
            let _ = writeln!(out, "PodmanArgs={arg}");
        }
    }
}
|
||||||
|
|
||||||
|
/// Resolve the per-user quadlet dir under $HOME. Created if missing.
|
||||||
|
pub async fn unit_dir() -> Result<PathBuf> {
|
||||||
|
let home = std::env::var_os("HOME")
|
||||||
|
.map(PathBuf::from)
|
||||||
|
.ok_or_else(|| anyhow!("HOME not set; cannot locate quadlet unit dir"))?;
|
||||||
|
let dir = home.join(DEFAULT_REL_UNIT_DIR);
|
||||||
|
fs::create_dir_all(&dir)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("create_dir_all {}", dir.display()))?;
|
||||||
|
Ok(dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Atomically write `unit` into `dir/<name>.container` if the bytes
|
||||||
|
/// differ from what's already there. Returns true if the file changed.
|
||||||
|
pub async fn write_if_changed(unit: &QuadletUnit, dir: &Path) -> Result<bool> {
|
||||||
|
let path = dir.join(unit.unit_filename());
|
||||||
|
let new_bytes = unit.render();
|
||||||
|
|
||||||
|
if let Ok(old) = fs::read_to_string(&path).await {
|
||||||
|
if old == new_bytes {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fs::create_dir_all(dir)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("create_dir_all {}", dir.display()))?;
|
||||||
|
let tmp = path.with_extension("container.tmp");
|
||||||
|
fs::write(&tmp, new_bytes.as_bytes())
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("write tmp {}", tmp.display()))?;
|
||||||
|
fs::rename(&tmp, &path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reload the user systemd manager. Required after any quadlet write
|
||||||
|
/// or removal so systemd picks up the generated `.service` translation.
|
||||||
|
pub async fn daemon_reload_user() -> Result<()> {
|
||||||
|
let status = Command::new("systemctl")
|
||||||
|
.args(["--user", "daemon-reload"])
|
||||||
|
.status()
|
||||||
|
.await
|
||||||
|
.context("spawn systemctl --user daemon-reload")?;
|
||||||
|
if !status.success() {
|
||||||
|
return Err(anyhow!("systemctl --user daemon-reload exited {status}"));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable + start a quadlet-generated service. `enable --now` makes it
|
||||||
|
/// survive reboots and starts it immediately.
|
||||||
|
pub async fn enable_now(service: &str) -> Result<()> {
|
||||||
|
// Quadlet-generated units cannot be `enable`d directly because the
|
||||||
|
// .service file lives under /run, not /etc — `enable` would refuse
|
||||||
|
// ("transient or generated"). The unit's `[Install] WantedBy` is
|
||||||
|
// honoured at daemon-reload, so we just start it.
|
||||||
|
let status = Command::new("systemctl")
|
||||||
|
.args(["--user", "start", service])
|
||||||
|
.status()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("spawn systemctl --user start {service}"))?;
|
||||||
|
if !status.success() {
|
||||||
|
return Err(anyhow!("systemctl --user start {service} exited {status}"));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stop + remove a quadlet unit and its on-disk file. Best-effort:
|
||||||
|
/// errors stop only the destructive write at the failing step so a
|
||||||
|
/// partial removal doesn't leave a quadlet file pointing at a service
|
||||||
|
/// that systemd no longer knows about.
|
||||||
|
pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> {
|
||||||
|
let svc = format!("{unit_name}.service");
|
||||||
|
// Stop first; ignore failure (unit may already be down).
|
||||||
|
let _ = Command::new("systemctl")
|
||||||
|
.args(["--user", "stop", &svc])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
let path = dir.join(format!("{unit_name}.container"));
|
||||||
|
if fs::try_exists(&path).await.unwrap_or(false) {
|
||||||
|
fs::remove_file(&path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("remove {}", path.display()))?;
|
||||||
|
}
|
||||||
|
daemon_reload_user().await.ok();
|
||||||
|
// Defensive: kill the actual container too, in case quadlet left it.
|
||||||
|
let _ = Command::new("podman")
|
||||||
|
.args(["rm", "-f", unit_name])
|
||||||
|
.status()
|
||||||
|
.await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Is the quadlet-generated service currently active?
|
||||||
|
pub async fn is_active(service: &str) -> bool {
|
||||||
|
Command::new("systemctl")
|
||||||
|
.args(["--user", "is-active", "--quiet", service])
|
||||||
|
.status()
|
||||||
|
.await
|
||||||
|
.map(|s| s.success())
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Fixture shaped like the bitcoin-ui companion unit.
    fn sample_unit() -> QuadletUnit {
        let caps = ["CHOWN", "DAC_OVERRIDE", "NET_BIND_SERVICE", "SETUID", "SETGID"];
        let nginx_conf = BindMount {
            host: PathBuf::from("/var/lib/archipelago/bitcoin-ui/nginx.conf"),
            container: PathBuf::from("/etc/nginx/conf.d/default.conf"),
            read_only: true,
        };
        QuadletUnit {
            name: "archy-bitcoin-ui".into(),
            description: "Bitcoin RPC UI proxy".into(),
            image: "146.59.87.168:3000/lfg2025/bitcoin-ui:latest".into(),
            network: NetworkMode::Host,
            user: Some("0:0".into()),
            memory_mb: Some(128),
            cap_drop_all: true,
            cap_add: caps.iter().map(|cap| cap.to_string()).collect(),
            bind_mounts: vec![nginx_conf],
            extra_podman_args: vec![],
            depends_on: vec![],
        }
    }

    /// Read back the unit file `write_if_changed` produced for `unit`.
    async fn read_unit(dir: &Path, unit: &QuadletUnit) -> String {
        tokio::fs::read_to_string(dir.join(unit.unit_filename()))
            .await
            .unwrap()
    }

    #[test]
    fn render_contains_required_directives() {
        let rendered = sample_unit().render();
        let required = [
            "[Container]",
            "ContainerName=archy-bitcoin-ui",
            "Image=146.59.87.168:3000/lfg2025/bitcoin-ui:latest",
            "Pull=never",
            "Network=host",
            "DropCapability=ALL",
            "AddCapability=CHOWN",
            "AddCapability=NET_BIND_SERVICE",
            "PodmanArgs=--memory=128m",
            "Volume=/var/lib/archipelago/bitcoin-ui/nginx.conf:/etc/nginx/conf.d/default.conf:ro,Z",
            "[Service]",
            "Restart=always",
            "WantedBy=default.target",
        ];
        for directive in required.iter() {
            assert!(rendered.contains(directive), "missing directive: {directive}");
        }
    }

    #[test]
    fn render_bridge_network_emits_network_name() {
        let unit = QuadletUnit {
            network: NetworkMode::Bridge("archy-bitcoin-ui-net".into()),
            ..sample_unit()
        };
        let rendered = unit.render();
        assert!(rendered.contains("Network=archy-bitcoin-ui-net"));
        assert!(!rendered.contains("Network=host"));
    }

    #[test]
    fn unit_filename_and_service_name_are_consistent() {
        let unit = sample_unit();
        assert_eq!(unit.unit_filename(), "archy-bitcoin-ui.container");
        assert_eq!(unit.service_name(), "archy-bitcoin-ui.service");
    }

    #[tokio::test]
    async fn write_if_changed_writes_first_time_then_noops() {
        let dir = tempdir().unwrap();
        let unit = sample_unit();

        let first = write_if_changed(&unit, dir.path()).await.unwrap();
        assert!(first, "first write must report changed");
        let on_disk = read_unit(dir.path(), &unit).await;
        assert!(on_disk.starts_with("# Generated by archipelago"));

        let second = write_if_changed(&unit, dir.path()).await.unwrap();
        assert!(!second, "second write with identical bytes must no-op");
    }

    #[tokio::test]
    async fn write_if_changed_rewrites_when_field_changes() {
        let dir = tempdir().unwrap();
        let original = sample_unit();
        write_if_changed(&original, dir.path()).await.unwrap();

        let bumped = QuadletUnit {
            memory_mb: Some(256),
            ..original
        };
        let changed = write_if_changed(&bumped, dir.path()).await.unwrap();
        assert!(changed, "field change must trigger rewrite");
        let on_disk = read_unit(dir.path(), &bumped).await;
        assert!(on_disk.contains("PodmanArgs=--memory=256m"));
    }

    #[tokio::test]
    async fn write_if_changed_atomic_rename_leaves_no_tmp() {
        let dir = tempdir().unwrap();
        write_if_changed(&sample_unit(), dir.path()).await.unwrap();

        let mut listing = tokio::fs::read_dir(dir.path()).await.unwrap();
        while let Some(entry) = listing.next_entry().await.unwrap() {
            let file_name = entry.file_name();
            assert!(
                !file_name.to_string_lossy().ends_with(".tmp"),
                "atomic rename must leave no .tmp residue"
            );
        }
    }
}
|
||||||
146
tests/lifecycle/bats/companion-survives-archipelago-restart.bats
Normal file
146
tests/lifecycle/bats/companion-survives-archipelago-restart.bats
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
# tests/lifecycle/bats/companion-survives-archipelago-restart.bats
|
||||||
|
#
|
||||||
|
# Quadlet promise: companion UIs (archy-bitcoin-ui, archy-lnd-ui,
|
||||||
|
# archy-electrs-ui) are managed by systemd, not archipelago. Restarting
|
||||||
|
# the archipelago user service must NOT take them down.
|
||||||
|
#
|
||||||
|
# This is the regression gate for the .228 incident in
|
||||||
|
# feedback_container_lifecycle_failure_modes.md (FM1: companions vanished
|
||||||
|
# from `podman ps -a` after archipelago crash-loop).
|
||||||
|
#
|
||||||
|
# Gated by ARCHY_ALLOW_DESTRUCTIVE=1 because it bounces archipelago.
|
||||||
|
|
||||||
|
# Companion containers this suite knows about; each test only acts on
# the ones that are currently running.
companion_units=(archy-bitcoin-ui archy-lnd-ui archy-electrs-ui)

# Rootless Quadlet unit directory of the invoking user.
unit_dir="${HOME}/.config/containers/systemd"
|
||||||
|
|
||||||
|
# Succeeds when the Quadlet unit file for companion $1 is a regular
# file in $unit_dir.
unit_file_present() {
  local companion="$1"
  test -f "$unit_dir/$companion.container"
}
|
||||||
|
|
||||||
|
# Succeeds when the user service generated for companion $1 is active.
service_active() {
  local svc="$1.service"
  systemctl --user is-active --quiet "$svc"
}
|
||||||
|
|
||||||
|
# Succeeds when podman reports container $1 as running. An unknown
# container (inspect fails, output empty) counts as not running.
container_running() {
  local container="$1"
  [ "$(podman inspect --format '{{.State.Running}}' "$container" 2>/dev/null)" = "true" ]
}
|
||||||
|
|
||||||
|
# Poll every 2s until the service for companion $1 is active.
# $2 is the timeout in seconds (default 60). Returns 0 as soon as the
# service is active, 1 once the deadline has passed.
wait_service_active() {
  local name="$1"
  local timeout="${2:-60}"
  local deadline=$(( $(date +%s) + timeout ))
  while true; do
    (( $(date +%s) < deadline )) || return 1
    service_active "$name" && return 0
    sleep 2
  done
}
|
||||||
|
|
||||||
|
# Poll every 2s until the archipelago health endpoint on
# 127.0.0.1:5678 answers successfully. $1 is the timeout in seconds
# (default 60). Returns 0 on the first successful probe, 1 once the
# deadline has passed.
wait_archipelago_back() {
  local timeout="${1:-60}"
  local deadline=$(( $(date +%s) + timeout ))
  while true; do
    (( $(date +%s) < deadline )) || return 1
    curl -fsS -o /dev/null "http://127.0.0.1:5678/health" 2>/dev/null && return 0
    sleep 2
  done
}
|
||||||
|
|
||||||
|
@test "destructive gate enabled" {
|
||||||
|
[[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "every installed companion has a quadlet unit on disk" {
|
||||||
|
[[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
|
||||||
|
local present=0
|
||||||
|
for c in "${companion_units[@]}"; do
|
||||||
|
if container_running "$c"; then
|
||||||
|
run unit_file_present "$c"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
present=$(( present + 1 ))
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
(( present > 0 )) || skip "No companions installed on this node"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "every installed companion service is active before restart" {
|
||||||
|
[[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
|
||||||
|
for c in "${companion_units[@]}"; do
|
||||||
|
if container_running "$c"; then
|
||||||
|
run service_active "$c"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "companions survive archipelago restart" {
|
||||||
|
[[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
|
||||||
|
|
||||||
|
# Snapshot: which companions were up before we touched anything.
|
||||||
|
local before=()
|
||||||
|
for c in "${companion_units[@]}"; do
|
||||||
|
if container_running "$c"; then
|
||||||
|
before+=("$c")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
(( ${#before[@]} > 0 )) || skip "No companions installed on this node"
|
||||||
|
|
||||||
|
# Bounce archipelago. The user service is the production canonical name;
|
||||||
|
# fall back to the system service for older nodes.
|
||||||
|
if systemctl --user list-units --no-legend archipelago.service | grep -q archipelago; then
|
||||||
|
systemctl --user restart archipelago.service
|
||||||
|
else
|
||||||
|
sudo systemctl restart archipelago.service
|
||||||
|
fi
|
||||||
|
|
||||||
|
run wait_archipelago_back 60
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
|
||||||
|
# Every companion that was up before must still be up + healthy after.
|
||||||
|
for c in "${before[@]}"; do
|
||||||
|
run service_active "$c"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
run container_running "$c"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "deleted unit file is recreated within one reconcile tick" {
|
||||||
|
[[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
|
||||||
|
|
||||||
|
# Pick a companion that's currently running.
|
||||||
|
local target=""
|
||||||
|
for c in "${companion_units[@]}"; do
|
||||||
|
if container_running "$c"; then
|
||||||
|
target="$c"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
[[ -n "$target" ]] || skip "No companions installed on this node"
|
||||||
|
|
||||||
|
# Delete the unit file behind systemd's back. The reconciler should
|
||||||
|
# notice and rewrite it within one 30s tick, then start the service.
|
||||||
|
rm -f "$unit_dir/$target.container"
|
||||||
|
systemctl --user daemon-reload >/dev/null 2>&1 || true
|
||||||
|
systemctl --user stop "$target.service" >/dev/null 2>&1 || true
|
||||||
|
|
||||||
|
# Allow up to two reconcile ticks (60s + grace).
|
||||||
|
run wait_service_active "$target" 90
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
|
||||||
|
run unit_file_present "$target"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user