fix: bulletproof first-boot container creation and install reliability
Remove the Bitcoin RPC 60-second gate that blocked 13+ dependent containers (mempool, electrumx, btcpay, lnd, fedimint) from being created on first boot. Containers now always get created and auto-restart via health monitor once Bitcoin becomes responsive — the designed recovery path.

Additional hardening:
- Validate archy-net creation with retry (silent failure broke DNS)
- Verify critical images are loaded, re-load from tarballs if missing
- Create SearXNG settings.yml before container start (was missing)
- Run reconciler automatically after first-boot failures
- Add load-images as explicit systemd dependency with 900s timeout
- Propagate config write errors in install.rs (bitcoin.conf, lnd.conf)
- FileBrowser password change: retry loop (6 attempts) + 0o600 perms
- Post-start verification: detect containers that exit immediately
- Add 2s dependency waits between container starts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -412,11 +412,13 @@ load_spec_searxng() {
|
||||
SPEC_IMAGE="${SEARXNG_IMAGE}"
|
||||
SPEC_PORTS="8888:8080"
|
||||
SPEC_MEMORY="$(mem_limit searxng)"
|
||||
SPEC_VOLUMES="/var/lib/archipelago/searxng:/etc/searxng"
|
||||
SPEC_HEALTH_CMD="curl -sf http://localhost:8080/ || exit 1"
|
||||
SPEC_READONLY="true"
|
||||
SPEC_TMPFS="/tmp:rw,noexec,nosuid,size=256m /run:rw,noexec,nosuid,size=64m /etc/searxng:rw,noexec,nosuid,size=16m"
|
||||
SPEC_TMPFS="/tmp:rw,noexec,nosuid,size=256m /run:rw,noexec,nosuid,size=64m"
|
||||
SPEC_TIER="3"
|
||||
SPEC_CAPS=""
|
||||
SPEC_DATA_DIR="/var/lib/archipelago/searxng"
|
||||
}
|
||||
|
||||
load_spec_onlyoffice() {
|
||||
|
||||
@@ -233,8 +233,19 @@ chmod 700 /run/user/1000
|
||||
runuser -u archipelago -- env XDG_RUNTIME_DIR=/run/user/1000 \
|
||||
systemctl --user start podman.socket 2>/dev/null || true
|
||||
|
||||
# Ensure network exists (matches deploy)
|
||||
# Ensure archy-net exists — critical for inter-container DNS (mempool→bitcoin, etc.)
|
||||
$DOCKER network create archy-net 2>/dev/null || true
|
||||
if ! $DOCKER network exists archy-net 2>/dev/null; then
|
||||
log "WARNING: archy-net creation failed, retrying in 5s..."
|
||||
sleep 5
|
||||
$DOCKER network create archy-net 2>>"$LOG"
|
||||
if ! $DOCKER network exists archy-net 2>/dev/null; then
|
||||
log "FATAL: Cannot create archy-net — inter-container DNS will not work."
|
||||
log " All containers requiring archy-net will fail. Exiting."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
log "archy-net network ready"
|
||||
|
||||
# Rootless podman UID mapping: fix data dir ownership so container processes
|
||||
# can write. Rootless podman maps container UIDs via subuid (container UID N
|
||||
@@ -299,6 +310,43 @@ mem_limit() {
|
||||
esac
|
||||
}
|
||||
|
||||
# ── Verify critical images are loaded ──────────────────────────────────
|
||||
# archipelago-load-images.service should have loaded these from tarballs.
|
||||
# If any are missing (corrupt tarball, disk full, etc.), try re-loading.
|
||||
log "Verifying container images..."
|
||||
MISSING_IMAGES=""
|
||||
for img_var in BITCOIN_KNOTS_IMAGE MARIADB_IMAGE ELECTRUMX_IMAGE \
|
||||
MEMPOOL_BACKEND_IMAGE MEMPOOL_WEB_IMAGE BTCPAY_POSTGRES_IMAGE \
|
||||
NBXPLORER_IMAGE BTCPAY_IMAGE LND_IMAGE FEDIMINT_IMAGE \
|
||||
FEDIMINT_GATEWAY_IMAGE HOMEASSISTANT_IMAGE GRAFANA_IMAGE \
|
||||
UPTIME_KUMA_IMAGE JELLYFIN_IMAGE VAULTWARDEN_IMAGE \
|
||||
NEXTCLOUD_IMAGE SEARXNG_IMAGE FILEBROWSER_IMAGE; do
|
||||
img="${!img_var}"
|
||||
if [ -z "$img" ]; then
|
||||
continue # Variable not defined in image-versions.sh
|
||||
fi
|
||||
if ! $DOCKER images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep -qF "$img"; then
|
||||
MISSING_IMAGES="$MISSING_IMAGES $img_var"
|
||||
fi
|
||||
done
|
||||
if [ -n "$MISSING_IMAGES" ]; then
|
||||
log "WARNING: Missing images:$MISSING_IMAGES"
|
||||
log "Attempting to re-load from /opt/archipelago/container-images/..."
|
||||
RELOAD_COUNT=0
|
||||
for tarfile in /opt/archipelago/container-images/*.tar; do
|
||||
if [ -f "$tarfile" ]; then
|
||||
if $DOCKER load -i "$tarfile" 2>>"$LOG"; then
|
||||
RELOAD_COUNT=$((RELOAD_COUNT + 1))
|
||||
else
|
||||
log " Failed to load: $tarfile"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
log "Re-loaded $RELOAD_COUNT image tarballs"
|
||||
else
|
||||
log "All critical images verified"
|
||||
fi
|
||||
|
||||
# ── Tier 1: Databases & Core Infrastructure ──────────────────────────────
|
||||
log "=== Tier 1: Databases & Core Infrastructure ==="
|
||||
|
||||
@@ -337,13 +385,16 @@ else
|
||||
$DOCKER network connect archy-net bitcoin-knots 2>/dev/null || true
|
||||
log "Bitcoin Knots already running"
|
||||
fi
|
||||
# Wait for Bitcoin Knots RPC to be responsive
|
||||
# Check Bitcoin Knots RPC (informational — containers created regardless)
|
||||
# Dependent containers use --restart=unless-stopped and the health monitor
|
||||
# will auto-restart them once Bitcoin becomes responsive.
|
||||
if wait_for_container "Bitcoin Knots RPC" "$DOCKER exec bitcoin-knots bitcoin-cli -rpcuser='$BITCOIN_RPC_USER' -rpcpassword='$BITCOIN_RPC_PASS' getblockchaininfo" 60; then
|
||||
BITCOIN_READY=true
|
||||
log "Bitcoin Knots is ready — dependent containers will proceed"
|
||||
log "Bitcoin Knots is ready"
|
||||
else
|
||||
BITCOIN_READY=false
|
||||
log "WARNING: Bitcoin Knots NOT ready — skipping dependent containers (electrumx, lnd, mempool, btcpay, fedimint)"
|
||||
log "Bitcoin Knots not yet responsive (normal during IBD) — creating dependent containers anyway"
|
||||
log " They will auto-restart via health monitor once Bitcoin is ready"
|
||||
fi
|
||||
track_container "bitcoin-knots"
|
||||
|
||||
@@ -355,7 +406,8 @@ if ! $DOCKER exec bitcoin-knots bitcoin-cli "-rpcuser=$BITCOIN_RPC_USER" "-rpcpa
|
||||
fi
|
||||
|
||||
# 2. Mempool stack (matches deploy) — depends on Bitcoin
|
||||
if [ "$BITCOIN_READY" = "true" ]; then
|
||||
# Note: containers created regardless of BITCOIN_READY — they will restart
|
||||
# automatically once Bitcoin becomes responsive (--restart=unless-stopped).
|
||||
if ! $DOCKER ps -a --format '{{.Names}}' 2>/dev/null | grep -qE 'archy-mempool-db|mysql-mempool'; then
|
||||
log "Creating mysql-mempool..."
|
||||
mkdir -p /var/lib/archipelago/mysql-mempool
|
||||
@@ -624,9 +676,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q fedimint-gateway; th
|
||||
fi
|
||||
track_container "fedimint-gateway"
|
||||
|
||||
else
|
||||
log "SKIPPED: mempool stack, electrumx, btcpay stack, lnd, fedimint (Bitcoin not ready)"
|
||||
fi # end BITCOIN_READY
|
||||
# (Bitcoin-dependent containers created above regardless of BITCOIN_READY)
|
||||
|
||||
# ── Tier 3: Applications (independent — always attempt) ───────────────────
|
||||
log "=== Tier 3: Applications ==="
|
||||
@@ -742,12 +792,33 @@ fi
|
||||
track_container "nextcloud"
|
||||
if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q searxng; then
|
||||
log "Creating SearXNG..."
|
||||
# SearXNG requires settings.yml or it exits immediately
|
||||
SEARXNG_CONF="/var/lib/archipelago/searxng"
|
||||
if [ ! -f "$SEARXNG_CONF/settings.yml" ]; then
|
||||
mkdir -p "$SEARXNG_CONF"
|
||||
SEARX_SECRET=$(openssl rand -hex 32)
|
||||
cat > "$SEARXNG_CONF/settings.yml" <<SEARXCFG
|
||||
use_default_settings: true
|
||||
general:
|
||||
instance_name: Archipelago Search
|
||||
server:
|
||||
secret_key: "$SEARX_SECRET"
|
||||
bind_address: "0.0.0.0"
|
||||
port: 8080
|
||||
limiter: false
|
||||
ui:
|
||||
default_theme: simple
|
||||
SEARXCFG
|
||||
chown -R 100000:100000 "$SEARXNG_CONF" 2>/dev/null
|
||||
log " Created SearXNG settings.yml"
|
||||
fi
|
||||
$DOCKER run -d --name searxng --restart unless-stopped \
|
||||
--health-cmd="curl -sf http://localhost:8080/ || exit 1" --health-interval=120s --health-timeout=5s --health-retries=3 \
|
||||
--memory=$(mem_limit searxng) \
|
||||
--cap-drop ALL --security-opt no-new-privileges:true \
|
||||
--read-only --tmpfs /tmp:rw,noexec,nosuid,size=256m --tmpfs /run:rw,noexec,nosuid,size=64m \
|
||||
-p 8888:8080 \
|
||||
-v /var/lib/archipelago/searxng:/etc/searxng \
|
||||
"${SEARXNG_IMAGE}" 2>>"$LOG" || true
|
||||
fi
|
||||
track_container "searxng"
|
||||
@@ -979,8 +1050,29 @@ elif [ -x "/opt/archipelago/scripts/container-doctor.sh" ]; then
|
||||
bash "/opt/archipelago/scripts/container-doctor.sh" --local 2>&1 | tee -a "$LOG"
|
||||
fi
|
||||
|
||||
# 12. Final summary
|
||||
# 11b. If any containers failed, run the reconciler to attempt recovery
|
||||
FAILED=$((TOTAL - SUCCESS))
|
||||
if [ "$FAILED" -gt 0 ]; then
|
||||
log "Attempting to recover $FAILED failed container(s) via reconciler..."
|
||||
RECONCILE_SCRIPT=""
|
||||
if [ -x "$SCRIPT_DIR/reconcile-containers.sh" ]; then
|
||||
RECONCILE_SCRIPT="$SCRIPT_DIR/reconcile-containers.sh"
|
||||
elif [ -x "/opt/archipelago/scripts/reconcile-containers.sh" ]; then
|
||||
RECONCILE_SCRIPT="/opt/archipelago/scripts/reconcile-containers.sh"
|
||||
fi
|
||||
if [ -n "$RECONCILE_SCRIPT" ]; then
|
||||
runuser -u archipelago -- bash "$RECONCILE_SCRIPT" 2>&1 | tee -a "$LOG"
|
||||
# Recount after reconciliation
|
||||
SUCCESS=0
|
||||
for name in $($DOCKER ps --format '{{.Names}}' 2>/dev/null); do
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
done
|
||||
FAILED=$((TOTAL - SUCCESS))
|
||||
log "After reconciliation: $SUCCESS running, $FAILED still failed"
|
||||
fi
|
||||
fi
|
||||
|
||||
# 12. Final summary
|
||||
log "============================================="
|
||||
log " FIRST-BOOT CONTAINER SUMMARY"
|
||||
log "============================================="
|
||||
@@ -988,7 +1080,7 @@ log " Total tracked: $TOTAL"
|
||||
log " Running: $SUCCESS"
|
||||
log " Failed: $FAILED"
|
||||
if [ "$BITCOIN_READY" != "true" ]; then
|
||||
log " Bitcoin: NOT READY (dependent containers skipped)"
|
||||
log " Bitcoin: NOT READY (dependent containers will auto-restart when ready)"
|
||||
fi
|
||||
if [ -n "$FAILED_LIST" ]; then
|
||||
log " Failed list: $FAILED_LIST"
|
||||
|
||||
Reference in New Issue
Block a user