- Added new dependencies: `adler2`, `crc32fast`, `flate2`, `miniz_oxide`, and `libredox`. - Updated existing dependencies: `tokio-rustls` to version 0.26.4 and `filetime` to version 0.2.27. - Removed the `backup.rs` file as it is no longer needed. - Introduced tests for configuration and credential management. - Enhanced the `identity` module to generate W3C compliant DID documents. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
340 lines
9.7 KiB
Bash
Executable File
340 lines
9.7 KiB
Bash
Executable File
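#!/usr/bin/env bash
# chaos-test.sh — Chaos/resilience test for the Archipelago server.
#
# Tests the server's ability to survive adverse conditions:
#   - Process kills (verify systemd restart)
#   - Graceful and rapid service restarts
#   - Container stop/start cycling
#   - Concurrent RPC requests (verify no crashes)
#   - RPC error handling and a post-chaos data integrity check
#   - High disk usage warnings (listed as a goal; no test in this script
#     exercises it yet)
#   - Network interruption recovery (listed as a goal; no test in this script
#     exercises it yet)
#
# Usage:
#   ssh archipelago@192.168.1.228 "cd ~/archy && bash scripts/chaos-test.sh"
#
# Duration: ~30 minutes by default (set CHAOS_DURATION_HOURS for longer).
# NOTE: CHAOS_DURATION_HOURS is currently only printed in the banner; every
# test runs exactly once regardless of its value.
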
# Abort on unset variables and make a pipeline fail when any stage fails.
# `-e` is not enabled, so a failing command does not end the run: each test
# records its own result and the script carries on to the next one.
set -uo pipefail

# Tunables. Each one can be overridden from the environment; the defaults are
# the values this script has always used.
CHAOS_DURATION_HOURS="${CHAOS_DURATION_HOURS:-0.5}"
RPC_URL="${RPC_URL:-http://localhost:5678/rpc/v1}"
HEALTH_URL="${HEALTH_URL:-http://localhost/health}"
# Default budget (seconds of polling) used by wait_for_health.
MAX_RECOVERY_WAIT="${MAX_RECOVERY_WAIT:-60}"

# Colors (backslash escapes are expanded by the logging helpers at print time)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Result bookkeeping: counters plus one "<STATUS> <name>" entry per test.
PASS=0
FAIL=0
TESTS=()

# Logging helpers. Each prints one line to stdout: a colored tag followed by
# all arguments joined with spaces.
# The color variables hold backslash escapes, so they are expanded with %b;
# the message itself is printed with %s so that a backslash inside it (for
# example in a path or a server response) is shown literally instead of being
# interpreted as an escape sequence.
log() { printf '%b[CHAOS]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
fail() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }

# Record the outcome of one test.
# Globals:   PASS, FAIL (incremented), TESTS (appended to)
# Arguments: $1 - human-readable test name
#            $2 - "PASS", "SKIP", or anything else (counted as a failure)
# A SKIP is listed in TESTS but touches neither counter, so skipped tests can
# go through the same helper as passed and failed ones.
record() {
  local name="$1" result="$2"
  case "$result" in
    PASS)
      PASS=$((PASS + 1))
      TESTS+=("PASS $name")
      ;;
    SKIP)
      TESTS+=("SKIP $name")
      ;;
    *)
      # Any unrecognized result is treated as a failure rather than dropped.
      FAIL=$((FAIL + 1))
      TESTS+=("FAIL $name")
      ;;
  esac
}

# Authenticate
# Cookie jar written by authenticate() and read by rpc().
COOKIE_FILE=$(mktemp)
# Remove the jar on every exit path, including early aborts.
trap 'rm -f -- "$COOKIE_FILE"' EXIT

# Print the JSON body for the auth.login call.
# The password comes from CHAOS_PASSWORD and falls back to the value this
# script has always used. Backslashes and double quotes are escaped so the
# body stays valid JSON.
auth_payload() {
  local pw="${CHAOS_PASSWORD:-password123}"
  pw=${pw//\\/\\\\}
  pw=${pw//\"/\\\"}
  printf '{"method":"auth.login","params":{"password":"%s"}}' "$pw"
}

# Log in against RPC_URL and store the returned cookies in COOKIE_FILE.
# All curl output is discarded; the function's status is curl's exit status.
authenticate() {
  curl -s -c "$COOKIE_FILE" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -d "$(auth_payload)" > /dev/null 2>&1
}

# Call one RPC method with the session cookies and CSRF token.
# Globals:   COOKIE_FILE (read), RPC_URL (read)
# Arguments: $1 - method name
#            $2 - params as a raw JSON value (default: null)
# Outputs:   the response body on stdout; curl's stderr is discarded
rpc() {
  local method="$1"
  local params="${2:-null}"
  local csrf
  # Take the last field of the LAST csrf_token line in the jar. Using a single
  # value keeps the header on one line even if the jar holds several matching
  # entries. A missing or unreadable jar yields an empty token.
  csrf=$(awk '/csrf_token/ { token = $NF } END { print token }' \
    "$COOKIE_FILE" 2>/dev/null) || csrf=""
  curl -s -b "$COOKIE_FILE" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -H "X-CSRF-Token: $csrf" \
    -d "{\"method\":\"$method\",\"params\":$params}" 2>/dev/null
}

# Poll HEALTH_URL until it answers successfully or the budget runs out.
# Globals:   HEALTH_URL (read), MAX_RECOVERY_WAIT (read, default budget)
# Arguments: $1 - budget in seconds (optional)
# Returns:   0 as soon as a probe succeeds, 1 once the budget is used up
# The budget counts only the 2-second sleeps between probes, so each probe is
# capped with --max-time; without the cap a single hung connection could
# block far beyond the requested budget.
wait_for_health() {
  local timeout="${1:-$MAX_RECOVERY_WAIT}"
  local elapsed=0
  while [ "$elapsed" -lt "$timeout" ]; do
    if curl -sf --max-time 5 "$HEALTH_URL" > /dev/null 2>&1; then
      return 0
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  return 1
}

# Banner
echo ""
echo "============================================"
echo " Archipelago Chaos Test Suite"
echo "============================================"
echo " Duration: ${CHAOS_DURATION_HOURS}h"
echo ""

# Pre-check: refuse to start unless the health endpoint answers right now.
if ! curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  fail "Server not healthy at $HEALTH_URL — aborting"
  # The cookie jar already exists at this point and the cleanup at the end of
  # the script is skipped by this early exit, so remove it here.
  rm -f -- "${COOKIE_FILE:-}"
  exit 1
fi
log "Server is healthy"
authenticate

# =============================================================================
# Test 1: Process Kill Recovery
# =============================================================================
# SIGKILL the service and expect the health endpoint to come back without any
# manual intervention.
log "=== Test 1: Process Kill Recovery ==="
log "Killing archipelago process..."

if ! sudo systemctl kill --signal=SIGKILL archipelago 2>/dev/null; then
  # Fallback when `systemctl kill` fails: kill the first matching PID directly.
  # The PID is checked first so `kill` is never run without a target.
  victim_pid=$(pgrep -f "/usr/local/bin/archipelago" | head -1)
  if [ -n "$victim_pid" ]; then
    sudo kill -9 "$victim_pid" 2>/dev/null
  else
    warn "No archipelago process found to kill"
  fi
fi

sleep 2

if wait_for_health 30; then
  log "Backend recovered after SIGKILL in <30s"
  record "Process kill recovery" "PASS"
else
  fail "Backend did not recover after SIGKILL within 30s"
  record "Process kill recovery" "FAIL"
  # Try to restart manually, then wait for health (instead of a fixed sleep)
  # so the following tests do not start against a server that is still down.
  sudo systemctl start archipelago
  if ! wait_for_health; then
    warn "Backend still unhealthy after manual start"
  fi
fi

# Log in again after the kill/restart.
authenticate

# =============================================================================
# Test 2: Graceful Restart
# =============================================================================
# Restart the unit through systemd and expect the health endpoint to answer
# again within the wait_for_health budget of 20.
log "=== Test 2: Graceful Restart ==="
log "Restarting archipelago service..."

sudo systemctl restart archipelago
sleep 2

if ! wait_for_health 20; then
  fail "Backend did not come up after restart"
  record "Graceful restart" "FAIL"
else
  log "Backend restarted gracefully"
  record "Graceful restart" "PASS"
fi

# Log in again after the restart.
authenticate

# =============================================================================
# Test 3: Concurrent RPC Requests
# =============================================================================
# Fire 100 system.stats requests in parallel. The test passes when the health
# endpoint still answers afterwards; the per-request outcomes are tallied and
# logged for diagnosis.
log "=== Test 3: Concurrent RPC Load (100 requests) ==="

CONCURRENT_PASS=0
CONCURRENT_FAIL=0

# Private scratch directory for the result markers (avoids fixed names in
# /tmp that could collide with another run or a stale file).
if results_dir=$(mktemp -d); then
  ok_file="$results_dir/ok"
  fail_file="$results_dir/fail"
else
  warn "mktemp failed; per-request results will not be counted"
  results_dir=""
  ok_file=/dev/null
  fail_file=/dev/null
fi

for ((i = 1; i <= 100; i++)); do
  (
    result=$(curl -sf -X POST "$RPC_URL" \
      -H "Content-Type: application/json" \
      -d '{"method":"system.stats"}' 2>/dev/null)
    if echo "$result" | grep -q "cpu_usage_percent"; then
      echo "OK" >> "$ok_file"
    else
      echo "FAIL" >> "$fail_file"
    fi
  ) &
done

# Barrier: wait for every background request to finish.
wait

# Tally the markers before removing them. A marker file only exists when at
# least one request wrote to it, so a missing file means a count of zero.
if [ -f "$ok_file" ]; then
  CONCURRENT_PASS=$(grep -c "OK" "$ok_file")
fi
if [ -f "$fail_file" ]; then
  CONCURRENT_FAIL=$(grep -c "FAIL" "$fail_file")
fi
if [ -n "$results_dir" ]; then
  rm -rf -- "$results_dir"
fi
log "Concurrent requests: $CONCURRENT_PASS OK, $CONCURRENT_FAIL failed"
if [ "$CONCURRENT_FAIL" -gt 0 ]; then
  warn "$CONCURRENT_FAIL of 100 concurrent requests did not return stats"
fi

# Re-authenticate in case cookies expired during load
authenticate

# Check server still healthy
if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  log "Server survived 100 concurrent requests"
  record "Concurrent RPC load" "PASS"
else
  fail "Server crashed under concurrent load"
  record "Concurrent RPC load" "FAIL"
  sudo systemctl restart archipelago
  sleep 5
  authenticate
fi

# =============================================================================
# Test 4: Container Stop/Start Cycling
# =============================================================================
log "=== Test 4: Container Stop/Start Cycling ==="

# Return 0 when podman lists a container named exactly $1 whose status
# contains "up" (case-insensitive), non-zero otherwise.
# The status is captured first and matched afterwards, so the result never
# depends on grep's exit code being mixed with a fallback value.
container_is_up() {
  local podman_status
  podman_status=$(podman ps --filter "name=^${1}$" --format "{{.Status}}" \
    2>/dev/null | head -1)
  grep -qi "up" <<< "$podman_status"
}

# Use filebrowser as test container (lightweight, quick to restart).
# Set CHAOS_CONTAINER_ID to pick another container, or to the empty string to
# skip this test.
CONTAINER_ID="${CHAOS_CONTAINER_ID-filebrowser}"
if [ -n "$CONTAINER_ID" ]; then
  log "Testing with container: $CONTAINER_ID"

  # Stop
  rpc "package.stop" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  sleep 3

  # Verify stopped (diagnostic only; it does not change the test result)
  if container_is_up "$CONTAINER_ID"; then
    warn "Container $CONTAINER_ID still reported as up after stop"
  fi

  # Start
  rpc "package.start" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  sleep 10

  # Verify running (check both container-status and podman directly)
  status=$(rpc "container-status" "{\"id\":\"$CONTAINER_ID\"}")
  if echo "$status" | grep -qi "running" || container_is_up "$CONTAINER_ID"; then
    log "Container $CONTAINER_ID stop/start cycle OK"
    record "Container cycling" "PASS"
  else
    warn "Container $CONTAINER_ID may not have restarted"
    record "Container cycling" "FAIL"
  fi
else
  warn "No running containers found, skipping container test"
  TESTS+=("SKIP Container cycling (no containers)")
fi

# =============================================================================
# Test 5: RPC Error Handling
# =============================================================================
log "=== Test 5: RPC Error Handling ==="

# POST a malformed JSON body to RPC_URL and print the HTTP status code.
# Prints exactly "000" when no response was received: curl's -w already emits
# 000 in that case, and the fallback only applies when curl printed nothing.
# (Appending a second "000" on failure would produce "000000" and make an
# unreachable server look like a valid response.)
malformed_json_status() {
  local code
  code=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" -d '{broken}' 2>/dev/null)
  printf '%s\n' "${code:-000}"
}

# Invalid method
result=$(rpc "nonexistent.method")
if echo "$result" | grep -qi "error\|unknown"; then
  log "Invalid method correctly returns error"
  err_pass=true
else
  fail "Invalid method did not return error"
  err_pass=false
fi

# Malformed JSON — server should not crash (any response is acceptable)
http_code=$(malformed_json_status)
if [ "$http_code" != "000" ]; then
  log "Malformed JSON handled without crash (HTTP $http_code)"
else
  # Server may have been restarting from previous test, wait and retry
  sleep 3
  http_code=$(malformed_json_status)
  if [ "$http_code" != "000" ]; then
    log "Malformed JSON handled without crash (HTTP $http_code, retry)"
  else
    warn "Server unreachable for malformed JSON test"
    err_pass=false
  fi
fi

# Missing params
result=$(rpc "backup.create")
if echo "$result" | grep -qi "error\|missing"; then
  log "Missing params correctly returns error"
else
  fail "Missing params did not return error"
  err_pass=false
fi

if [ "$err_pass" = true ]; then
  record "RPC error handling" "PASS"
else
  record "RPC error handling" "FAIL"
fi

# =============================================================================
# Test 6: Rapid Restart Cycling
# =============================================================================
# Restart the service three times in a row. Exactly one result is recorded:
# PASS only when every cycle recovered and the final health probe succeeds,
# FAIL in every other case.
log "=== Test 6: Rapid Restart Cycling ==="

rapid_ok=true
for i in 1 2 3; do
  sudo systemctl restart archipelago
  sleep 3
  if ! wait_for_health 15; then
    fail "Failed to recover on cycle $i"
    rapid_ok=false
    break
  fi
done

if [ "$rapid_ok" = true ] && curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  log "Server survived 3 rapid restarts"
  record "Rapid restart cycling" "PASS"
else
  # A failed cycle was already reported inside the loop; only the case where
  # all cycles recovered but the final probe failed still needs a message.
  if [ "$rapid_ok" = true ]; then
    fail "Server unhealthy after 3 rapid restarts"
  fi
  record "Rapid restart cycling" "FAIL"
fi

# Log in again after the restarts.
authenticate

# =============================================================================
# Test 7: Data Integrity After Chaos
# =============================================================================
# Three read-only RPC calls must still return their expected keys. Every
# failing check logs its own message so a FAIL result can be traced to the
# call that caused it.
log "=== Test 7: Data Integrity Check ==="

# Check system stats still work
stats=$(rpc "system.stats")
if echo "$stats" | grep -q "cpu_usage_percent"; then
  log "System stats OK"
  data_ok=true
else
  fail "System stats broken"
  data_ok=false
fi

# Check update status
update=$(rpc "update.status")
if echo "$update" | grep -q "current_version"; then
  log "Update status OK"
else
  fail "Update status broken"
  data_ok=false
fi

# Check backup list
backups=$(rpc "backup.list")
if echo "$backups" | grep -q "backups"; then
  log "Backup list OK"
else
  fail "Backup list broken"
  data_ok=false
fi

if [ "$data_ok" = true ]; then
  record "Data integrity" "PASS"
else
  record "Data integrity" "FAIL"
fi

# =============================================================================
# Summary
# =============================================================================
# Drop the cookie jar, print one colored line per recorded test, then the
# totals. The script exits 1 when at least one test failed.
rm -f "$COOKIE_FILE"

echo ""
echo "============================================"
echo " Chaos Test Results"
echo "============================================"
for entry in "${TESTS[@]}"; do
  # Pick the color from the status prefix; entries with any other prefix are
  # not printed.
  case "$entry" in
    PASS*) tint="$GREEN" ;;
    FAIL*) tint="$RED" ;;
    SKIP*) tint="$YELLOW" ;;
    *) continue ;;
  esac
  echo -e " ${tint}${entry}${NC}"
done
echo ""
echo " Passed: $PASS Failed: $FAIL"
echo "============================================"

if [ "$FAIL" -gt 0 ]; then
  exit 1
fi