archy/scripts/node-profile.sh

#!/bin/bash
# node-profile.sh — CPU/memory/container profiling across all Archipelago nodes
#
# Usage:
#   ./scripts/node-profile.sh              # All reachable nodes
#   ./scripts/node-profile.sh 192.168.1.228  # Single node
#   ./scripts/node-profile.sh --watch      # Repeat every 30s
#   ./scripts/node-profile.sh --watch --interval 10  # Repeat every 10s
#
# Requires: SSH key at ~/.ssh/archipelago-deploy (or ARCHIPELAGO_SSH_KEY)

set -euo pipefail

# Directory containing this script; sibling config and library files are
# loaded relative to it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# The shared library and the default settings are mandatory (a failed source
# aborts under `set -e`); deploy-config.sh is an optional override that is
# loaded only when present.
# NOTE(review): ssh_cmd and the DEFAULT_*/TAILSCALE_* variables used by this
# script are not defined in this file — presumably they come from these
# sourced files; confirm.
source "${SCRIPT_DIR}/lib/common.sh"
source "${SCRIPT_DIR}/deploy-config-defaults.sh"
if [ -f "${SCRIPT_DIR}/deploy-config.sh" ]; then
  source "${SCRIPT_DIR}/deploy-config.sh"
fi

# Run-time options; the argument parser further down may override them.
WATCH_MODE=false
WATCH_INTERVAL=30
TARGET_NODES=()

# Addresses of every known node. NODE_LABELS holds the display label for the
# address stored at the same index.
ALL_NODES=("$DEFAULT_PRIMARY" "$DEFAULT_SECONDARY")
ALL_NODES+=("$TAILSCALE_ARCH1" "$TAILSCALE_ARCH2" "$TAILSCALE_ARCH3")

NODE_LABELS=("primary (.228)" "secondary (.198)")
NODE_LABELS+=("tailscale-1" "tailscale-2" "tailscale-3")

# ── Parse args ─────────────────────────────────────────────────────────

# Print a one-line usage summary to stdout.
_node_profile_usage() {
  printf 'Usage: %s [--watch] [--interval SECONDS] [NODE...]\n' "${0##*/}"
}

# Parse command-line arguments into the globals WATCH_MODE, WATCH_INTERVAL
# and TARGET_NODES.
#   --watch           keep re-running the profile until interrupted
#   --interval VALUE  delay between refreshes, handed to sleep(1); must be a
#                     number with an optional s/m/h/d suffix
#   -h, --help        print usage and exit 0
#   anything else     taken as a node address to profile
# A missing or malformed --interval value and unknown options are reported on
# stderr and end the script with status 2, instead of surfacing later as an
# "unbound variable" error or as a node that can never be reached.
parse_profile_args() {
  local interval_re='^([0-9]+|[0-9]*[.][0-9]+)[smhd]?$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --watch)
        WATCH_MODE=true
        shift
        ;;
      --interval)
        if [[ $# -lt 2 ]]; then
          printf '%s: --interval requires a value\n' "${0##*/}" >&2
          exit 2
        fi
        if ! [[ "$2" =~ $interval_re ]]; then
          printf '%s: invalid --interval value: %s\n' "${0##*/}" "$2" >&2
          exit 2
        fi
        WATCH_INTERVAL="$2"
        shift 2
        ;;
      -h | --help)
        _node_profile_usage
        exit 0
        ;;
      -*)
        # A host name cannot start with "-", so this is a mistyped option.
        printf '%s: unknown option: %s\n' "${0##*/}" "$1" >&2
        _node_profile_usage >&2
        exit 2
        ;;
      *)
        TARGET_NODES+=("$1")
        shift
        ;;
    esac
  done
}

parse_profile_args "$@"
# Every argument has been consumed by the parser; clear the script's own
# positional parameters so later code sees an empty "$@".
shift "$#"

# If specific nodes given, use those; otherwise use all
if [ ${#TARGET_NODES[@]} -eq 0 ]; then
  TARGET_NODES=("${ALL_NODES[@]}")
fi

# ── Remote profiling command ───────────────────────────────────────────

# Shell snippet that profile_all hands to ssh_cmd for execution on each node.
# It prints a line-oriented report that print_node_report parses:
#   HEADER|<hostname>|<uptime>|<n> cores|load <first 3 fields of /proc/loadavg>|<temp>
#   MEM|<used> / <total> (<free> free)       from the "Mem:" row of `free -h`
#   SWAP|<used> / <total>                    or "none" when total is 0B/0
#   DISK|<used> / <total> (<use%>)           for the root filesystem (`df -h /`)
#   PROCS_START ... PROCS_END                up to 10 rows: PID %CPU %MEM command,
#                                            sorted by CPU, highest first
#   CONTAINERS_START ... CONTAINERS_END      name|status[|size] per container
#                                            (podman preferred, docker fallback),
#                                            or a one-line message when there is
#                                            no runtime or the runtime call fails
# <temp> is thermal_zone0's reading divided by 1000, or "n/a" when that file
# does not exist.
# The snippet is single-quoted so nothing is expanded locally; the awk programs
# inside it are double-quoted, which is why their field references are written
# as \$N.
PROFILE_CMD='
hostname_val=$(hostname 2>/dev/null || echo "unknown")
uptime_val=$(uptime -p 2>/dev/null || uptime | sed "s/.*up/up/;s/,.*//")

# CPU info
cpu_cores=$(nproc 2>/dev/null || echo "?")
load_avg=$(cat /proc/loadavg 2>/dev/null | awk "{print \$1, \$2, \$3}")

# Memory
mem_info=$(free -h 2>/dev/null | awk "/^Mem:/{printf \"%s / %s (%s free)\", \$3, \$2, \$4}")
swap_info=$(free -h 2>/dev/null | awk "/^Swap:/{if(\$2 != \"0B\" && \$2 != \"0\") printf \"%s / %s\", \$3, \$2; else print \"none\"}")

# Disk
disk_info=$(df -h / 2>/dev/null | awk "NR==2{printf \"%s / %s (%s)\", \$3, \$2, \$5}")

# CPU temperature (if available)
temp="n/a"
if [ -f /sys/class/thermal/thermal_zone0/temp ]; then
  raw=$(cat /sys/class/thermal/thermal_zone0/temp)
  temp="$((raw / 1000))°C"
fi

echo "HEADER|${hostname_val}|${uptime_val}|${cpu_cores} cores|load ${load_avg}|${temp}"
echo "MEM|${mem_info}"
echo "SWAP|${swap_info}"
echo "DISK|${disk_info}"

# Top 10 processes by CPU
echo "PROCS_START"
ps aux --sort=-%cpu 2>/dev/null | head -11 | awk "NR>1{printf \"%-6s %-5s %-5s %s\n\", \$2, \$3, \$4, \$11}" 2>/dev/null
echo "PROCS_END"

# Container status
echo "CONTAINERS_START"
if command -v podman >/dev/null 2>&1; then
  podman ps -a --format "{{.Names}}|{{.Status}}|{{.Size}}" 2>/dev/null || \
  podman ps -a --format "{{.Names}}|{{.Status}}" 2>/dev/null || \
  echo "podman error"
elif command -v docker >/dev/null 2>&1; then
  docker ps -a --format "{{.Names}}|{{.Status}}" 2>/dev/null || echo "docker error"
else
  echo "no container runtime"
fi
echo "CONTAINERS_END"
'

# ── Formatting ─────────────────────────────────────────────────────────

# ANSI styling sequences, stored as literal backslash text; the printing
# command is what turns them into terminal control codes.
RESET='\033[0m'
BOLD='\033[1m'
DIM='\033[2m'

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'

# Horizontal rule printed above and below each node heading.
SEP='━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'

# Print the value of the first "TAG|value" line in a probe payload.
#   $1 - tag without the trailing pipe (for example MEM)
#   $2 - full probe output
# Prints nothing and still returns 0 when the tag is absent, so a partial
# payload cannot trip `set -e` in the caller.
_node_report_field() {
  awk -F'|' -v tag="$1" '$1 == tag { print $2; exit }' <<< "$2"
}

# Print the lines found between "<NAME>_START" and "<NAME>_END" markers.
#   $1 - section name (PROCS or CONTAINERS)
#   $2 - full probe output
# An empty or missing section yields no output and exit status 0.
_node_report_section() {
  awk -v first="$1_START" -v last="$1_END" '
    $0 == first { inside = 1; next }
    $0 == last  { inside = 0; next }
    inside
  ' <<< "$2"
}

# Render the report for one node on stdout.
#   $1 - node address, shown next to the label
#   $2 - display label
#   $3 - raw probe output (HEADER|…, MEM|…, SWAP|…, DISK|… lines plus the
#        PROCS_START/END and CONTAINERS_START/END sections)
# Globals read: BOLD DIM GREEN YELLOW RED CYAN RESET SEP.
# Colour codes go through printf's %b while node data goes through %s, so
# backslashes in remote data are printed verbatim. Missing pieces of the
# payload are skipped or shown as placeholders; they never abort the script.
print_node_report() {
  local ip="$1"
  local label="$2"
  local output="$3"

  printf '\n%b%s%b\n' "${BOLD}${CYAN}" "$SEP" "$RESET"
  printf '%b  %s  %b(%s)%b\n' "${BOLD}${CYAN}" "$label" "$DIM" "$ip" "$RESET"
  printf '%b%s%b\n' "${BOLD}${CYAN}" "$SEP" "$RESET"

  # Host / CPU summary from the first HEADER line, when there is one.
  local header host up cores load temp
  header=$(awk 'index($0, "HEADER|") == 1 { print; exit }' <<< "$output")
  if [ -n "$header" ]; then
    IFS='|' read -r _ host up cores load temp <<< "$header"
    printf '  %bHost:%b  %s  %b%s%b\n' \
      "$BOLD" "$RESET" "$host" "$DIM" "$up" "$RESET"
    printf '  %bCPU:%b   %s  %s  %s\n' \
      "$BOLD" "$RESET" "$cores" "$load" "$temp"
  fi

  # Memory, swap and disk lines are printed only when they carry a value.
  local mem swap disk
  mem=$(_node_report_field MEM "$output")
  if [ -n "$mem" ]; then
    printf '  %bMem:%b   %s\n' "$BOLD" "$RESET" "$mem"
  fi

  swap=$(_node_report_field SWAP "$output")
  if [ -n "$swap" ]; then
    printf '  %bSwap:%b  %s\n' "$BOLD" "$RESET" "$swap"
  fi

  disk=$(_node_report_field DISK "$output")
  if [ -n "$disk" ]; then
    printf '  %bDisk:%b  %s\n' "$BOLD" "$RESET" "$disk"
  fi

  # Top processes
  printf '\n'
  printf '  %bTop processes by CPU:%b\n' "$BOLD" "$RESET"
  printf '  %bPID    CPU%%  MEM%%  Command%b\n' "$DIM" "$RESET"
  local procs line cpu whole frac tenths
  procs=$(_node_report_section PROCS "$output")
  if [ -n "$procs" ]; then
    while IFS= read -r line; do
      # Convert the %CPU column to tenths of a percent. ps may print the
      # value with one decimal ("12.5") or, for large values, without any
      # ("150"); both forms must land on the same scale.
      read -r _ cpu _ <<< "$line"
      cpu=${cpu:-0}
      whole=${cpu%%[.,]*}
      frac=0
      case "$cpu" in
        *[.,]*)
          frac=${cpu#*[.,]}
          frac=${frac:0:1}
          ;;
      esac
      case "$whole" in '' | *[!0-9]*) whole=0 ;; esac
      case "$frac" in [0-9]) ;; *) frac=0 ;; esac
      # 10# keeps zero-padded values from being read as octal.
      tenths=$((10#$whole * 10 + frac))

      if ((tenths > 500)); then # above 50 % CPU
        printf '  %b%s%b\n' "$RED" "$line" "$RESET"
      elif ((tenths > 100)); then # above 10 % CPU
        printf '  %b%s%b\n' "$YELLOW" "$line" "$RESET"
      else
        printf '  %s\n' "$line"
      fi
    done <<< "$procs"
  else
    printf '  %b(no process data)%b\n' "$DIM" "$RESET"
  fi

  # Containers
  printf '\n'
  printf '  %bContainers:%b\n' "$BOLD" "$RESET"
  local containers name status icon_color icon_glyph
  containers=$(_node_report_section CONTAINERS "$output")
  case "$containers" in
    '' | 'no container runtime' | 'podman error' | 'docker error')
      # Nothing to list: show the remote message, or "none" when empty.
      printf '  %b%s%b\n' "$DIM" "${containers:-none}" "$RESET"
      ;;
    *)
      # Each row is name|status[|size]; the size column is not displayed.
      while IFS='|' read -r name status _; do
        case "$status" in
          *[Uu][Pp]*)
            icon_color=$GREEN
            icon_glyph='●'
            ;;
          *)
            icon_color=$RED
            icon_glyph='○'
            ;;
        esac
        printf '  %b%s%b %b%s%b  %b%s%b\n' \
          "$icon_color" "$icon_glyph" "$RESET" \
          "$BOLD" "$name" "$RESET" \
          "$DIM" "$status" "$RESET"
      done <<< "$containers"
      ;;
  esac
}

# ── Main profiling loop ───────────────────────────────────────────────

# Print the display label for a target node.
#   $1 - node address
#   $2 - index of the node within TARGET_NODES
# When the address sits at the same index in ALL_NODES (the default "all
# nodes" run) that slot's label is used. Otherwise ALL_NODES is searched for
# the address, so a node named on the command line still gets its own label.
# Addresses that are not in ALL_NODES are labelled with the address itself.
_node_label_for() {
  local ip="$1" idx="$2" j

  if [ "${ALL_NODES[$idx]:-}" = "$ip" ]; then
    printf '%s\n' "${NODE_LABELS[$idx]:-$ip}"
    return 0
  fi

  for j in "${!ALL_NODES[@]}"; do
    if [ "${ALL_NODES[$j]}" = "$ip" ]; then
      printf '%s\n' "${NODE_LABELS[$j]:-$ip}"
      return 0
    fi
  done

  printf '%s\n' "$ip"
}

# Probe every node in TARGET_NODES in parallel and print one report per node,
# followed by a reachable/unreachable tally.
# Globals read: TARGET_NODES ALL_NODES NODE_LABELS PROFILE_CMD and the
# colour/SEP constants. Calls ssh_cmd (not defined in this file) and
# print_node_report.
profile_all() {
  printf '\n%bArchipelago Node Profile%b  %b%s%b\n' \
    "$BOLD" "$RESET" "$DIM" "$(date '+%Y-%m-%d %H:%M:%S')" "$RESET"

  local tmpdir
  tmpdir=$(mktemp -d)

  # Probe all nodes in parallel; each probe writes its result (or the
  # UNREACHABLE sentinel) to its own file in tmpdir.
  local i ip label pid result
  local pids=()
  for i in "${!TARGET_NODES[@]}"; do
    ip="${TARGET_NODES[$i]}"
    (
      if result=$(ssh_cmd "$ip" "$PROFILE_CMD" 2>/dev/null); then
        printf '%s\n' "$result" > "$tmpdir/$i.out"
      else
        printf 'UNREACHABLE\n' > "$tmpdir/$i.out"
      fi
    ) &
    pids+=("$!")
  done

  # Wait for all probes; a failed probe is reported below, not here.
  for pid in "${pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done

  # Print reports in the order the nodes were requested.
  local reachable=0 unreachable=0 outfile report
  for i in "${!TARGET_NODES[@]}"; do
    ip="${TARGET_NODES[$i]}"
    label=$(_node_label_for "$ip" "$i")
    outfile="$tmpdir/$i.out"

    # A missing result file is treated the same as an unreachable node.
    report="UNREACHABLE"
    if [ -f "$outfile" ]; then
      report=$(cat "$outfile")
    fi

    if [ "$report" != "UNREACHABLE" ]; then
      print_node_report "$ip" "$label" "$report"
      reachable=$((reachable + 1))
    else
      printf '\n%b%s%b\n' "$DIM" "$SEP" "$RESET"
      printf '%b  %s (%s) — unreachable%b\n' "$RED" "$label" "$ip" "$RESET"
      printf '%b%s%b\n' "$DIM" "$SEP" "$RESET"
      unreachable=$((unreachable + 1))
    fi
  done

  printf '\n%b%s reachable, %s unreachable%b\n\n' \
    "$DIM" "$reachable" "$unreachable" "$RESET"
  rm -rf -- "$tmpdir"
}

# Entry point: profile once, or keep refreshing when --watch was given.
# WATCH_MODE is compared as a string rather than executed as a command, so an
# unexpected value can never run an arbitrary program here.
if [ "$WATCH_MODE" = true ]; then
  while true; do
    # `clear` fails when there is no usable terminal (for example TERM is
    # unset); under `set -e` that would end the watch loop, so ignore it.
    clear 2>/dev/null || true
    profile_all
    echo -e "${DIM}Refreshing every ${WATCH_INTERVAL}s — Ctrl+C to stop${RESET}"
    sleep "$WATCH_INTERVAL"
  done
else
  profile_all
fi