fix: fall back to alternate SSH users per host during bootstrap steps

2026-03-01 13:34:15 +00:00
parent 8bd064c828
commit 88db11292d
1 changed files with 21 additions and 1 deletions
--- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
+++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -84,8 +84,26 @@ remote() {
  local host_ip="$1"
  local cmd="$2"
  local quoted_cmd
+  local candidate
+  local candidates=()
+
+  candidates+=("$ACTIVE_SSH_USER")
+  for candidate in $SSH_USER_CANDIDATES; do
+    if [ "$candidate" != "$ACTIVE_SSH_USER" ]; then
+      candidates+=("$candidate")
+    fi
+  done
+
  quoted_cmd="$(printf '%q' "$cmd")"
-  ssh $SSH_OPTS "$ACTIVE_SSH_USER@$host_ip" "bash -lc $quoted_cmd"
+  for candidate in "${candidates[@]}"; do
+    if ssh $SSH_OPTS "$candidate@$host_ip" "bash -lc $quoted_cmd"; then
+      ACTIVE_SSH_USER="$candidate"
+      return 0
+    fi
+  done
+
+  echo "Remote command failed for all SSH users on $host_ip"
+  return 1
 }

 detect_ssh_user() {
@@ -130,6 +148,7 @@ rebuild_node() {
  local node_ip="$2"

  echo "==> Rebuilding $node_name on $node_ip"
+  detect_ssh_user "$node_ip"
  timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
    --flake "$FLAKE_DIR#$node_name" \
    --target-host "$ACTIVE_SSH_USER@$node_ip" \
@@ -233,6 +252,7 @@ if [ "$worker_failures" -gt 0 ]; then
 fi

 echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
+detect_ssh_user "$PRIMARY_CP_IP"
 if cluster_ready; then
  echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"
 else