From c7a38df358d3252aa181c9dc8f7007a166690db7 Mon Sep 17 00:00:00 2001
From: rgcosta
Date: Tue, 21 Apr 2026 15:34:03 +0000
Subject: [PATCH] Update pve/k8s-talos-safe-shutdown.sh

---
 pve/k8s-talos-safe-shutdown.sh | 128 +++++++++++++++++++++++++--------
 1 file changed, 100 insertions(+), 28 deletions(-)

diff --git a/pve/k8s-talos-safe-shutdown.sh b/pve/k8s-talos-safe-shutdown.sh
index 730e239..d67fca5 100644
--- a/pve/k8s-talos-safe-shutdown.sh
+++ b/pve/k8s-talos-safe-shutdown.sh
@@ -1,16 +1,17 @@
 #!/usr/bin/env bash
 # =============================================================================
-# k8s-talos-safe-shutdown.sh
+# k8s-safe-shutdown.sh
 # Safe shutdown/reboot for Talos K8s cluster before powering off Proxmox host
 #
 # Usage (from Proxmox host):
-#   chmod +x k8s-talos-safe-shutdown.sh
-#   ./k8s-talos-safe-shutdown.sh # graceful shutdown
-#   ./k8s-talos-safe-shutdown.sh --reboot # graceful reboot
+#   chmod +x k8s-safe-shutdown.sh
+#   ./k8s-safe-shutdown.sh # graceful shutdown
+#   ./k8s-safe-shutdown.sh --reboot # graceful reboot
 #
 # Requirements on Proxmox host:
 #   - SSH key access to your cluster management node
-#   - talos + kubectl installed on that node
+#   - talosctl + kubectl installed on that node
+#     (talos is an alias to a function talosctl --talosconfig ~/.config/talosconfig -n "${TALOS_DEFAULT_NODE:-10.0.30.21}" "${args[@]}" )
 #   - TALOSCONFIG / KUBECONFIG set on that node
 # =============================================================================
 
@@ -35,9 +36,10 @@ for arg in "$@"; do
 done
 
 # ─── Configuration ────────────────────────────────────────────────────────────
-MGMT_HOST="10.0.30.40" # SSH target: your cluster mgmt node
-MGMT_USER="sysadmin" # SSH user
+MGMT_HOST="10.0.30.40"          # SSH target: your cluster mgmt node / jumpbox
+MGMT_USER="sysadmin"            # SSH user
 SSH_KEY="$HOME/.ssh/id_ed25519" # SSH key on this Proxmox host
+SSH_PORT="3333"
 
 # Talos node IPs (workers first, control plane last)
 WORKER_NODES=(
@@ -64,7 +66,7 @@ warn() { echo -e "${YELLOW}[$(date +%H:%M:%S)] ⚠${RESET} $*"; }
 fail() { echo -e "${RED}[$(date +%H:%M:%S)] ✗${RESET} $*"; exit 1; }
 
 # Helper: run a command on the management node over SSH
-mgmt() { ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "${MGMT_USER}@${MGMT_HOST}" "$@"; }
+mgmt() { ssh -i "$SSH_KEY" -p "$SSH_PORT" -o StrictHostKeyChecking=no "${MGMT_USER}@${MGMT_HOST}" "$@"; }
 
 # ─── Preflight ────────────────────────────────────────────────────────────────
 echo ""
@@ -103,20 +105,20 @@ if [[ "$TEST_ONLY" == true ]]; then
     fi
   done
 
-  log "③ talos — worker nodes..."
+  log "③ talosctl — worker nodes..."
   for node in "${WORKER_NODES[@]}"; do
-    if mgmt talosctl version --nodes "$node" --short &>/dev/null; then
-      ok " talos → $node reachable"
+    if mgmt talos version --nodes "$node" --short &>/dev/null; then
+      ok " talosctl → $node reachable"
     else
-      warn " talos → $node NOT reachable"
+      warn " talosctl → $node NOT reachable"
     fi
   done
 
-  log "④ talos — control plane ($CONTROL_PLANE)..."
-  if mgmt talosctl version --nodes "$CONTROL_PLANE" --short &>/dev/null; then
-    ok " talos → $CONTROL_PLANE reachable"
+  log "④ talosctl — control plane ($CONTROL_PLANE)..."
+  if mgmt talos version --nodes "$CONTROL_PLANE" --short &>/dev/null; then
+    ok " talosctl → $CONTROL_PLANE reachable"
   else
-    warn " talos → $CONTROL_PLANE NOT reachable"
+    warn " talosctl → $CONTROL_PLANE NOT reachable"
   fi
 
   log "⑤ Longhorn volumes..."
@@ -175,20 +177,48 @@ for node in "${WORKER_NODES[@]}"; do
 done
 echo ""
 
+# ─── Helper: nuke all Longhorn PDBs ──────────────────────────────────────────
+delete_longhorn_pdbs() {
+  local PDBS pdb
+  PDBS=$(mgmt kubectl get pdb -n longhorn-system --no-headers -o custom-columns="NAME:.metadata.name" 2>/dev/null || true)
+  [[ -n "$PDBS" ]] || return 0
+  # ssh reads the loop's stdin unless it is redirected; without </dev/null only the first PDB gets deleted.
+  while IFS= read -r pdb; do
+    mgmt kubectl delete pdb "$pdb" -n longhorn-system </dev/null 2>/dev/null && \
+      ok " Deleted PDB: $pdb" || true
+  done <<<"$PDBS"
+}
+
 # ─── Step 4: Drain workloads ─────────────────────────────────────────────────
 log "Step 4/7 — Draining workloads from worker nodes (timeout: ${DRAIN_TIMEOUT}s)..."
 for node in "${WORKER_NODES[@]}"; do
-  NODE_NAME=$(mgmt kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" | grep "$node" | awk '{print $1}' || true)
+  NODE_NAME=$(mgmt kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" \
+    | awk -v ip="$node" '$2 == ip {print $1}' || true)
   if [[ -n "$NODE_NAME" ]]; then
+    log " Clearing Longhorn PDBs before draining $NODE_NAME..."
+    delete_longhorn_pdbs
+
     log " Draining $NODE_NAME..."
+    # Run drain in background, keep killing PDBs while it works
     mgmt kubectl drain "$NODE_NAME" \
       --ignore-daemonsets \
       --delete-emptydir-data \
       --force \
       --timeout="${DRAIN_TIMEOUT}s" \
-      --grace-period=30 && \
+      --grace-period=30 &
+    DRAIN_PID=$!
+
+    # Poll and delete any new PDBs while drain is running
+    while kill -0 "$DRAIN_PID" 2>/dev/null; do
+      sleep 4
+      delete_longhorn_pdbs
+    done
+
+    wait "$DRAIN_PID" && \
       ok " $NODE_NAME drained" || \
       warn " $NODE_NAME drain had warnings (DaemonSets left behind is normal)"
+  else
+    warn " Could not find node for IP $node — skipping"
   fi
 done
 echo ""
@@ -214,28 +244,70 @@ while true; do
 done
 echo ""
 
-# ─── Step 6: Shut down Talos nodes (workers first, then control plane) ────────
+
+# ─── Helper: wait for a Talos node to disappear from kubectl ─────────────────
+wait_for_node_gone() {
+  # $1: node IP as shown in the IP column of `kubectl get nodes`. Always returns 0:
+  # when the node is no longer listed, when it reports NotReady, and (with a warning)
+  # once the timeout expires, so a slow node does not abort the shutdown sequence.
+  local node_ip="$1" timeout=120 elapsed=0 interval=5 status ready
+  while true; do
+    # Compare the IP column exactly; a grep for 10.0.30.2 would also match 10.0.30.21.
+    status=$(mgmt kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" \
+      2>/dev/null | awk -v ip="$node_ip" '$2 == ip {print $1}' || true)
+    if [[ -z "$status" ]]; then
+      ok " $node_ip — node gone from cluster"
+      return 0
+    fi
+    ready=$(mgmt kubectl get nodes --no-headers 2>/dev/null | awk -v name="$status" '$1 == name {print $2}' || true)
+    # Drained (cordoned) nodes report "NotReady,SchedulingDisabled", so test the prefix, not equality.
+    if [[ "$ready" == NotReady* ]]; then
+      ok " $node_ip — node is NotReady (safe to proceed)"
+      return 0
+    fi
+    if [[ "$elapsed" -ge "$timeout" ]]; then
+      warn " $node_ip — still Ready after ${timeout}s, proceeding anyway"
+      return 0
+    fi
+    log " $node_ip — waiting for NotReady... (${elapsed}s)"
+    sleep "$interval"
+    elapsed=$((elapsed + interval))
+  done
+}
+
+# ─── Step 6: Shut down Talos nodes ───────────────────────────────────────────
 log "Step 6/7 — Initiating Talos shutdown sequence..."
 
 log " Shutting down worker nodes first..."
 for node in "${WORKER_NODES[@]}"; do
   log " Sending shutdown to $node..."
-  mgmt talosctl shutdown --nodes "$node" --force 2>/dev/null && \
-    ok " $node shutdown initiated" || \
-    warn " talos shutdown failed for $node — trying SSH poweroff"
-  # Fallback: direct SSH if talos fails
+
+  if mgmt talos shutdown --nodes "$node" --force --wait=false 2>/dev/null; then
+    ok " $node shutdown initiated"
+  else
+    warn " talosctl shutdown failed for $node — trying SSH poweroff"
   ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@$node" "poweroff" 2>/dev/null || true
+  fi
+
+  # Wait for the node to actually go NotReady before moving to the next one
+  wait_for_node_gone "$node"
   sleep 5
 done
 
-log " Waiting 30s for workers to power off before stopping control plane..."
-sleep 30
+log " Waiting 15s for workers to fully power off before stopping control plane..."
+sleep 15
 
 log " Shutting down control plane node ($CONTROL_PLANE)..."
-mgmt talosctl shutdown --nodes "$CONTROL_PLANE" --force 2>/dev/null && \
-  ok " Control plane shutdown initiated" || \
-  warn " talos shutdown failed for control plane"
+if mgmt talos shutdown --nodes "$CONTROL_PLANE" --force --wait=false 2>/dev/null; then
+  ok " Control plane shutdown initiated"
+else
+  warn " talosctl shutdown failed for control plane — trying SSH poweroff"
+  ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@$CONTROL_PLANE" "poweroff" 2>/dev/null || true
+fi
+wait_for_node_gone "$CONTROL_PLANE"
 echo ""
 
+
+
 # ─── Step 7: Stop remaining Proxmox VMs/CTs, then host action ────────────────
 log "Step 7/7 — Stopping remaining Proxmox VMs and containers..."