Update pve/k8s-talos-safe-shutdown.sh

This commit is contained in:
2026-04-21 15:34:03 +00:00
parent 68bc7535d0
commit c7a38df358

View File

@@ -1,16 +1,17 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# ============================================================================= # =============================================================================
# k8s-talos-safe-shutdown.sh # k8s-safe-shutdown.sh
# Safe shutdown/reboot for Talos K8s cluster before powering off Proxmox host # Safe shutdown/reboot for Talos K8s cluster before powering off Proxmox host
# #
# Usage (from Proxmox host): # Usage (from Proxmox host):
# chmod +x k8s-talos-safe-shutdown.sh # chmod +x k8s-safe-shutdown.sh
# ./k8s-talos-safe-shutdown.sh # graceful shutdown # ./k8s-safe-shutdown.sh # graceful shutdown
# ./k8s-talos-safe-shutdown.sh --reboot # graceful reboot # ./k8s-safe-shutdown.sh --reboot # graceful reboot
# #
# Requirements on Proxmox host: # Requirements on Proxmox host:
# - SSH key access to your cluster management node # - SSH key access to your cluster management node
# - talos + kubectl installed on that node # - talosctl + kubectl installed on that node
# (`talos` is a wrapper function on that node that runs: talosctl --talosconfig ~/.config/talosconfig -n "${TALOS_DEFAULT_NODE:-10.0.30.21}" "${args[@]}")
# - TALOSCONFIG / KUBECONFIG set on that node # - TALOSCONFIG / KUBECONFIG set on that node
# ============================================================================= # =============================================================================
@@ -35,9 +36,10 @@ for arg in "$@"; do
done done
# ─── Configuration ──────────────────────────────────────────────────────────── # ─── Configuration ────────────────────────────────────────────────────────────
MGMT_HOST="10.0.30.40" # SSH target: your cluster mgmt node MGMT_HOST="10.0.30.40" # SSH target: your cluster mgmt node / jumpbox
MGMT_USER="sysadmin" # SSH user MGMT_USER="sysadmin" # SSH user
SSH_KEY="$HOME/.ssh/id_ed25519" # SSH key on this Proxmox host SSH_KEY="$HOME/.ssh/id_ed25519" # SSH key on this Proxmox host
SSH_PORT="3333"
# Talos node IPs (workers first, control plane last) # Talos node IPs (workers first, control plane last)
WORKER_NODES=( WORKER_NODES=(
@@ -64,7 +66,7 @@ warn() { echo -e "${YELLOW}[$(date +%H:%M:%S)] ⚠${RESET} $*"; }
fail() { echo -e "${RED}[$(date +%H:%M:%S)] ✗${RESET} $*"; exit 1; } fail() { echo -e "${RED}[$(date +%H:%M:%S)] ✗${RESET} $*"; exit 1; }
# Helper: run a command on the management node over SSH # Helper: run a command on the management node over SSH
mgmt() { ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "${MGMT_USER}@${MGMT_HOST}" "$@"; } mgmt() { ssh -i "$SSH_KEY" -p "$SSH_PORT" -o StrictHostKeyChecking=no "${MGMT_USER}@${MGMT_HOST}" "$@"; }
# ─── Preflight ──────────────────────────────────────────────────────────────── # ─── Preflight ────────────────────────────────────────────────────────────────
echo "" echo ""
@@ -103,20 +105,20 @@ if [[ "$TEST_ONLY" == true ]]; then
fi fi
done done
log "③ talos — worker nodes..." log "③ talosctl — worker nodes..."
for node in "${WORKER_NODES[@]}"; do for node in "${WORKER_NODES[@]}"; do
if mgmt talosctl version --nodes "$node" --short &>/dev/null; then if mgmt talos version --nodes "$node" --short &>/dev/null; then
ok " talos → $node reachable" ok " talosctl$node reachable"
else else
warn " talos → $node NOT reachable" warn " talosctl$node NOT reachable"
fi fi
done done
log "④ talos — control plane ($CONTROL_PLANE)..." log "④ talosctl — control plane ($CONTROL_PLANE)..."
if mgmt talosctl version --nodes "$CONTROL_PLANE" --short &>/dev/null; then if mgmt talos version --nodes "$CONTROL_PLANE" --short &>/dev/null; then
ok " talos → $CONTROL_PLANE reachable" ok " talosctl$CONTROL_PLANE reachable"
else else
warn " talos → $CONTROL_PLANE NOT reachable" warn " talosctl$CONTROL_PLANE NOT reachable"
fi fi
log "⑤ Longhorn volumes..." log "⑤ Longhorn volumes..."
@@ -175,20 +177,48 @@ for node in "${WORKER_NODES[@]}"; do
done done
echo "" echo ""
# ─── Helper: nuke all Longhorn PDBs ──────────────────────────────────────────
#######################################
# Best-effort removal of every PodDisruptionBudget in the longhorn-system
# namespace, so that they cannot hold up a node drain.
# Globals:   none (uses the mgmt and ok helpers)
# Arguments: none
# Outputs:   one ok() line per PDB that was actually deleted
# Returns:   0 always; listing and deletion failures are deliberately ignored
#######################################
delete_longhorn_pdbs() {
  local pdb_names pdb

  # A failed listing (API unreachable, namespace absent) simply means there is
  # nothing to delete, hence the stderr suppression and the `|| true`.
  pdb_names=$(mgmt kubectl get pdb -n longhorn-system --no-headers \
    -o custom-columns="NAME:.metadata.name" 2>/dev/null </dev/null || true)
  [[ -n "$pdb_names" ]] || return 0

  while IFS= read -r pdb; do
    [[ -n "$pdb" ]] || continue
    # mgmt runs ssh, which reads from stdin. Without </dev/null it would
    # swallow the rest of the name list and only the first PDB would be deleted.
    if mgmt kubectl delete pdb "$pdb" -n longhorn-system 2>/dev/null </dev/null; then
      ok " Deleted PDB: $pdb"
    fi
  done <<<"$pdb_names"

  return 0
}
# ─── Step 4: Drain workloads ───────────────────────────────────────────────── # ─── Step 4: Drain workloads ─────────────────────────────────────────────────
log "Step 4/7 — Draining workloads from worker nodes (timeout: ${DRAIN_TIMEOUT}s)..." log "Step 4/7 — Draining workloads from worker nodes (timeout: ${DRAIN_TIMEOUT}s)..."
for node in "${WORKER_NODES[@]}"; do for node in "${WORKER_NODES[@]}"; do
NODE_NAME=$(mgmt kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" | grep "$node" | awk '{print $1}' || true) NODE_NAME=$(mgmt kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" \
| grep "$node" | awk '{print $1}' || true)
if [[ -n "$NODE_NAME" ]]; then if [[ -n "$NODE_NAME" ]]; then
log " Clearing Longhorn PDBs before draining $NODE_NAME..."
delete_longhorn_pdbs
log " Draining $NODE_NAME..." log " Draining $NODE_NAME..."
# Run drain in background, keep killing PDBs while it works
mgmt kubectl drain "$NODE_NAME" \ mgmt kubectl drain "$NODE_NAME" \
--ignore-daemonsets \ --ignore-daemonsets \
--delete-emptydir-data \ --delete-emptydir-data \
--force \ --force \
--timeout="${DRAIN_TIMEOUT}s" \ --timeout="${DRAIN_TIMEOUT}s" \
--grace-period=30 && \ --grace-period=30 &
DRAIN_PID=$!
# Poll and delete any new PDBs while drain is running
while kill -0 "$DRAIN_PID" 2>/dev/null; do
sleep 4
delete_longhorn_pdbs
done
wait "$DRAIN_PID" && \
ok " $NODE_NAME drained" || \ ok " $NODE_NAME drained" || \
warn " $NODE_NAME drain had warnings (DaemonSets left behind is normal)" warn " $NODE_NAME drain had warnings (DaemonSets left behind is normal)"
else
warn " Could not find node for IP $node — skipping"
fi fi
done done
echo "" echo ""
@@ -214,28 +244,70 @@ while true; do
done done
echo "" echo ""
# ─── Step 6: Shut down Talos nodes (workers first, then control plane) ────────
# ─── Helper: wait for a Talos node to disappear from kubectl ─────────────────
#######################################
# Poll the cluster until a node that was told to shut down is either no longer
# listed by kubectl or is reported as NotReady.
# Globals:   none (uses the mgmt, ok, warn and log helpers)
# Arguments: $1 - node IP, compared against .status.addresses[0].address
#            $2 - optional timeout in seconds (default: 120)
# Outputs:   progress lines via log/ok/warn
# Returns:   0 always; a timeout is only warned about, never fatal
#######################################
wait_for_node_gone() {
  local node_ip="$1"
  local timeout="${2:-120}"
  local elapsed=0
  local interval=5
  local node_name node_state

  while true; do
    # Resolve IP -> node name with an exact field comparison, so that
    # 10.0.30.2 cannot match 10.0.30.21 (and "." is not a regex wildcard).
    node_name=$(mgmt kubectl get nodes --no-headers \
      -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" 2>/dev/null \
      | awk -v ip="$node_ip" '$2 == ip { print $1; exit }' || true)

    # An empty result also occurs when the kubectl call itself fails (for
    # example once the API server is down); that is treated as "gone" too.
    if [[ -z "$node_name" ]]; then
      ok " $node_ip — node gone from cluster"
      return 0
    fi

    # Exact name comparison, so "worker-1" does not also select "worker-10".
    node_state=$(mgmt kubectl get nodes --no-headers 2>/dev/null \
      | awk -v name="$node_name" '$1 == name { print $2; exit }' || true)

    # A cordoned node is shown as "NotReady,SchedulingDisabled", so match on
    # the prefix rather than on the bare word.
    if [[ "$node_state" == NotReady* ]]; then
      ok " $node_ip — node is NotReady (safe to proceed)"
      return 0
    fi

    if (( elapsed >= timeout )); then
      warn " $node_ip — still Ready after ${timeout}s, proceeding anyway"
      return 0
    fi

    log " $node_ip — waiting for NotReady... (${elapsed}s)"
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done
}
# ─── Step 6: Shut down Talos nodes ───────────────────────────────────────────
log "Step 6/7 — Initiating Talos shutdown sequence..." log "Step 6/7 — Initiating Talos shutdown sequence..."
log " Shutting down worker nodes first..." log " Shutting down worker nodes first..."
for node in "${WORKER_NODES[@]}"; do for node in "${WORKER_NODES[@]}"; do
log " Sending shutdown to $node..." log " Sending shutdown to $node..."
mgmt talosctl shutdown --nodes "$node" --force 2>/dev/null && \
ok " $node shutdown initiated" || \ if mgmt talos shutdown --nodes "$node" --force --wait=false 2>/dev/null; then
warn " talos shutdown failed for $node — trying SSH poweroff" ok " $node shutdown initiated"
# Fallback: direct SSH if talos fails else
warn " talosctl shutdown failed for $node — trying SSH poweroff"
ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@$node" "poweroff" 2>/dev/null || true ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@$node" "poweroff" 2>/dev/null || true
fi
# Wait for the node to actually go NotReady before moving to the next one
wait_for_node_gone "$node"
sleep 5 sleep 5
done done
log " Waiting 30s for workers to power off before stopping control plane..." log " Waiting 15s for workers to fully power off before stopping control plane..."
sleep 30 sleep 15
log " Shutting down control plane node ($CONTROL_PLANE)..." log " Shutting down control plane node ($CONTROL_PLANE)..."
mgmt talosctl shutdown --nodes "$CONTROL_PLANE" --force 2>/dev/null && \ if mgmt talos shutdown --nodes "$CONTROL_PLANE" --force --wait=false 2>/dev/null; then
ok " Control plane shutdown initiated" || \ ok " Control plane shutdown initiated"
warn " talos shutdown failed for control plane" else
warn " talosctl shutdown failed for control plane — trying SSH poweroff"
ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@$CONTROL_PLANE" "poweroff" 2>/dev/null || true
fi
wait_for_node_gone "$CONTROL_PLANE"
echo "" echo ""
# ─── Step 7: Stop remaining Proxmox VMs/CTs, then host action ──────────────── # ─── Step 7: Stop remaining Proxmox VMs/CTs, then host action ────────────────
log "Step 7/7 — Stopping remaining Proxmox VMs and containers..." log "Step 7/7 — Stopping remaining Proxmox VMs and containers..."