#!/usr/bin/env bash
# =============================================================================
# k8s-talos-safe-shutdown.sh
# Safe shutdown/reboot for Talos K8s cluster before powering off Proxmox host
#
# Usage (from Proxmox host):
#   chmod +x k8s-talos-safe-shutdown.sh
#   ./k8s-talos-safe-shutdown.sh            # graceful shutdown (default)
#   ./k8s-talos-safe-shutdown.sh --reboot   # graceful reboot
#   ./k8s-talos-safe-shutdown.sh --test     # connectivity checks only, no changes
#
# Requirements:
#   - On the Proxmox host: SSH key access to your cluster management node
#   - On the management node: talosctl + kubectl installed (flux CLI optional)
#   - On the management node: TALOSCONFIG / KUBECONFIG set
# =============================================================================

set -euo pipefail

# ─── Arguments ───────────────────────────────────────────────────────────────
ACTION="shutdown"   # host action once the cluster is down: shutdown | reboot
TEST_ONLY=false     # true → connectivity checks only

# Print the command-line synopsis to stdout.
usage() {
  echo "Usage: $0 [--shutdown|--reboot|--test]"
  echo "  --shutdown  Gracefully stop cluster and power off host (default)"
  echo "  --reboot    Gracefully stop cluster and reboot host"
  echo "  --test      Check all connectivity only — no changes made"
}

#######################################
# Parse the command-line flags.
# Globals:   ACTION, TEST_ONLY (written)
# Arguments: the script's arguments ("$@")
# Outputs:   --help prints the synopsis to stdout; an unknown flag prints the
#            error and the synopsis to stderr
# Returns:   exits 0 on --help/-h, exits 1 on an unknown argument
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --reboot) ACTION="reboot" ;;
      --shutdown) ACTION="shutdown" ;;
      --test) TEST_ONLY=true ;;
      --help|-h)
        usage
        exit 0 ;;
      *)
        # Diagnostics belong on stderr so they are not lost in piped stdout.
        echo "Unknown argument: $arg" >&2
        usage >&2
        exit 1 ;;
    esac
  done
}

parse_args "$@"

# ─── Configuration ────────────────────────────────────────────────────────────
# SSH access to the cluster management node (kubectl/talosctl are run there)
MGMT_USER="sysadmin"
MGMT_HOST="10.0.30.40"
SSH_KEY="${HOME}/.ssh/id_ed25519"   # private key on this Proxmox host

# Talos node IPs: workers are shut down first, the control plane last
CONTROL_PLANE="10.0.30.21"
WORKER_NODES=("10.0.30.30" "10.0.30.31" "10.0.30.32")   # worker-1, worker-2, worker-3

# Timeouts (all in seconds)
DRAIN_TIMEOUT=120         # kubectl drain, per worker node
LONGHORN_TIMEOUT=120      # waiting for Longhorn volumes to detach
VM_SHUTDOWN_TIMEOUT=120   # graceful shutdown of the other Proxmox guests

# ─── Colours ─────────────────────────────────────────────────────────────────
# ANSI escape sequences; kept as literal backslash text and interpreted by `echo -e`
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
RESET='\033[0m'

# log MESSAGE... — print an informational line behind a cyan [HH:MM:SS] stamp.
log() {
  local stamp
  stamp=$(date +%H:%M:%S)
  echo -e "${CYAN}[${stamp}]${RESET} $*"
}
# ok MESSAGE... — print a success line: green [HH:MM:SS] stamp plus a check mark.
ok() {
  local stamp
  stamp=$(date +%H:%M:%S)
  echo -e "${GREEN}[${stamp}] ✓${RESET} $*"
}
# warn MESSAGE... — print a warning line: yellow [HH:MM:SS] stamp plus a warning sign.
# Does not stop the script.
warn() {
  local stamp
  stamp=$(date +%H:%M:%S)
  echo -e "${YELLOW}[${stamp}] ⚠${RESET} $*"
}
# fail MESSAGE... — print an error line (red [HH:MM:SS] stamp plus a cross),
# then terminate the script with exit status 1.
fail() {
  local stamp
  stamp=$(date +%H:%M:%S)
  echo -e "${RED}[${stamp}] ✗${RESET} $*"
  exit 1
}

#######################################
# Run a command on the management node over SSH.
# Globals:   SSH_KEY, MGMT_USER, MGMT_HOST (read)
# Arguments: the remote command and its arguments
# Returns:   the exit status of ssh
# Notes:     - Host key checking is disabled (StrictHostKeyChecking=no).
#            - ssh hands the arguments to the remote shell as a single command
#              line, so they are parsed a second time on the remote side.
#            - ssh reads this shell's stdin; callers inside a `while read`
#              loop should redirect stdin (e.g. </dev/null).
#######################################
mgmt() {
  local -a ssh_opts=(-i "$SSH_KEY" -o StrictHostKeyChecking=no)
  local target="${MGMT_USER}@${MGMT_HOST}"
  ssh "${ssh_opts[@]}" "$target" "$@"
}

# ─── Preflight ────────────────────────────────────────────────────────────────
#######################################
# Print the boxed title banner for the selected mode.
# Test mode takes priority over the host action, because with --test the
# script only runs connectivity checks and exits, whatever ACTION says.
# Globals: TEST_ONLY, ACTION, BOLD, RESET (read)
# Outputs: blank line, three banner lines, blank line on stdout
#######################################
print_banner() {
  local title
  if [[ "$TEST_ONLY" == true ]]; then
    title="║        K8S CONNECTIVITY TEST — no changes            ║"
  elif [[ "$ACTION" == "reboot" ]]; then
    title="║        K8S SAFE REBOOT — Talos / Proxmox             ║"
  else
    title="║        K8S SAFE SHUTDOWN — Talos / Proxmox           ║"
  fi

  echo ""
  echo -e "${BOLD}╔══════════════════════════════════════════════════════╗${RESET}"
  echo -e "${BOLD}${title}${RESET}"
  echo -e "${BOLD}╚══════════════════════════════════════════════════════╝${RESET}"
  echo ""
}

print_banner

#######################################
# Connectivity self-test (--test): probes SSH, kubectl, talosctl, Longhorn and
# the local Proxmox guest lists. Only read-only commands are issued.
# Globals:   MGMT_USER, MGMT_HOST, WORKER_NODES, CONTROL_PLANE (read)
# Outputs:   one line per check via log/ok/warn; the closing line says
#            "all checks passed" only when no check produced a warning
# Returns:   0 when the checks ran to completion; calls fail (exit 1) when
#            SSH to the management node or listing the cluster nodes fails
#######################################
run_connectivity_test() {
  local issues=0
  local node_list name status node volumes attached total vm_count ct_count

  log "TEST MODE — checking connectivity only, nothing will be changed"
  echo ""

  log "① SSH to management node ($MGMT_USER@$MGMT_HOST)..."
  if mgmt echo "ok" &>/dev/null; then
    ok "  SSH connection successful"
  else
    fail "  Cannot SSH to $MGMT_HOST — check MGMT_HOST, MGMT_USER and SSH_KEY"
  fi

  log "② kubectl — cluster nodes..."
  # Capture first so a kubectl failure is reported instead of silently
  # aborting the script through pipefail.
  if ! node_list=$(mgmt kubectl get nodes --no-headers); then
    fail "  kubectl could not list cluster nodes via $MGMT_HOST"
  fi
  while read -r name status _; do
    [[ -n "$name" ]] || continue
    if [[ "$status" == "Ready" ]]; then
      ok "  $name → $status"
    else
      warn "  $name → $status"
      issues=$((issues + 1))
    fi
  done <<<"$node_list"

  log "③ talos — worker nodes..."
  for node in "${WORKER_NODES[@]}"; do
    if mgmt talosctl version --nodes "$node" --short &>/dev/null; then
      ok "  talos → $node reachable"
    else
      warn "  talos → $node NOT reachable"
      issues=$((issues + 1))
    fi
  done

  log "④ talos — control plane ($CONTROL_PLANE)..."
  if mgmt talosctl version --nodes "$CONTROL_PLANE" --short &>/dev/null; then
    ok "  talos → $CONTROL_PLANE reachable"
  else
    warn "  talos → $CONTROL_PLANE NOT reachable"
    issues=$((issues + 1))
  fi

  log "⑤ Longhorn volumes..."
  if volumes=$(mgmt kubectl get volumes -n longhorn-system --no-headers 2>/dev/null); then
    # grep -c prints the count even when it is 0 (and then exits 1).
    attached=$(grep -c "attached" <<<"$volumes" || true)
    total=$(grep -c . <<<"$volumes" || true)
    ok "  $attached/$total volumes currently attached"
  else
    warn "  Could not list Longhorn volumes"
    issues=$((issues + 1))
  fi

  log "⑥ Proxmox guests..."
  # awk prints the count itself, so a failing qm/pct yields a single "0".
  vm_count=$(qm list 2>/dev/null | awk 'NR>1 && $3=="running" {n++} END {print n+0}' || true)
  ct_count=$(pct list 2>/dev/null | awk 'NR>1 && $2=="running" {n++} END {print n+0}' || true)
  ok "  $vm_count running VMs, $ct_count running CTs"

  echo ""
  if (( issues == 0 )); then
    ok "Test complete — all checks passed. Ready to run with --shutdown or --reboot."
  else
    warn "Test complete — $issues check(s) reported a warning. Review them before running with --shutdown or --reboot."
  fi
}

if [[ "$TEST_ONLY" == true ]]; then
  run_connectivity_test
  exit 0
fi
echo ""

# Spell out the consequences, then require an explicit, case-sensitive YES.
for notice in \
  "This will gracefully stop your entire Kubernetes cluster." \
  "All workloads will be stopped and volumes detached cleanly." \
  "Then all other Proxmox VMs will be stopped before the host ${ACTION}s."; do
  warn "$notice"
done
echo ""

confirm_prompt=$(echo -e "${YELLOW}Action: ${ACTION^^} — Type YES to continue: ${RESET}")
read -rp "$confirm_prompt" CONFIRM
if [[ "$CONFIRM" != "YES" ]]; then
  # Anything other than YES is a clean abort, not an error.
  echo "Aborted."
  exit 0
fi
echo ""

# ─── Step 1: Verify cluster is reachable ─────────────────────────────────────
#######################################
# List the cluster nodes and report each one's status.
# A node that is not Ready only produces a warning; failing to list the nodes
# at all stops the script with an explicit message.
# Globals: MGMT_HOST (read)
# Outputs: one ok/warn line per node
# Returns: 0 after reporting; calls fail (exit 1) when kubectl cannot list nodes
#######################################
report_cluster_nodes() {
  local node_list name status
  if ! node_list=$(mgmt kubectl get nodes --no-headers); then
    fail "  Cannot list cluster nodes via $MGMT_HOST — aborting before anything is changed"
  fi
  while read -r name status _; do
    [[ -n "$name" ]] || continue
    if [[ "$status" == "Ready" ]]; then
      ok "  $name → $status"
    else
      warn "  $name → $status (not Ready — proceeding anyway)"
    fi
  done <<<"$node_list"
}

log "Step 1/7 — Checking cluster connectivity..."
report_cluster_nodes
echo ""

# ─── Step 2: Suspend Flux reconciliation ─────────────────────────────────────
#######################################
# Suspend every Flux kustomization so nothing is reconciled during shutdown.
# Best effort: a failed listing or a failed suspend only produces a warning.
# Outputs: one ok/warn line per kustomization, or a single skip warning
# Returns: 0 always
#######################################
suspend_flux_kustomizations() {
  local kustomizations ns name
  if ! kustomizations=$(mgmt kubectl get kustomizations -A --no-headers 2>/dev/null); then
    warn "  Flux not found or no kustomizations — skipping"
    return 0
  fi
  while read -r ns name _; do
    [[ -n "$ns" && -n "$name" ]] || continue
    # </dev/null is essential: ssh would otherwise read the loop's stdin and
    # swallow the remaining kustomization lines after the first iteration.
    if mgmt flux suspend kustomization "$name" -n "$ns" </dev/null 2>/dev/null; then
      ok "  Suspended kustomization: $ns/$name"
    else
      warn "  Could not suspend $ns/$name (flux CLI may not be installed — skipping)"
    fi
  done <<<"$kustomizations"
}

log "Step 2/7 — Suspending Flux to prevent reconcile loops during shutdown..."
suspend_flux_kustomizations
echo ""

# ─── Step 3: Cordon all worker nodes ─────────────────────────────────────────
#######################################
# Look up a node name by IP in a "NAME IP" table.
# The IP column is compared for exact equality, so 10.0.30.3 does not also
# select 10.0.30.30 and dots are not treated as regex wildcards.
# Arguments: $1 - table text, one "NAME IP" pair per line
#            $2 - IP address to look up
# Outputs:   the first matching node name, or nothing when there is no match
#######################################
node_name_for_ip() {
  awk -v ip="$2" '$2 == ip { print $1; exit }' <<<"$1"
}

log "Step 3/7 — Cordoning all worker nodes (no new scheduling)..."
# A single lookup serves both the cordon pass and the drain pass below.
# NOTE(review): addresses[0] is assumed to be the node's IP — confirm.
NODE_TABLE=$(mgmt kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,IP:.status.addresses[0].address" || true)
WORKER_NAMES=()   # index-aligned with WORKER_NODES; "" when the IP was not found
for node in "${WORKER_NODES[@]}"; do
  NODE_NAME=$(node_name_for_ip "$NODE_TABLE" "$node")
  WORKER_NAMES+=("$NODE_NAME")
  if [[ -n "$NODE_NAME" ]]; then
    if mgmt kubectl cordon "$NODE_NAME"; then
      ok "  Cordoned $NODE_NAME ($node)"
    else
      warn "  Could not cordon $NODE_NAME"
    fi
  else
    warn "  Could not find node for IP $node — skipping cordon"
  fi
done
echo ""

# ─── Step 4: Drain workloads ─────────────────────────────────────────────────
log "Step 4/7 — Draining workloads from worker nodes (timeout: ${DRAIN_TIMEOUT}s)..."
for NODE_NAME in "${WORKER_NAMES[@]}"; do
  # Nodes that could not be resolved were already reported in Step 3.
  [[ -n "$NODE_NAME" ]] || continue
  log "  Draining $NODE_NAME..."
  if mgmt kubectl drain "$NODE_NAME" \
    --ignore-daemonsets \
    --delete-emptydir-data \
    --force \
    --timeout="${DRAIN_TIMEOUT}s" \
    --grace-period=30; then
    ok "  $NODE_NAME drained"
  else
    warn "  $NODE_NAME drain had warnings (DaemonSets left behind is normal)"
  fi
done
echo ""

# ─── Step 5: Wait for Longhorn volumes to detach ─────────────────────────────
#######################################
# Poll the Longhorn volume list until no line contains "attached" or
# LONGHORN_TIMEOUT seconds have elapsed.
# A failed query is reported as such instead of being counted as "0 attached";
# the wait is then skipped, so a cluster without Longhorn is not delayed.
# Globals: LONGHORN_TIMEOUT (read)
# Outputs: progress lines via log/ok/warn
# Returns: 0 always — a timeout or failed query only produces warnings
#######################################
wait_for_longhorn_detach() {
  local elapsed=0 interval=10
  local volume_list attached
  while true; do
    if ! volume_list=$(mgmt kubectl get volumes -n longhorn-system --no-headers 2>/dev/null); then
      warn "  Could not query Longhorn volumes (Longhorn missing or API unreachable) — skipping wait"
      return 0
    fi
    # grep -c prints the count even when it is 0 (and then exits 1).
    attached=$(grep -c "attached" <<<"$volume_list" || true)
    if (( attached == 0 )); then
      ok "  All Longhorn volumes detached"
      return 0
    fi
    if (( elapsed >= LONGHORN_TIMEOUT )); then
      warn "  Timeout waiting for Longhorn — $attached volume(s) still attached"
      warn "  Proceeding anyway (Longhorn will recover on next boot)"
      return 0
    fi
    log "  $attached volume(s) still attached — waiting ${interval}s... (${elapsed}s elapsed)"
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done
}

log "Step 5/7 — Waiting for Longhorn volumes to detach..."
wait_for_longhorn_detach
echo ""

# ─── Step 6: Shut down Talos nodes (workers first, then control plane) ────────
# Workers are shut down one by one, then — after a pause — the control plane.
# Every step is best effort: a failure is reported and the sequence continues.
log "Step 6/7 — Initiating Talos shutdown sequence..."
log "  Shutting down worker nodes first..."
for node in "${WORKER_NODES[@]}"; do
  log "  Sending shutdown to $node..."
  if mgmt talosctl shutdown --nodes "$node" --force 2>/dev/null; then
    ok "  $node shutdown initiated"
  else
    warn "  talos shutdown failed for $node — trying SSH poweroff"
    # Fallback, attempted only when talosctl failed: direct SSH poweroff.
    # NOTE(review): Talos Linux does not ship an SSH server, so this is
    # expected to succeed only on non-Talos nodes — confirm it is still wanted.
    ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@$node" "poweroff" 2>/dev/null || true
  fi
  sleep 5
done

log "  Waiting 30s for workers to power off before stopping control plane..."
sleep 30

log "  Shutting down control plane node ($CONTROL_PLANE)..."
if mgmt talosctl shutdown --nodes "$CONTROL_PLANE" --force 2>/dev/null; then
  ok "  Control plane shutdown initiated"
else
  warn "  talos shutdown failed for control plane"
fi
echo ""

# ─── Step 7: Stop remaining Proxmox VMs/CTs, then host action ────────────────
#######################################
# Gracefully shut down every VM and container that `qm list` / `pct list`
# report as running. The shutdowns run in parallel and each one's exit status
# is collected, so success is only announced when every command succeeded.
# Globals: VM_SHUTDOWN_TIMEOUT, ACTION (read)
# Outputs: progress lines via log/ok/warn
# Returns: 0 always — guests that fail to stop are reported, and the caller
#          carries on to the host action
#######################################
stop_remaining_guests() {
  local -a vm_ids=() ct_ids=() pids=() labels=()
  local id i failed=0

  mapfile -t vm_ids < <(qm list 2>/dev/null | awk 'NR>1 && $3=="running" {print $1}')
  mapfile -t ct_ids < <(pct list 2>/dev/null | awk 'NR>1 && $2=="running" {print $1}')

  if (( ${#vm_ids[@]} > 0 )); then
    for id in "${vm_ids[@]}"; do
      log "  Stopping VM $id..."
      qm shutdown "$id" --timeout "$VM_SHUTDOWN_TIMEOUT" &
      pids+=("$!")
      labels+=("VM $id")
    done
  else
    ok "  No running VMs found"
  fi

  if (( ${#ct_ids[@]} > 0 )); then
    for id in "${ct_ids[@]}"; do
      log "  Stopping CT $id..."
      pct shutdown "$id" --timeout "$VM_SHUTDOWN_TIMEOUT" &
      pids+=("$!")
      labels+=("CT $id")
    done
  else
    ok "  No running CTs found"
  fi

  log "  Waiting for all guests to stop (up to ${VM_SHUTDOWN_TIMEOUT}s)..."
  if (( ${#pids[@]} > 0 )); then
    for i in "${!pids[@]}"; do
      # A bare `wait` always returns 0; waiting per PID exposes each status.
      if ! wait "${pids[$i]}"; then
        warn "  ${labels[$i]} did not shut down cleanly"
        failed=$((failed + 1))
      fi
    done
  fi

  if (( failed == 0 )); then
    ok "  All guests stopped"
  else
    warn "  $failed guest(s) failed to stop — continuing with host ${ACTION} anyway"
  fi
}

log "Step 7/7 — Stopping remaining Proxmox VMs and containers..."
stop_remaining_guests
echo ""

# ─── Final action ─────────────────────────────────────────────────────────────
# Last step: hand over to the host's own shutdown command.
# Any ACTION other than "reboot" powers the host off.
case "$ACTION" in
  reboot)
    ok "All done! Rebooting host now."
    echo ""
    shutdown -r now
    ;;
  *)
    ok "All done! Powering off host now."
    echo ""
    shutdown -h now
    ;;
esac