#!/usr/bin/env bash
# node_recover.sh — prepare recovery of a failing/dead Kubernetes node.
# Captures the node's labels, taints, and resident pods as artifacts,
# generates restore-labels.sh / restore-taints.sh for the replacement
# host, then (unless --skip-drain) cordons and drains the node, and
# (with --delete-node) removes the Node object from the API.
# Requires: kubectl, jq. See usage() below for flags.
set -euo pipefail
# Print CLI usage to stdout (called for -h/--help and on argument errors).
# Heredoc delimiter is quoted: the text is literal, no expansion intended.
usage() {
cat <<'USAGE'
Usage: scripts/node_recover.sh <node-name> [options]
Options:
  --yes            Skip confirmation prompt
  --skip-drain     Do not cordon/drain; only capture recovery artifacts
  --delete-node    Delete Node object after drain (for hard-dead node replacement)
  --out-dir <dir>  Recovery artifact directory (default: ./artifacts/node-recovery)
  -h, --help       Show this help
USAGE
}
# Preflight: both CLI dependencies must be on PATH before doing anything.
for required_tool in kubectl jq; do
if ! command -v "${required_tool}" >/dev/null 2>&1; then
echo "${required_tool} is required" >&2
exit 1
fi
done
# A node name is mandatory; bail out with help when no arguments given.
if [ "$#" -lt 1 ]; then
usage
exit 1
fi
# Defaults for the positional argument and all flags; overridden during
# option parsing below.
node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"
# Parse flags plus the single positional <node-name>.
# Sets: node, assume_yes, skip_drain, delete_node, out_dir.
while [ "$#" -gt 0 ]; do
case "$1" in
--yes)
assume_yes="true"
shift
;;
--skip-drain)
skip_drain="true"
shift
;;
--delete-node)
delete_node="true"
shift
;;
--out-dir)
# Without this guard, a trailing --out-dir dies with set -u's
# cryptic "unbound variable" instead of a usage error.
if [ "$#" -lt 2 ]; then
echo "--out-dir requires a directory argument" >&2
usage
exit 1
fi
out_dir="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
-*)
echo "Unknown option: $1" >&2
usage
exit 1
;;
*)
# First bare argument is the node name; any further one is an error.
if [ -z "${node}" ]; then
node="$1"
else
echo "Unexpected argument: $1" >&2
usage
exit 1
fi
shift
;;
esac
done
# The positional node name is mandatory.
if [ -z "${node}" ]; then
echo "Node name is required" >&2
usage
exit 1
fi
# Abort early if the API server does not know this node.
if ! kubectl get node "${node}" >/dev/null 2>&1; then
echo "Node ${node} not found in cluster API" >&2
exit 1
fi
# Interactive guard against draining the wrong node: the operator must
# retype the exact node name. Skipped with --yes (e.g. for automation).
if [ "${assume_yes}" != "true" ]; then
echo "About to prepare recovery workflow for node: ${node}"
echo "skip_drain=${skip_drain} delete_node=${delete_node}"
read -r -p "Type the node name to continue: " confirm
if [ "${confirm}" != "${node}" ]; then
echo "Confirmation did not match node name; aborting."
exit 1
fi
fi
# One timestamped artifact directory per recovery attempt, so repeated
# runs against the same node never overwrite each other.
timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"
echo "Saving node and workload artifacts to ${artifacts_dir}"
# node.json is machine-readable input for the jq extraction below;
# the two .txt files are human-readable snapshots for the operator.
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
# Generate restore-labels.sh: one "kubectl label" command per
# operator-applied label. System-managed labels (kubernetes.io/,
# beta.kubernetes.io/ and node.kubernetes.io/ prefixes) are excluded,
# since those are reapplied automatically when the replacement joins.
# NOTE(review): the four explicit .key != checks are redundant with the
# three prefix filters below them; kept, presumably for readability.
# The operator replaces the <replacement-node> placeholder before running.
jq -r '
.metadata.labels
| to_entries[]
| select(
.key != "kubernetes.io/hostname"
and .key != "beta.kubernetes.io/hostname"
and .key != "node.kubernetes.io/instance-type"
and .key != "beta.kubernetes.io/instance-type"
and (.key | startswith("kubernetes.io/") | not)
and (.key | startswith("beta.kubernetes.io/") | not)
and (.key | startswith("node.kubernetes.io/") | not)
)
| "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
# Generate restore-taints.sh from .spec.taints. "// []" handles nodes
# with no taints; the "=value" segment is conditional because a taint's
# value is optional (key:effect alone is valid).
jq -r '
(.spec.taints // [])[]
| "kubectl taint node <replacement-node> "
+ .key
+ (if .value then "=" + .value else "" end)
+ ":"
+ .effect
+ " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
# Cordon + drain unless --skip-drain (artifact-only mode).
if [ "${skip_drain}" != "true" ]; then
echo "Cordoning ${node}"
# "|| true" is intentional best-effort: a cordon failure (e.g. node
# already unschedulable or a transient API error) should not abort
# the recovery before the drain attempts below.
kubectl cordon "${node}" || true
echo "Draining ${node}"
# Escalation ladder, each attempt 20m max:
#   1) normal eviction;
#   2) --force (also removes pods not managed by a controller);
#   3) --disable-eviction (direct delete, bypassing PodDisruptionBudgets).
# The final attempt is NOT guarded, so if it fails too, set -e aborts
# the script before the --delete-node step.
if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
echo "Standard drain failed; retrying with --force"
if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
echo "Force drain failed; retrying with --disable-eviction"
kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
fi
fi
fi
# With --delete-node, remove the Node object so a replacement host can
# join under a clean identity. Best-effort ("|| true"): a failed delete
# should still print the runbook below.
if [ "${delete_node}" = "true" ]; then
echo "Deleting node object ${node}"
kubectl delete node "${node}" || true
fi
# Operator runbook. Unquoted heredoc on purpose: the node name and
# artifact paths must expand.
cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}
Next steps:
1) Reimage/reprovision replacement host.
2) Rejoin k3s and wait for node Ready.
3) Reapply labels: ${artifacts_dir}/restore-labels.sh
4) Reapply taints: ${artifacts_dir}/restore-taints.sh
5) Validate pods and uncordon replacement when ready.
NEXT