#!/usr/bin/env bash set -euo pipefail usage() { cat < [options] Options: --yes Skip confirmation prompt --skip-drain Do not cordon/drain; only capture recovery artifacts --delete-node Delete Node object after drain (for hard-dead node replacement) --out-dir Recovery artifact directory (default: ./artifacts/node-recovery) -h, --help Show this help USAGE } if ! command -v kubectl >/dev/null 2>&1; then echo "kubectl is required" >&2 exit 1 fi if ! command -v jq >/dev/null 2>&1; then echo "jq is required" >&2 exit 1 fi if [ "$#" -lt 1 ]; then usage exit 1 fi node="" assume_yes="false" skip_drain="false" delete_node="false" out_dir="./artifacts/node-recovery" while [ "$#" -gt 0 ]; do case "$1" in --yes) assume_yes="true" shift ;; --skip-drain) skip_drain="true" shift ;; --delete-node) delete_node="true" shift ;; --out-dir) out_dir="$2" shift 2 ;; -h|--help) usage exit 0 ;; -*) echo "Unknown option: $1" >&2 usage exit 1 ;; *) if [ -z "${node}" ]; then node="$1" else echo "Unexpected argument: $1" >&2 usage exit 1 fi shift ;; esac done if [ -z "${node}" ]; then echo "Node name is required" >&2 usage exit 1 fi if ! kubectl get node "${node}" >/dev/null 2>&1; then echo "Node ${node} not found in cluster API" >&2 exit 1 fi if [ "${assume_yes}" != "true" ]; then echo "About to prepare recovery workflow for node: ${node}" echo "skip_drain=${skip_drain} delete_node=${delete_node}" read -r -p "Type the node name to continue: " confirm if [ "${confirm}" != "${node}" ]; then echo "Confirmation did not match node name; aborting." exit 1 fi fi timestamp="$(date +%Y%m%d-%H%M%S)" artifacts_dir="${out_dir}/${node}-${timestamp}" mkdir -p "${artifacts_dir}" echo "Saving node and workload artifacts to ${artifacts_dir}" kubectl get node "${node}" -o json > "${artifacts_dir}/node.json" kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt" kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt" jq -r ' .metadata.labels | to_entries[] | select( .key != "kubernetes.io/hostname" and .key != "beta.kubernetes.io/hostname" and .key != "node.kubernetes.io/instance-type" and .key != "beta.kubernetes.io/instance-type" and (.key | startswith("kubernetes.io/") | not) and (.key | startswith("beta.kubernetes.io/") | not) and (.key | startswith("node.kubernetes.io/") | not) ) | "kubectl label node " + .key + "=" + .value + " --overwrite" ' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh" jq -r ' (.spec.taints // [])[] | "kubectl taint node " + .key + (if .value then "=" + .value else "" end) + ":" + .effect + " --overwrite" ' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh" chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh" if [ "${skip_drain}" != "true" ]; then echo "Cordoning ${node}" kubectl cordon "${node}" || true echo "Draining ${node}" if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then echo "Standard drain failed; retrying with --force" if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then echo "Force drain failed; retrying with --disable-eviction" kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction fi fi fi if [ "${delete_node}" = "true" ]; then echo "Deleting node object ${node}" kubectl delete node "${node}" || true fi cat <