#!/usr/bin/env bash
# node_recover.sh — prepare recovery of a failing/dead Kubernetes node.
# Captures the node's labels, taints, and resident pods as artifacts,
# generates restore-labels.sh / restore-taints.sh for the replacement
# host, then (unless --skip-drain) cordons and drains the node, and
# (with --delete-node) removes the Node object from the API.
# Requires: kubectl, jq. See usage() below for flags.
set -euo pipefail
# Print CLI usage to stdout (called for -h/--help and on argument errors).
# Heredoc delimiter is quoted: the text is literal, no expansion intended.
usage() {
cat <<'USAGE'
Usage: scripts/node_recover.sh <node-name> [options]
Options:
  --yes            Skip confirmation prompt
  --skip-drain     Do not cordon/drain; only capture recovery artifacts
  --delete-node    Delete Node object after drain (for hard-dead node replacement)
  --out-dir <dir>  Recovery artifact directory (default: ./artifacts/node-recovery)
  -h, --help       Show this help
USAGE
}
# Preflight: both CLI dependencies must be on PATH before doing anything.
for required_tool in kubectl jq; do
if ! command -v "${required_tool}" >/dev/null 2>&1; then
echo "${required_tool} is required" >&2
exit 1
fi
done
# A node name is mandatory; bail out with help when no arguments given.
if [ "$#" -lt 1 ]; then
usage
exit 1
fi
# Defaults for the positional argument and all flags; overridden during
# option parsing below.
node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"
# Parse flags plus the single positional <node-name>.
# Sets: node, assume_yes, skip_drain, delete_node, out_dir.
while [ "$#" -gt 0 ]; do
case "$1" in
--yes)
assume_yes="true"
shift
;;
--skip-drain)
skip_drain="true"
shift
;;
--delete-node)
delete_node="true"
shift
;;
--out-dir)
# Without this guard, a trailing --out-dir dies with set -u's
# cryptic "unbound variable" instead of a usage error.
if [ "$#" -lt 2 ]; then
echo "--out-dir requires a directory argument" >&2
usage
exit 1
fi
out_dir="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
-*)
echo "Unknown option: $1" >&2
usage
exit 1
;;
*)
# First bare argument is the node name; any further one is an error.
if [ -z "${node}" ]; then
node="$1"
else
echo "Unexpected argument: $1" >&2
usage
exit 1
fi
shift
;;
esac
done
# The positional node name is mandatory.
if [ -z "${node}" ]; then
echo "Node name is required" >&2
usage
exit 1
fi
# Abort early if the API server does not know this node.
if ! kubectl get node "${node}" >/dev/null 2>&1; then
echo "Node ${node} not found in cluster API" >&2
exit 1
fi
# Interactive guard against draining the wrong node: the operator must
# retype the exact node name. Skipped with --yes (e.g. for automation).
if [ "${assume_yes}" != "true" ]; then
echo "About to prepare recovery workflow for node: ${node}"
echo "skip_drain=${skip_drain} delete_node=${delete_node}"
read -r -p "Type the node name to continue: " confirm
if [ "${confirm}" != "${node}" ]; then
echo "Confirmation did not match node name; aborting."
exit 1
fi
fi
# One timestamped artifact directory per recovery attempt, so repeated
# runs against the same node never overwrite each other.
timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"
echo "Saving node and workload artifacts to ${artifacts_dir}"
# node.json is machine-readable input for the jq extraction below;
# the two .txt files are human-readable snapshots for the operator.
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
# Generate restore-labels.sh: one "kubectl label" command per
# operator-applied label. System-managed labels (kubernetes.io/,
# beta.kubernetes.io/ and node.kubernetes.io/ prefixes) are excluded,
# since those are reapplied automatically when the replacement joins.
# NOTE(review): the four explicit .key != checks are redundant with the
# three prefix filters below them; kept, presumably for readability.
# The operator replaces the <replacement-node> placeholder before running.
jq -r '
.metadata.labels
| to_entries[]
| select(
.key != "kubernetes.io/hostname"
and .key != "beta.kubernetes.io/hostname"
and .key != "node.kubernetes.io/instance-type"
and .key != "beta.kubernetes.io/instance-type"
and (.key | startswith("kubernetes.io/") | not)
and (.key | startswith("beta.kubernetes.io/") | not)
and (.key | startswith("node.kubernetes.io/") | not)
)
| "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
# Generate restore-taints.sh from .spec.taints. "// []" handles nodes
# with no taints; the "=value" segment is conditional because a taint's
# value is optional (key:effect alone is valid).
jq -r '
(.spec.taints // [])[]
| "kubectl taint node <replacement-node> "
+ .key
+ (if .value then "=" + .value else "" end)
+ ":"
+ .effect
+ " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
# Cordon + drain unless --skip-drain (artifact-only mode).
if [ "${skip_drain}" != "true" ]; then
echo "Cordoning ${node}"
# "|| true" is intentional best-effort: a cordon failure (e.g. node
# already unschedulable or a transient API error) should not abort
# the recovery before the drain attempts below.
kubectl cordon "${node}" || true
echo "Draining ${node}"
# Escalation ladder, each attempt 20m max:
#   1) normal eviction;
#   2) --force (also removes pods not managed by a controller);
#   3) --disable-eviction (direct delete, bypassing PodDisruptionBudgets).
# The final attempt is NOT guarded, so if it fails too, set -e aborts
# the script before the --delete-node step.
if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
echo "Standard drain failed; retrying with --force"
if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
echo "Force drain failed; retrying with --disable-eviction"
kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
fi
fi
fi
# With --delete-node, remove the Node object so a replacement host can
# join under a clean identity. Best-effort ("|| true"): a failed delete
# should still print the runbook below.
if [ "${delete_node}" = "true" ]; then
echo "Deleting node object ${node}"
kubectl delete node "${node}" || true
fi
# Operator runbook. Unquoted heredoc on purpose: the node name and
# artifact paths must expand.
cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}
Next steps:
1) Reimage/reprovision replacement host.
2) Rejoin k3s and wait for node Ready.
3) Reapply labels: ${artifacts_dir}/restore-labels.sh
4) Reapply taints: ${artifacts_dir}/restore-taints.sh
5) Validate pods and uncordon replacement when ready.
NEXT