164 lines
4.2 KiB
Bash
164 lines
4.2 KiB
Bash
|
|
#!/usr/bin/env bash
#
# node_recover.sh - prepare a Kubernetes node for replacement.
#
# Saves the node object, its labels and taints, and the list of pods running
# on it; then (unless --skip-drain) cordons and drains the node and
# (with --delete-node) removes the Node object.
#
# Usage: scripts/node_recover.sh <node-name> [options]
# Requires: kubectl, jq
#
# Strict mode: stop on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
|
||
|
|
|
||
|
|
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it if needed)
# Returns:   the exit status of cat; callers choose the script's exit code
#######################################
usage() {
  # Quoted delimiter: the help text is literal, nothing in it is expanded.
  cat <<'USAGE'
Usage: scripts/node_recover.sh <node-name> [options]

Options:
  --yes              Skip confirmation prompt
  --skip-drain       Do not cordon/drain; only capture recovery artifacts
  --delete-node      Delete Node object after drain (for hard-dead node replacement)
  --out-dir <dir>    Recovery artifact directory (default: ./artifacts/node-recovery)
  -h, --help         Show this help
USAGE
}
|
||
|
|
|
||
|
|
# Preflight: kubectl and jq are both invoked later in this script, so fail
# fast, before anything is touched, when either one is missing from PATH.
for required_tool in kubectl jq; do
  if ! command -v "${required_tool}" >/dev/null 2>&1; then
    echo "${required_tool} is required" >&2
    exit 1
  fi
done

# A node name is mandatory: with no arguments at all, show the help on stderr
# (this is an error path) and fail.
if [ "$#" -lt 1 ]; then
  usage >&2
  exit 1
fi
|
||
|
|
|
||
|
|
# Option defaults; parse_args overwrites them from the command line.
node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"

#######################################
# Parse the command line into the option globals.
# Globals:   node, assume_yes, skip_drain, delete_node, out_dir (written)
# Arguments: the script's arguments ("$@")
# Outputs:   error message plus usage on stderr for bad input;
#            usage on stdout for -h/--help
# Returns:   0 when all arguments were consumed; exits 0 after --help and
#            exits 1 on an unknown option, a second positional argument, or
#            --out-dir without a value
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --yes)
        assume_yes="true"
        shift
        ;;
      --skip-drain)
        skip_drain="true"
        shift
        ;;
      --delete-node)
        delete_node="true"
        shift
        ;;
      --out-dir)
        # Without this guard a trailing "--out-dir" dies under `set -u` with
        # an "unbound variable" message, and an empty value would make the
        # artifact path start at "/".
        if [ "$#" -lt 2 ] || [ -z "$2" ]; then
          echo "Option --out-dir requires a directory argument" >&2
          usage >&2
          exit 1
        fi
        out_dir="$2"
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      -*)
        echo "Unknown option: $1" >&2
        usage >&2
        exit 1
        ;;
      *)
        # The first bare word is the node name; any further one is an error.
        if [ -z "${node}" ]; then
          node="$1"
        else
          echo "Unexpected argument: $1" >&2
          usage >&2
          exit 1
        fi
        shift
        ;;
    esac
  done
}

parse_args "$@"
|
||
|
|
|
||
|
|
# The node name is the one mandatory argument.
if [ -z "${node}" ]; then
  echo "Node name is required" >&2
  usage >&2
  exit 1
fi

# The node must be readable through kubectl before anything is captured.
# kubectl's own error text is kept and printed after the summary line, so a
# lookup that failed for a reason other than a missing node is not hidden
# behind "not found".
if ! lookup_err="$(kubectl get node "${node}" 2>&1 >/dev/null)"; then
  echo "Node ${node} not found in cluster API" >&2
  if [ -n "${lookup_err}" ]; then
    printf '%s\n' "${lookup_err}" >&2
  fi
  exit 1
fi
|
||
|
|
|
||
|
|
# Interactive safety check, skipped with --yes: the operator has to retype
# the node name exactly before anything is changed.
if [ "${assume_yes}" != "true" ]; then
  echo "About to prepare recovery workflow for node: ${node}"
  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
  # `read` returns non-zero at end of input. `|| true` stops `set -e` from
  # ending the script without a message; an empty or partial answer is then
  # judged by the comparison below.
  confirm=""
  read -r -p "Type the node name to continue: " confirm || true
  if [ "${confirm}" != "${node}" ]; then
    echo "Confirmation did not match node name; aborting." >&2
    exit 1
  fi
fi
|
||
|
|
|
||
|
|
# Each run writes into its own directory: <out-dir>/<node>-<YYYYmmdd-HHMMSS>.
timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"

echo "Saving node and workload artifacts to ${artifacts_dir}"
# node.json is the full Node object, node.txt the table view with labels, and
# pods-on-node.txt lists pods from all namespaces whose spec.nodeName is this
# node.
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
|
||
|
|
|
||
|
|
#######################################
# Print the shared preamble of a generated restore script.
# Arguments: $1 - file name of the script being generated (shown in its
#                 usage comment)
# Outputs:   shebang, usage comment, strict mode and the target_node
#            assignment on stdout
#######################################
restore_script_preamble() {
  # The single-quoted lines are deliberately left unexpanded: they are code
  # for the generated script, which takes the replacement node as its own $1.
  # A literal "<replacement-node>" on the kubectl lines would be parsed by
  # the shell as redirections, so the node is passed in as an argument.
  printf '%s\n' \
    '#!/usr/bin/env bash' \
    "# Usage: $1 <replacement-node>" \
    'set -euo pipefail' \
    'target_node="${1:?usage: $0 <replacement-node>}"'
}

#######################################
# Print a script that re-applies the saved node's labels to another node.
# Labels whose key starts with kubernetes.io/, beta.kubernetes.io/ or
# node.kubernetes.io/ are left out; a node without labels yields no commands.
# Arguments: $1 - path to the saved Node JSON
# Outputs:   the script text on stdout
# Returns:   non-zero if jq fails
#######################################
render_restore_labels() {
  restore_script_preamble "restore-labels.sh"
  # @sh single-quotes key=value so the generated line is safe to execute.
  jq -r '
    (.metadata.labels // {})
    | to_entries[]
    | select(
        (.key | startswith("kubernetes.io/") | not)
        and (.key | startswith("beta.kubernetes.io/") | not)
        and (.key | startswith("node.kubernetes.io/") | not)
      )
    | "kubectl label node \"${target_node}\" "
      + ((.key + "=" + .value) | @sh)
      + " --overwrite"
  ' "$1"
}

#######################################
# Print a script that re-applies the saved node's taints to another node.
# Taints whose key starts with node.kubernetes.io/ are left out, matching the
# prefix the label filter treats as not worth restoring.
# Arguments: $1 - path to the saved Node JSON
# Outputs:   the script text on stdout
# Returns:   non-zero if jq fails
#######################################
render_restore_taints() {
  restore_script_preamble "restore-taints.sh"
  # A taint without a value is written as key:effect, otherwise
  # key=value:effect.
  jq -r '
    (.spec.taints // [])[]
    | select(.key | startswith("node.kubernetes.io/") | not)
    | "kubectl taint node \"${target_node}\" "
      + ((.key + (if .value then "=" + .value else "" end) + ":" + .effect) | @sh)
      + " --overwrite"
  ' "$1"
}

render_restore_labels "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
render_restore_taints "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"

chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
|
||
|
|
|
||
|
|
# Cordon and drain unless --skip-drain was given.
if [ "${skip_drain}" != "true" ]; then
  echo "Cordoning ${node}"
  # Best effort: a failed cordon is reported but does not stop the drain.
  if ! kubectl cordon "${node}"; then
    echo "Warning: cordon of ${node} failed; continuing with drain" >&2
  fi

  # The flags shared by every drain attempt live in one array; each retry
  # only appends the extra flag(s) it escalates with.
  drain_cmd=(
    kubectl drain "${node}"
    --ignore-daemonsets
    --delete-emptydir-data
    --grace-period=30
    --timeout=20m
  )

  echo "Draining ${node}"
  if ! "${drain_cmd[@]}"; then
    echo "Standard drain failed; retrying with --force"
    if ! "${drain_cmd[@]}" --force; then
      echo "Force drain failed; retrying with --disable-eviction"
      # Last attempt: if this fails too, `set -e` ends the script here.
      "${drain_cmd[@]}" --force --disable-eviction
    fi
  fi
fi
|
||
|
|
|
||
|
|
# Remove the Node object only when --delete-node was given.
if [ "${delete_node}" = "true" ]; then
  echo "Deleting node object ${node}"
  # Still best effort (the script carries on), but a failure is now reported
  # instead of being silently discarded.
  if ! kubectl delete node "${node}"; then
    echo "Warning: failed to delete node object ${node}" >&2
  fi
fi
|
||
|
|
|
||
|
|
#######################################
# Print the closing summary and the manual follow-up steps.
# Globals:   node, artifacts_dir (read)
# Arguments: none
# Outputs:   the summary on stdout
#######################################
print_next_steps() {
  # Unquoted delimiter on purpose: ${node} and ${artifacts_dir} are expanded.
  cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}

Next steps:
1) Reimage/reprovision replacement host.
2) Rejoin k3s and wait for node Ready.
3) Reapply labels: ${artifacts_dir}/restore-labels.sh
4) Reapply taints: ${artifacts_dir}/restore-taints.sh
5) Validate pods and uncordon replacement when ready.
NEXT
}

print_next_steps
|