# titan-iac/scripts/k3s_version_update.fish

# Pick the correct K3s asset for a remote host (arm64 vs x86_64)
function __k3s_asset_for_host
    set -l host $argv[1]
    set -l arch (ssh atlas@titan-db "ssh atlas@$host 'uname -m'" 2>/dev/null)
    switch $arch
        case aarch64 arm64
            echo k3s-arm64
        case x86_64 amd64
            echo k3s
        case '*'
            # Unknown or unreachable: default to arm64 in this environment
            echo k3s-arm64
    end
end
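# For example, after sourcing this file (the architectures shown are
# assumptions about this cluster, not guaranteed by the script):
#   __k3s_asset_for_host titan-0b    # -> k3s-arm64 (assuming an aarch64 node)
#   __k3s_asset_for_host some-x86    # -> k3s       (hypothetical x86_64 node)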
# Safer control-plane upgrade via jump host using a binary swap (recommended)
# usage: upgrade_server_via_jump <host> <version>
function upgrade_server_via_jump
    if test (count $argv) -lt 2
        echo "usage: upgrade_server_via_jump <host> <version>"; return 1
    end
    set -l host $argv[1]
    set -l ver $argv[2]
    set -l jump titan-db
    set -l asset (__k3s_asset_for_host $host)
    # If the node is already at the target version, skip it
    set -l curr (kubectl get node $host -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null)
    if test "$curr" = "$ver"
        echo "=== [$host] already at $ver; skipping"
        return 0
    end
echo "=== [$host] preflight: check datastore-endpoint is present and DB TCP reachable"
# 1) datastore-endpoint existence in config, env file, or unit
set -l dsn_lines (ssh atlas@$jump "ssh atlas@$host 'sudo sh -lc \" \
(test -f /etc/rancher/k3s/config.yaml && grep -E ^datastore-endpoint: /etc/rancher/k3s/config.yaml || true); \
(test -f /etc/systemd/system/k3s.service.env && grep -E ^K3S_DATASTORE_ENDPOINT= /etc/systemd/system/k3s.service.env || true); \
(test -f /etc/systemd/system/k3s.service && grep -F -- \"--datastore-endpoint=\" /etc/systemd/system/k3s.service || true) \
\"'")
if test -z "$dsn_lines"
echo "ERROR: $host has no datastore-endpoint configured (config/env/unit). Aborting."; return 2
end
if string match -q '*datastore-endpoint: ""*' -- $dsn_lines
echo "ERROR: $host datastore-endpoint is empty in config.yaml. Aborting."; return 2
end
if string match -q '*K3S_DATASTORE_ENDPOINT=""*' -- $dsn_lines
echo "ERROR: $host K3S_DATASTORE_ENDPOINT is empty in k3s.service.env. Aborting."; return 2
end
    # 2) DB TCP reachability from the target; "skip" only when nc is not
    #    installed, and a failed probe reports "fail" so the check aborts
    set -l dbcheck (ssh atlas@$jump "ssh atlas@$host 'if command -v nc >/dev/null; then nc -vz -w2 192.168.22.10 5432 >/dev/null 2>&1 && echo ok || echo fail; else echo skip; fi'" 2>/dev/null)
    if test "$dbcheck" != "ok" -a "$dbcheck" != "skip"
        echo "ERROR: $host cannot reach 192.168.22.10:5432. Aborting."; return 3
    end
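    # Manual spot-check of the same probe from a workstation (hypothetical
    # target host titan-0b; IP and port are the ones this script already uses):
    #   ssh atlas@titan-db "ssh atlas@titan-0b 'nc -vz -w2 192.168.22.10 5432'"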
echo "=== [$host] cordon + drain"
kubectl cordon $host
set -l drained 0
# Store flags as a list (not a single quoted string)
set -l drain_common --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m
# 1) Try a normal drain (respects PDBs)
if kubectl drain $host $drain_common
set drained 1
else
echo "WARN: standard drain on $host failed (likely a PDB). Retrying with --force."
# 2) Retry with --force (unmanaged pods etc.)
if kubectl drain $host $drain_common --force
set drained 1
else
echo "WARN: drain still blocked on $host. Falling back to --disable-eviction (bypass PDBs)."
# 3) Last resort: bypass PDBs entirely (deletes pods instead of Evictions; PDBs don't apply)
if kubectl drain $host $drain_common --disable-eviction --force
set drained 1
else
echo "ERROR: drain failed on $host even with --disable-eviction."
kubectl get pods -A -o wide --field-selector spec.nodeName=$host | head -n 50
return 4
end
end
end
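    # If all three drain attempts fail, the blocker is usually a
    # PodDisruptionBudget; these stock kubectl queries identify it:
    #   kubectl get pdb -A
    #   kubectl describe pdb <name> -n <namespace>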
echo "=== [$host] binary swap to $ver ($asset)"
set -l rc 0
ssh atlas@$jump "ssh atlas@$host 'set -euo pipefail
sudo systemctl stop k3s
if test -x /usr/local/bin/k3s; then
sudo cp /usr/local/bin/k3s /usr/local/bin/k3s.bak.\$(date -Iseconds)
fi
url=\"https://github.com/k3s-io/k3s/releases/download/$ver/$asset\"
sudo curl -fL -o /usr/local/bin/k3s \"\$url\"
sudo chmod +x /usr/local/bin/k3s
sudo systemctl start k3s
sleep 4
sudo k3s --version
'" ; set rc $status
if test $rc -ne 0
echo "ERROR: remote swap/start failed on $host (rc=$rc)."
if test $drained -eq 1
kubectl uncordon $host
end
return $rc
end
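    # The swap above leaves a timestamped backup, so a manual rollback on the
    # node is just restoring it:
    #   sudo systemctl stop k3s
    #   sudo cp /usr/local/bin/k3s.bak.<timestamp> /usr/local/bin/k3s
    #   sudo systemctl start k3s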
echo "=== [$host] wait for Ready and target version: $ver"
set -l tries 0
while true
set -l v (kubectl get node $host -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null)
set -l r (kubectl get node $host -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
echo "$host -> $v Ready=$r"
if test "$v" = "$ver" -a "$r" = "True"
break
end
if test $tries -eq 0
# one-time nudge if the service came up slow
ssh atlas@$jump "ssh atlas@$host 'sudo systemctl daemon-reload; sudo systemctl restart k3s'"
end
set tries (math $tries + 1)
if test $tries -gt 100
echo "ERROR: $host did not reach Ready/$ver; showing last logs:"
ssh atlas@$jump "ssh atlas@$host 'sudo journalctl -u k3s -n 200 --no-pager | tail -n +1'"
if test $drained -eq 1
kubectl uncordon $host
end
return 5
end
sleep 3
end
echo "=== [$host] uncordon"
kubectl uncordon $host
end
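# Example, after sourcing this file (version tag is hypothetical):
#   upgrade_server_via_jump titan-0b v1.30.4+k3s1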
# Rolling control-plane upgrade to a target version (do NOT run in parallel)
# usage: upgrade_control_plane_to <version> [hosts...]
# If hosts omitted, defaults to: titan-0b titan-0c titan-0a
function upgrade_control_plane_to
    set -l ver $argv[1]
    if test -z "$ver"
        echo "usage: upgrade_control_plane_to <version> [titan-0b titan-0c titan-0a]"; return 1
    end
    set -l hosts $argv[2..-1]
    if test (count $hosts) -eq 0
        set hosts titan-0b titan-0c titan-0a
    end
    for n in $hosts
        # Count Ready control-plane nodes other than $n, taking the union of the
        # modern control-plane label and the legacy master label
        set -l ready_cp (begin
                kubectl get nodes -l 'node-role.kubernetes.io/control-plane' \
                    -o jsonpath='{range .items[*]}{.metadata.name}{"|"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
                kubectl get nodes -l 'node-role.kubernetes.io/master' \
                    -o jsonpath='{range .items[*]}{.metadata.name}{"|"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
            end | sort -u | grep -E '\|True$' | grep -v "^$n|" | wc -l)
        if test $ready_cp -lt 1
            echo "ERROR: upgrading $n would drop remaining Ready control-plane count below 1. Aborting."
            return 9
        end
        upgrade_server_via_jump $n $ver; or return $status
    end
    kubectl get nodes -o wide
end
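# Example, after sourcing this file (version tag is hypothetical):
#   upgrade_control_plane_to v1.30.4+k3s1
#   upgrade_control_plane_to v1.30.4+k3s1 titan-0c titan-0b titan-0a   # explicit order
# Afterwards, confirm every node reports the target kubelet version:
#   kubectl get nodes -o custom-columns=NAME:.metadata.name,VERSION:.status.nodeInfo.kubeletVersion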