titan-iac/scripts/k3s_version_update.fish

# Pick the correct K3s asset for a remote host (arm64 vs x86_64)
function __k3s_asset_for_host
    set -l host $argv[1]
    set -l arch (ssh atlas@titan-db "ssh atlas@$host 'uname -m'" 2>/dev/null)
    switch $arch
        case aarch64 arm64
            echo k3s-arm64
        case x86_64 amd64
            echo k3s
        case '*'
            # Default to arm64 in your environment
            echo k3s-arm64
    end
end
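
# Usage sketch (host name illustrative, not from this repo):
#   __k3s_asset_for_host titan-1a   # prints k3s-arm64 for an aarch64 node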
# Safer control-plane upgrade via jump host using a binary swap (recommended)
# usage: upgrade_server_via_jump <host> <version>
function upgrade_server_via_jump
    # Validate arguments before doing any work
    if test (count $argv) -lt 2
        echo "usage: upgrade_server_via_jump <host> <version>"
        return 1
    end
    set -l host $argv[1]
    set -l ver $argv[2]
    set -l jump titan-db
    set -l asset (__k3s_asset_for_host $host)
    # If already at target, skip
    set -l curr (kubectl get node $host -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null)
    if test "$curr" = "$ver"
        echo "=== [$host] already at $ver; skipping"
        return 0
    end
echo "=== [$host] preflight: check datastore-endpoint is present and DB TCP reachable"
# 1) datastore-endpoint existence in config, env file, or unit
set -l dsn_lines (ssh atlas@$jump "ssh atlas@$host 'sudo sh -lc \" \
(test -f /etc/rancher/k3s/config.yaml && grep -E ^datastore-endpoint: /etc/rancher/k3s/config.yaml || true); \
(test -f /etc/systemd/system/k3s.service.env && grep -E ^K3S_DATASTORE_ENDPOINT= /etc/systemd/system/k3s.service.env || true); \
(test -f /etc/systemd/system/k3s.service && grep -F -- \"--datastore-endpoint=\" /etc/systemd/system/k3s.service || true) \
\"'")
if test -z "$dsn_lines"
echo "ERROR: $host has no datastore-endpoint configured (config/env/unit). Aborting."; return 2
end
if string match -q '*datastore-endpoint: ""*' -- $dsn_lines
echo "ERROR: $host datastore-endpoint is empty in config.yaml. Aborting."; return 2
end
if string match -q '*K3S_DATASTORE_ENDPOINT=""*' -- $dsn_lines
echo "ERROR: $host K3S_DATASTORE_ENDPOINT is empty in k3s.service.env. Aborting."; return 2
end
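    # For reference, a healthy hit looks like one of these (values illustrative):
    #   datastore-endpoint: postgres://k3s:PASS@192.168.22.10:5432/k3s
    #   K3S_DATASTORE_ENDPOINT=postgres://k3s:PASS@192.168.22.10:5432/k3s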
    # 2) DB TCP reachability from the target ("skip" only when nc is not installed;
    #    the original A && B && echo ok || echo skip also printed "skip" on an
    #    unreachable DB, so the abort below could never fire)
    set -l dbcheck (ssh atlas@$jump "ssh atlas@$host 'if command -v nc >/dev/null; then nc -vz -w2 192.168.22.10 5432 >/dev/null 2>&1 && echo ok || echo fail; else echo skip; fi'" 2>/dev/null)
    if test "$dbcheck" != "ok" -a "$dbcheck" != "skip"
        echo "ERROR: $host cannot reach 192.168.22.10:5432. Aborting."; return 3
    end
echo "=== [$host] cordon + drain"
kubectl cordon $host
set -l drained 0
# Store flags as a list (not a single quoted string)
set -l drain_common --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m
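    # (Quoting it as "$drain_common" would join all flags into one word, which
    # kubectl would reject; an unquoted fish list expands to one argument per flag.)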
    # 1) Try a normal drain (respects PDBs)
    if kubectl drain $host $drain_common
        set drained 1
    else
        echo "WARN: standard drain on $host failed (likely a PDB). Retrying with --force."
        # 2) Retry with --force (unmanaged pods etc.)
        if kubectl drain $host $drain_common --force
            set drained 1
        else
            echo "WARN: drain still blocked on $host. Falling back to --disable-eviction (bypass PDBs)."
            # 3) Last resort: bypass PDBs entirely (deletes pods instead of using
            #    the Eviction API, so PDBs don't apply)
            if kubectl drain $host $drain_common --disable-eviction --force
                set drained 1
            else
                echo "ERROR: drain failed on $host even with --disable-eviction."
                kubectl get pods -A -o wide --field-selector spec.nodeName=$host | head -n 50
                return 4
            end
        end
    end
echo "=== [$host] binary swap to $ver ($asset)"
set -l rc 0
ssh atlas@$jump "ssh atlas@$host 'set -euo pipefail
sudo systemctl stop k3s
if test -x /usr/local/bin/k3s; then
sudo cp /usr/local/bin/k3s /usr/local/bin/k3s.bak.\$(date -Iseconds)
fi
url=\"https://github.com/k3s-io/k3s/releases/download/$ver/$asset\"
sudo curl -fL -o /usr/local/bin/k3s \"\$url\"
sudo chmod +x /usr/local/bin/k3s
sudo systemctl start k3s
sleep 4
sudo k3s --version
'" ; set rc $status
    if test $rc -ne 0
        echo "ERROR: remote swap/start failed on $host (rc=$rc)."
        if test $drained -eq 1
            kubectl uncordon $host
        end
        return $rc
    end
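    # Manual rollback sketch if the new binary misbehaves (commands assumed to be
    # run on the affected host; restores the newest backup taken above):
    #   sudo cp "$(ls -t /usr/local/bin/k3s.bak.* | head -n1)" /usr/local/bin/k3s
    #   sudo systemctl restart k3s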
echo "=== [$host] wait for Ready and target version: $ver"
set -l tries 0
while true
set -l v (kubectl get node $host -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null)
set -l r (kubectl get node $host -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
echo "$host -> $v Ready=$r"
if test "$v" = "$ver" -a "$r" = "True"
break
end
if test $tries -eq 0
# one-time nudge if the service came up slow
ssh atlas@$jump "ssh atlas@$host 'sudo systemctl daemon-reload; sudo systemctl restart k3s'"
end
set tries (math $tries + 1)
if test $tries -gt 100
echo "ERROR: $host did not reach Ready/$ver; showing last logs:"
ssh atlas@$jump "ssh atlas@$host 'sudo journalctl -u k3s -n 200 --no-pager | tail -n +1'"
if test $drained -eq 1
kubectl uncordon $host
end
return 5
end
sleep 3
end
echo "=== [$host] uncordon"
kubectl uncordon $host
end
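
# Usage sketch (version string illustrative; upgrade one server at a time):
#   upgrade_server_via_jump titan-0b v1.30.4+k3s1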
# Rolling control-plane upgrade to a target version (do NOT run in parallel)
# usage: upgrade_control_plane_to <version> [hosts...]
# If hosts omitted, defaults to: titan-0b titan-0c titan-0a
function upgrade_control_plane_to
    set -l ver $argv[1]
    if test -z "$ver"
        echo "usage: upgrade_control_plane_to <version> [titan-0b titan-0c titan-0a]"; return 1
    end
    set -l hosts $argv[2..-1]
    if test (count $hosts) -eq 0
        set hosts titan-0b titan-0c titan-0a
    end
    for n in $hosts
        # Count Ready control-plane nodes other than $n (union of the
        # control-plane and legacy master role labels)
        set -l ready_cp (begin
            kubectl get nodes -l 'node-role.kubernetes.io/control-plane' \
                -o jsonpath='{range .items[*]}{.metadata.name}{"|"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
            kubectl get nodes -l 'node-role.kubernetes.io/master' \
                -o jsonpath='{range .items[*]}{.metadata.name}{"|"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
        end | sort -u | grep -E '\|True$' | grep -v "^$n|" | wc -l)
        if test (math $ready_cp) -lt 1
            echo "ERROR: upgrading $n would drop the remaining Ready control-plane count below 1. Aborting."
            return 9
        end
        upgrade_server_via_jump $n $ver; or return $status
    end
    kubectl get nodes -o wide
end
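
# Usage sketch (version illustrative; hosts default to titan-0b titan-0c titan-0a):
#   upgrade_control_plane_to v1.30.4+k3s1
#   upgrade_control_plane_to v1.30.4+k3s1 titan-0c titan-0a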