# titan-iac/scripts/k3s_version_update.fish

# Pick the correct K3s asset for a remote host (arm64 vs x86_64)
function __k3s_asset_for_host
    set -l host $argv[1]
    set -l arch (ssh atlas@titan-db "ssh atlas@$host 'uname -m'" 2>/dev/null)
    switch $arch
        case aarch64 arm64
            echo k3s-arm64
        case x86_64 amd64
            echo k3s
        case '*'
            # Unknown or unreachable: default to arm64 in this environment
            echo k3s-arm64
    end
end
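# For example, after sourcing this file (the architectures shown are
# assumptions about this cluster, not guaranteed by the script):
#   __k3s_asset_for_host titan-0b    # -> k3s-arm64 (assuming an aarch64 node)
#   __k3s_asset_for_host some-x86    # -> k3s       (hypothetical x86_64 node)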
# Safer control-plane upgrade via jump host using a binary swap (recommended)
# usage: upgrade_server_via_jump <host> <version>
function upgrade_server_via_jump
    if test (count $argv) -lt 2
        echo "usage: upgrade_server_via_jump <host> <version>"; return 1
    end
    set -l host $argv[1]
    set -l ver $argv[2]
    set -l jump titan-db
    set -l asset (__k3s_asset_for_host $host)
    # If the node is already at the target version, skip it
    set -l curr (kubectl get node $host -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null)
    if test "$curr" = "$ver"
        echo "=== [$host] already at $ver; skipping"
        return 0
    end
echo "=== [$host] preflight: check datastore-endpoint is present and DB TCP reachable"
# 1) datastore-endpoint existence in config, env file, or unit
set -l dsn_lines (ssh atlas@$jump "ssh atlas@$host 'sudo sh -lc \" \
(test -f /etc/rancher/k3s/config.yaml && grep -E ^datastore-endpoint: /etc/rancher/k3s/config.yaml || true); \
(test -f /etc/systemd/system/k3s.service.env && grep -E ^K3S_DATASTORE_ENDPOINT= /etc/systemd/system/k3s.service.env || true); \
(test -f /etc/systemd/system/k3s.service && grep -F -- \"--datastore-endpoint=\" /etc/systemd/system/k3s.service || true) \
\"'")
if test -z "$dsn_lines"
echo "ERROR: $host has no datastore-endpoint configured (config/env/unit). Aborting."; return 2
end
if string match -q '*datastore-endpoint: ""*' -- $dsn_lines
echo "ERROR: $host datastore-endpoint is empty in config.yaml. Aborting."; return 2
end
if string match -q '*K3S_DATASTORE_ENDPOINT=""*' -- $dsn_lines
echo "ERROR: $host K3S_DATASTORE_ENDPOINT is empty in k3s.service.env. Aborting."; return 2
end
    # 2) DB TCP reachability from the target; "skip" only when nc is not
    #    installed, and a failed probe reports "fail" so the check aborts
    set -l dbcheck (ssh atlas@$jump "ssh atlas@$host 'if command -v nc >/dev/null; then nc -vz -w2 192.168.22.10 5432 >/dev/null 2>&1 && echo ok || echo fail; else echo skip; fi'" 2>/dev/null)
    if test "$dbcheck" != "ok" -a "$dbcheck" != "skip"
        echo "ERROR: $host cannot reach 192.168.22.10:5432. Aborting."; return 3
    end
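    # Manual spot-check of the same probe from a workstation (hypothetical
    # target host titan-0b; IP and port are the ones this script already uses):
    #   ssh atlas@titan-db "ssh atlas@titan-0b 'nc -vz -w2 192.168.22.10 5432'"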
echo "=== [$host] cordon + drain"
kubectl cordon $host
set -l drained 0
# Store flags as a list (not a single quoted string)
set -l drain_common --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m
# 1) Try a normal drain (respects PDBs)
if kubectl drain $host $drain_common
set drained 1
else
echo "WARN: standard drain on $host failed (likely a PDB). Retrying with --force."
# 2) Retry with --force (unmanaged pods etc.)
if kubectl drain $host $drain_common --force
set drained 1
else
echo "WARN: drain still blocked on $host. Falling back to --disable-eviction (bypass PDBs)."
# 3) Last resort: bypass PDBs entirely (deletes pods instead of Evictions; PDBs don't apply)
if kubectl drain $host $drain_common --disable-eviction --force
set drained 1
else
echo "ERROR: drain failed on $host even with --disable-eviction."
kubectl get pods -A -o wide --field-selector spec.nodeName=$host | head -n 50
return 4
end
end
end
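    # If all three drain attempts fail, the blocker is usually a
    # PodDisruptionBudget; these stock kubectl queries identify it:
    #   kubectl get pdb -A
    #   kubectl describe pdb <name> -n <namespace>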
echo "=== [$host] binary swap to $ver ($asset)"
set -l rc 0
ssh atlas@$jump "ssh atlas@$host 'set -euo pipefail
sudo systemctl stop k3s
if test -x /usr/local/bin/k3s; then
sudo cp /usr/local/bin/k3s /usr/local/bin/k3s.bak.\$(date -Iseconds)
fi
url=\"https://github.com/k3s-io/k3s/releases/download/$ver/$asset\"
sudo curl -fL -o /usr/local/bin/k3s \"\$url\"
sudo chmod +x /usr/local/bin/k3s
sudo systemctl start k3s
sleep 4
sudo k3s --version
'" ; set rc $status
if test $rc -ne 0
echo "ERROR: remote swap/start failed on $host (rc=$rc)."
if test $drained -eq 1
kubectl uncordon $host
end
return $rc
end
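    # The swap above leaves a timestamped backup, so a manual rollback on the
    # node is just restoring it:
    #   sudo systemctl stop k3s
    #   sudo cp /usr/local/bin/k3s.bak.<timestamp> /usr/local/bin/k3s
    #   sudo systemctl start k3s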
echo "=== [$host] wait for Ready and target version: $ver"
set -l tries 0
while true
set -l v (kubectl get node $host -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null)
set -l r (kubectl get node $host -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
echo "$host -> $v Ready=$r"
if test "$v" = "$ver" -a "$r" = "True"
break
end
if test $tries -eq 0
# one-time nudge if the service came up slow
ssh atlas@$jump "ssh atlas@$host 'sudo systemctl daemon-reload; sudo systemctl restart k3s'"
end
set tries (math $tries + 1)
if test $tries -gt 100
echo "ERROR: $host did not reach Ready/$ver; showing last logs:"
ssh atlas@$jump "ssh atlas@$host 'sudo journalctl -u k3s -n 200 --no-pager | tail -n +1'"
if test $drained -eq 1
kubectl uncordon $host
end
return 5
end
sleep 3
end
echo "=== [$host] uncordon"
kubectl uncordon $host
end
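# Example, after sourcing this file (version tag is hypothetical):
#   upgrade_server_via_jump titan-0b v1.30.4+k3s1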
# Rolling control-plane upgrade to a target version (do NOT run in parallel)
# usage: upgrade_control_plane_to <version> [hosts...]
# If hosts omitted, defaults to: titan-0b titan-0c titan-0a
function upgrade_control_plane_to
    set -l ver $argv[1]
    if test -z "$ver"
        echo "usage: upgrade_control_plane_to <version> [titan-0b titan-0c titan-0a]"; return 1
    end
    set -l hosts $argv[2..-1]
    if test (count $hosts) -eq 0
        set hosts titan-0b titan-0c titan-0a
    end
    for n in $hosts
        # Count Ready control-plane nodes other than $n, taking the union of the
        # modern control-plane label and the legacy master label
        set -l ready_cp (begin
                kubectl get nodes -l 'node-role.kubernetes.io/control-plane' \
                    -o jsonpath='{range .items[*]}{.metadata.name}{"|"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
                kubectl get nodes -l 'node-role.kubernetes.io/master' \
                    -o jsonpath='{range .items[*]}{.metadata.name}{"|"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
            end | sort -u | grep -E '\|True$' | grep -v "^$n|" | wc -l)
        if test $ready_cp -lt 1
            echo "ERROR: upgrading $n would drop remaining Ready control-plane count below 1. Aborting."
            return 9
        end
        upgrade_server_via_jump $n $ver; or return $status
    end
    kubectl get nodes -o wide
end
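# Example, after sourcing this file (version tag is hypothetical):
#   upgrade_control_plane_to v1.30.4+k3s1
#   upgrade_control_plane_to v1.30.4+k3s1 titan-0c titan-0b titan-0a   # explicit order
# Afterwards, confirm every node reports the target kubelet version:
#   kubectl get nodes -o custom-columns=NAME:.metadata.name,VERSION:.status.nodeInfo.kubeletVersion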