From 3b5cacdc34dd88c4d5209814384705ffe13dd1fb Mon Sep 17 00:00:00 2001 From: codex Date: Sun, 17 May 2026 04:49:42 -0300 Subject: [PATCH] test: split kubelet proxy autoheal coverage --- .../orchestrator_autorepair_proxy_test.go | 340 ++++++++++++++++++ .../cluster/orchestrator_autorepair_test.go | 329 ----------------- 2 files changed, 340 insertions(+), 329 deletions(-) create mode 100644 internal/cluster/orchestrator_autorepair_proxy_test.go diff --git a/internal/cluster/orchestrator_autorepair_proxy_test.go b/internal/cluster/orchestrator_autorepair_proxy_test.go new file mode 100644 index 0000000..3c985b1 --- /dev/null +++ b/internal/cluster/orchestrator_autorepair_proxy_test.go @@ -0,0 +1,340 @@ +package cluster + +import ( + "context" + "errors" + "fmt" + "strings" + "testing" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" +) + +// TestRepairBrokenKubeletProxiesRestartsSchedulableNode verifies that a Ready, schedulable node whose kubelet proxy health check fails is cordoned, has k3s-agent restarted, is waited on, re-checked, and then uncordoned. +// Signature: TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T). +// Why: Jenkins agents depend on apiserver-to-kubelet exec; a Ready node with a +// broken proxy needs a narrow k3s-agent restart, not a broad cluster restart. 
+func TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T) { + cfg := config.Config{SSHManagedNodes: []string{"titan-07"}} + orch := buildOrchestratorWithStubs(t, cfg, nil) + + healthChecks := 0 + cordoned := false + restarted := false + waited := false + uncordoned := false + dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { + joined := strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(joined, "get nodes -o json"): + return `{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":false},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`, nil + case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"): + healthChecks++ + if healthChecks == 1 { + return "proxy error from 127.0.0.1:6443 while dialing 192.168.22.33:10250, code 502: 502 Bad Gateway", errors.New("exit status 1") + } + return "ok", nil + case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"): + uncordoned = true + return "", nil + case name == "kubectl" && strings.Contains(joined, "cordon titan-07"): + cordoned = true + return "", nil + case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): + restarted = true + return "", nil + case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"): + waited = true + return "", nil + default: + return "", nil + } + } + orch.SetCommandOverrides(dispatch, dispatch) + + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if err != nil { + t.Fatalf("repairBrokenKubeletProxies failed: %v", err) + } + if repaired != 1 { + t.Fatalf("expected one repaired node, got %d", repaired) + } + for name, ok := range map[string]bool{ + "cordoned": cordoned, + "restarted": restarted, + "waited": waited, + 
"uncordoned": uncordoned, + } { + if !ok { + t.Fatalf("expected %s action", name) + } + } + if healthChecks != 2 { + t.Fatalf("expected health check before and after repair, got %d", healthChecks) + } +} + +// TestRepairBrokenKubeletProxiesPreservesExistingCordon runs one orchestration or CLI step. +// Signature: TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T). +// Why: nodes intentionally kept out of service must not be accidentally +// uncordoned just because Ananke repaired their kubelet proxy. +func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) { + cfg := config.Config{SSHManagedNodes: []string{"titan-18"}} + orch := buildOrchestratorWithStubs(t, cfg, nil) + + healthChecks := 0 + cordonTouched := false + restarted := false + dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { + joined := strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(joined, "get nodes -o json"): + return `{"items":[{"metadata":{"name":"titan-18"},"spec":{"unschedulable":true},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil + case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-18/proxy/healthz"): + healthChecks++ + if healthChecks == 1 { + return "error trying to reach service: proxy error from 127.0.0.1:6443 while dialing 192.168.22.46:10250", errors.New("exit status 1") + } + return "ok", nil + case name == "kubectl" && (strings.Contains(joined, "cordon titan-18") || strings.Contains(joined, "uncordon titan-18")): + cordonTouched = true + return "", nil + case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): + restarted = true + return "", nil + case name == "kubectl" && strings.Contains(joined, "wait node/titan-18 --for=condition=Ready --timeout=120s"): + return "", nil + default: + return "", nil + } + } + orch.SetCommandOverrides(dispatch, dispatch) + + repaired, err := 
orch.repairBrokenKubeletProxies(context.Background()) + if err != nil { + t.Fatalf("repairBrokenKubeletProxies failed: %v", err) + } + if repaired != 1 { + t.Fatalf("expected one repaired node, got %d", repaired) + } + if !restarted { + t.Fatalf("expected k3s-agent restart") + } + if cordonTouched { + t.Fatalf("did not expect cordon state to change for already-unschedulable node") + } +} + +// TestRepairBrokenKubeletProxiesFailureBranches runs one orchestration or CLI step. +// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T). +// Why: auto-repair must report exact blockers without broadening into unsafe +// node restarts for unrelated kubectl or access failures. +func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) { + t.Run("dry run skips", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, nil) + orch.runner.DryRun = true + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if err != nil || repaired != 0 { + t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err) + } + }) + + t.Run("node query error surfaces", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ + {match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")}, + }) + _, err := orch.repairBrokenKubeletProxies(context.Background()) + if err == nil || !strings.Contains(err.Error(), "query nodes") { + t.Fatalf("expected node query error, got %v", err) + } + }) + + t.Run("node json decode error surfaces", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ + {match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`}, + }) + _, err := orch.readyNodeCandidates(context.Background()) + if err == nil || !strings.Contains(err.Error(), "decode nodes") { + t.Fatalf("expected decode error, got %v", err) + } + }) + + t.Run("non repairable health error is reported", func(t *testing.T) { + orch := 
kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "forbidden", err: errors.New("forbidden")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") { + t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("unmanaged broken node is reported", func(t *testing.T) { + cfg := config.Config{SSHManagedNodes: []string{"other"}} + orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") { + t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("cordon failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon", err: errors.New("cordon denied")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") { + t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("restart failure uncordons schedulable node", func(t *testing.T) { + uncordoned := false + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart", err: errors.New("sudo rejected")}, + {cmd: "uncordon", sideEffect: func() { uncordoned = true }}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || 
!strings.Contains(err.Error(), "restart k3s-agent") { + t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err) + } + if !uncordoned { + t.Fatalf("expected best-effort uncordon after restart failure") + } + }) + + t.Run("wait failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart"}, + {cmd: "wait", err: errors.New("not ready")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") { + t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("post restart health failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart"}, + {cmd: "wait"}, + {cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") { + t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("uncordon failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart"}, + {cmd: "wait"}, + {cmd: "health", out: "ok"}, + {cmd: "uncordon", err: errors.New("uncordon denied")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") { + 
t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err) + } + }) +} + +type kubeletProxyRepairAction struct { + cmd string + out string + err error + sideEffect func() +} + +// kubeletProxyRepairStub runs one orchestration or CLI step. +// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator. +// Why: keeps the kubelet proxy repair branch tests readable while preserving +// strict command order for safety-sensitive node operations. +func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator { + t.Helper() + orch := buildOrchestratorWithStubs(t, cfg, nil) + index := 0 + dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { + joined := strings.Join(args, " ") + if name == "kubectl" && strings.Contains(joined, "get nodes -o json") { + return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil + } + actual := "" + switch { + case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"): + actual = "health" + case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"): + actual = "uncordon" + case name == "kubectl" && strings.Contains(joined, "cordon titan-07"): + actual = "cordon" + case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): + actual = "restart" + case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"): + actual = "wait" + default: + return "", nil + } + if index >= len(actions) { + t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined) + } + action := actions[index] + index++ + if action.cmd != actual { + t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, 
actual, name, joined) + } + if action.sideEffect != nil { + action.sideEffect() + } + return action.out, action.err + } + orch.SetCommandOverrides(dispatch, dispatch) + return orch +} + +// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step. +// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T). +// Why: helper branches decide whether Ananke restarts a node agent, so both +// positive and negative cases need direct coverage. +func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) { + t.Run("health error without output is preserved", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ + { + match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"), + err: errors.New("network timeout"), + }, + }) + healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07") + if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") { + t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err) + } + }) + + for _, tc := range []struct { + name string + err error + want bool + }{ + {name: "nil", err: nil, want: false}, + {name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true}, + {name: "not repairable", err: errors.New("forbidden"), want: false}, + } { + t.Run(tc.name, func(t *testing.T) { + if got := isRepairableKubeletProxyErr(tc.err); got != tc.want { + t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want) + } + }) + } +} diff --git a/internal/cluster/orchestrator_autorepair_test.go b/internal/cluster/orchestrator_autorepair_test.go index 86a2655..4a80172 100644 --- a/internal/cluster/orchestrator_autorepair_test.go +++ b/internal/cluster/orchestrator_autorepair_test.go @@ -4,7 +4,6 @@ import ( "context" "encoding/base64" "errors" - "fmt" "io" "log" "path/filepath" @@ -191,334 +190,6 @@ func TestRunPostStartAutoHealDryRun(t *testing.T) { } } -// 
TestRepairBrokenKubeletProxiesRestartsSchedulableNode runs one orchestration or CLI step. -// Signature: TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T). -// Why: Jenkins agents depend on apiserver-to-kubelet exec; a Ready node with a -// broken proxy needs a narrow k3s-agent restart, not a broad cluster restart. -func TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T) { - cfg := config.Config{SSHManagedNodes: []string{"titan-07"}} - orch := buildOrchestratorWithStubs(t, cfg, nil) - - healthChecks := 0 - cordoned := false - restarted := false - waited := false - uncordoned := false - dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { - joined := strings.Join(args, " ") - switch { - case name == "kubectl" && strings.Contains(joined, "get nodes -o json"): - return `{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":false},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`, nil - case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"): - healthChecks++ - if healthChecks == 1 { - return "proxy error from 127.0.0.1:6443 while dialing 192.168.22.33:10250, code 502: 502 Bad Gateway", errors.New("exit status 1") - } - return "ok", nil - case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"): - uncordoned = true - return "", nil - case name == "kubectl" && strings.Contains(joined, "cordon titan-07"): - cordoned = true - return "", nil - case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): - restarted = true - return "", nil - case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"): - waited = true - return "", nil - default: - return "", nil - } - } - orch.SetCommandOverrides(dispatch, dispatch) - - repaired, err := 
orch.repairBrokenKubeletProxies(context.Background()) - if err != nil { - t.Fatalf("repairBrokenKubeletProxies failed: %v", err) - } - if repaired != 1 { - t.Fatalf("expected one repaired node, got %d", repaired) - } - for name, ok := range map[string]bool{ - "cordoned": cordoned, - "restarted": restarted, - "waited": waited, - "uncordoned": uncordoned, - } { - if !ok { - t.Fatalf("expected %s action", name) - } - } - if healthChecks != 2 { - t.Fatalf("expected health check before and after repair, got %d", healthChecks) - } -} - -// TestRepairBrokenKubeletProxiesPreservesExistingCordon runs one orchestration or CLI step. -// Signature: TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T). -// Why: nodes intentionally kept out of service must not be accidentally -// uncordoned just because Ananke repaired their kubelet proxy. -func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) { - cfg := config.Config{SSHManagedNodes: []string{"titan-18"}} - orch := buildOrchestratorWithStubs(t, cfg, nil) - - healthChecks := 0 - cordonTouched := false - restarted := false - dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { - joined := strings.Join(args, " ") - switch { - case name == "kubectl" && strings.Contains(joined, "get nodes -o json"): - return `{"items":[{"metadata":{"name":"titan-18"},"spec":{"unschedulable":true},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil - case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-18/proxy/healthz"): - healthChecks++ - if healthChecks == 1 { - return "error trying to reach service: proxy error from 127.0.0.1:6443 while dialing 192.168.22.46:10250", errors.New("exit status 1") - } - return "ok", nil - case name == "kubectl" && (strings.Contains(joined, "cordon titan-18") || strings.Contains(joined, "uncordon titan-18")): - cordonTouched = true - return "", nil - case name == "ssh" && 
strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): - restarted = true - return "", nil - case name == "kubectl" && strings.Contains(joined, "wait node/titan-18 --for=condition=Ready --timeout=120s"): - return "", nil - default: - return "", nil - } - } - orch.SetCommandOverrides(dispatch, dispatch) - - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if err != nil { - t.Fatalf("repairBrokenKubeletProxies failed: %v", err) - } - if repaired != 1 { - t.Fatalf("expected one repaired node, got %d", repaired) - } - if !restarted { - t.Fatalf("expected k3s-agent restart") - } - if cordonTouched { - t.Fatalf("did not expect cordon state to change for already-unschedulable node") - } -} - -// TestRepairBrokenKubeletProxiesFailureBranches runs one orchestration or CLI step. -// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T). -// Why: auto-repair must report exact blockers without broadening into unsafe -// node restarts for unrelated kubectl or access failures. 
-func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) { - t.Run("dry run skips", func(t *testing.T) { - orch := buildOrchestratorWithStubs(t, config.Config{}, nil) - orch.runner.DryRun = true - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if err != nil || repaired != 0 { - t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err) - } - }) - - t.Run("node query error surfaces", func(t *testing.T) { - orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ - {match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")}, - }) - _, err := orch.repairBrokenKubeletProxies(context.Background()) - if err == nil || !strings.Contains(err.Error(), "query nodes") { - t.Fatalf("expected node query error, got %v", err) - } - }) - - t.Run("node json decode error surfaces", func(t *testing.T) { - orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ - {match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`}, - }) - _, err := orch.readyNodeCandidates(context.Background()) - if err == nil || !strings.Contains(err.Error(), "decode nodes") { - t.Fatalf("expected decode error, got %v", err) - } - }) - - t.Run("non repairable health error is reported", func(t *testing.T) { - orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "forbidden", err: errors.New("forbidden")}, - }) - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") { - t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err) - } - }) - - t.Run("unmanaged broken node is reported", func(t *testing.T) { - cfg := config.Config{SSHManagedNodes: []string{"other"}} - orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, - }) - repaired, 
err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") { - t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err) - } - }) - - t.Run("cordon failure is reported", func(t *testing.T) { - orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, - {cmd: "cordon", err: errors.New("cordon denied")}, - }) - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") { - t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err) - } - }) - - t.Run("restart failure uncordons schedulable node", func(t *testing.T) { - uncordoned := false - orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, - {cmd: "cordon"}, - {cmd: "restart", err: errors.New("sudo rejected")}, - {cmd: "uncordon", sideEffect: func() { uncordoned = true }}, - }) - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "restart k3s-agent") { - t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err) - } - if !uncordoned { - t.Fatalf("expected best-effort uncordon after restart failure") - } - }) - - t.Run("wait failure is reported", func(t *testing.T) { - orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, - {cmd: "cordon"}, - {cmd: "restart"}, - {cmd: "wait", err: errors.New("not ready")}, - }) - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") { - 
t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err) - } - }) - - t.Run("post restart health failure is reported", func(t *testing.T) { - orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, - {cmd: "cordon"}, - {cmd: "restart"}, - {cmd: "wait"}, - {cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")}, - }) - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") { - t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err) - } - }) - - t.Run("uncordon failure is reported", func(t *testing.T) { - orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ - {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, - {cmd: "cordon"}, - {cmd: "restart"}, - {cmd: "wait"}, - {cmd: "health", out: "ok"}, - {cmd: "uncordon", err: errors.New("uncordon denied")}, - }) - repaired, err := orch.repairBrokenKubeletProxies(context.Background()) - if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") { - t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err) - } - }) -} - -type kubeletProxyRepairAction struct { - cmd string - out string - err error - sideEffect func() -} - -// kubeletProxyRepairStub runs one orchestration or CLI step. -// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator. -// Why: keeps the kubelet proxy repair branch tests readable while preserving -// strict command order for safety-sensitive node operations. 
-func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator { - t.Helper() - orch := buildOrchestratorWithStubs(t, cfg, nil) - index := 0 - dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { - joined := strings.Join(args, " ") - if name == "kubectl" && strings.Contains(joined, "get nodes -o json") { - return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil - } - actual := "" - switch { - case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"): - actual = "health" - case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"): - actual = "uncordon" - case name == "kubectl" && strings.Contains(joined, "cordon titan-07"): - actual = "cordon" - case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): - actual = "restart" - case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"): - actual = "wait" - default: - return "", nil - } - if index >= len(actions) { - t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined) - } - action := actions[index] - index++ - if action.cmd != actual { - t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined) - } - if action.sideEffect != nil { - action.sideEffect() - } - return action.out, action.err - } - orch.SetCommandOverrides(dispatch, dispatch) - return orch -} - -// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step. -// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T). -// Why: helper branches decide whether Ananke restarts a node agent, so both -// positive and negative cases need direct coverage. 
-func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) { - t.Run("health error without output is preserved", func(t *testing.T) { - orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ - { - match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"), - err: errors.New("network timeout"), - }, - }) - healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07") - if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") { - t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err) - } - }) - - for _, tc := range []struct { - name string - err error - want bool - }{ - {name: "nil", err: nil, want: false}, - {name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true}, - {name: "not repairable", err: errors.New("forbidden"), want: false}, - } { - t.Run(tc.name, func(t *testing.T) { - if got := isRepairableKubeletProxyErr(tc.err); got != tc.want { - t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want) - } - }) - } -} - // TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step. // Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T). // Why: proves the daemon reports each failed sub-repair together instead of