package cluster

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
)

// TestRepairBrokenKubeletProxiesRestartsSchedulableNode verifies that a Ready,
// schedulable, SSH-managed node whose kubelet proxy health check fails gets
// cordoned, has k3s-agent restarted, is waited on, and is uncordoned again.
// Signature: TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T).
// Why: Jenkins agents depend on apiserver-to-kubelet exec; a Ready node with a
// broken proxy needs a narrow k3s-agent restart, not a broad cluster restart.
func TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T) {
	cfg := config.Config{SSHManagedNodes: []string{"titan-07"}}
	orch := buildOrchestratorWithStubs(t, cfg, nil)
	healthChecks := 0
	cordoned := false
	restarted := false
	waited := false
	uncordoned := false
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
			// titan-07 is Ready and schedulable; titan-18 reports Ready=Unknown.
			return `{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":false},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
			healthChecks++
			// Only the first probe fails; later probes report a healthy proxy.
			if healthChecks == 1 {
				return "proxy error from 127.0.0.1:6443 while dialing 192.168.22.33:10250, code 502: 502 Bad Gateway", errors.New("exit status 1")
			}
			return "ok", nil
		// The uncordon case must precede the cordon case: "uncordon titan-07"
		// contains "cordon titan-07" as a substring.
		case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
			uncordoned = true
			return "", nil
		case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
			cordoned = true
			return "", nil
		case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
			restarted = true
			return "", nil
		case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
			waited = true
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)

	repaired, err := orch.repairBrokenKubeletProxies(context.Background())
	if err != nil {
		t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
	}
	if repaired != 1 {
		t.Fatalf("expected one repaired node, got %d", repaired)
	}
	// An ordered slice (rather than a map) keeps the first reported missing
	// action stable from run to run.
	for _, step := range []struct {
		name string
		done bool
	}{
		{name: "cordoned", done: cordoned},
		{name: "restarted", done: restarted},
		{name: "waited", done: waited},
		{name: "uncordoned", done: uncordoned},
	} {
		if !step.done {
			t.Fatalf("expected %s action", step.name)
		}
	}
	if healthChecks != 2 {
		t.Fatalf("expected health check before and after repair, got %d", healthChecks)
	}
}

// TestRepairBrokenKubeletProxiesPreservesExistingCordon verifies that repairing
// a node that is already unschedulable restarts k3s-agent without issuing any
// cordon or uncordon command.
// Signature: TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T).
// Why: nodes intentionally kept out of service must not be accidentally
// uncordoned just because Ananke repaired their kubelet proxy.
func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) {
	cfg := config.Config{SSHManagedNodes: []string{"titan-18"}}
	orch := buildOrchestratorWithStubs(t, cfg, nil)
	healthChecks := 0
	cordonTouched := false
	restarted := false
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
			return `{"items":[{"metadata":{"name":"titan-18"},"spec":{"unschedulable":true},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-18/proxy/healthz"):
			healthChecks++
			// Only the first probe fails; later probes report a healthy proxy.
			if healthChecks == 1 {
				return "error trying to reach service: proxy error from 127.0.0.1:6443 while dialing 192.168.22.46:10250", errors.New("exit status 1")
			}
			return "ok", nil
		case name == "kubectl" && strings.Contains(joined, "cordon titan-18"):
			// "uncordon titan-18" contains "cordon titan-18", so this single
			// substring match catches both cordon and uncordon commands.
			cordonTouched = true
			return "", nil
		case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
			restarted = true
			return "", nil
		case name == "kubectl" && strings.Contains(joined, "wait node/titan-18 --for=condition=Ready --timeout=120s"):
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)

	repaired, err := orch.repairBrokenKubeletProxies(context.Background())
	if err != nil {
		t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
	}
	if repaired != 1 {
		t.Fatalf("expected one repaired node, got %d", repaired)
	}
	if !restarted {
		t.Fatalf("expected k3s-agent restart")
	}
	if cordonTouched {
		t.Fatalf("did not expect cordon state to change for already-unschedulable node")
	}
}

// TestRepairBrokenKubeletProxiesFailureBranches exercises the early-exit and
// error paths of repairBrokenKubeletProxies (and the node-list decode path of
// readyNodeCandidates), checking the repaired count and the error text.
// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T).
// Why: auto-repair must report exact blockers without broadening into unsafe
// node restarts for unrelated kubectl or access failures.
func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if err != nil || repaired != 0 {
			t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err)
		}
	})

	t.Run("node query error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")},
		})
		_, err := orch.repairBrokenKubeletProxies(context.Background())
		if err == nil || !strings.Contains(err.Error(), "query nodes") {
			t.Fatalf("expected node query error, got %v", err)
		}
	})

	t.Run("node json decode error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
		})
		_, err := orch.readyNodeCandidates(context.Background())
		if err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})

	t.Run("non repairable health error is reported", func(t *testing.T) {
		orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "forbidden", err: errors.New("forbidden")},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") {
			t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err)
		}
	})

	t.Run("unmanaged broken node is reported", func(t *testing.T) {
		// The stubbed node is titan-07, which is absent from SSHManagedNodes here.
		cfg := config.Config{SSHManagedNodes: []string{"other"}}
		orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") {
			t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err)
		}
	})

	t.Run("cordon failure is reported", func(t *testing.T) {
		orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
			{cmd: "cordon", err: errors.New("cordon denied")},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") {
			t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err)
		}
	})

	t.Run("restart failure uncordons schedulable node", func(t *testing.T) {
		uncordoned := false
		orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
			{cmd: "cordon"},
			{cmd: "restart", err: errors.New("sudo rejected")},
			{cmd: "uncordon", sideEffect: func() { uncordoned = true }},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "restart k3s-agent") {
			t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err)
		}
		if !uncordoned {
			t.Fatalf("expected best-effort uncordon after restart failure")
		}
	})

	t.Run("wait failure is reported", func(t *testing.T) {
		orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
			{cmd: "cordon"},
			{cmd: "restart"},
			{cmd: "wait", err: errors.New("not ready")},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") {
			t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err)
		}
	})

	t.Run("post restart health failure is reported", func(t *testing.T) {
		orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
			{cmd: "cordon"},
			{cmd: "restart"},
			{cmd: "wait"},
			{cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") {
			t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err)
		}
	})

	t.Run("uncordon failure is reported", func(t *testing.T) {
		orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
			{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
			{cmd: "cordon"},
			{cmd: "restart"},
			{cmd: "wait"},
			{cmd: "health", out: "ok"},
			{cmd: "uncordon", err: errors.New("uncordon denied")},
		})
		repaired, err := orch.repairBrokenKubeletProxies(context.Background())
		if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") {
			t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err)
		}
	})
}

// kubeletProxyRepairAction is one scripted step consumed by
// kubeletProxyRepairStub.
type kubeletProxyRepairAction struct {
	// cmd is the expected command class: "health", "cordon", "restart",
	// "wait", or "uncordon".
	cmd string
	// out is the output returned to the caller for this step.
	out string
	// err is the error returned to the caller for this step.
	err error
	// sideEffect, when non-nil, runs just before the step's result is returned.
	sideEffect func()
}

// kubeletProxyRepairStub builds an orchestrator whose command overrides report
// a single Ready node named titan-07 and replay the scripted actions in order.
// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator.
// Why: keeps the kubelet proxy repair branch tests readable while preserving
// strict command order for safety-sensitive node operations.
//
// The node-list query is answered on every call and does not consume an
// action. Each recognised health/cordon/uncordon/restart/wait command must
// match the next scripted action; a mismatch, or a recognised command arriving
// after the script is exhausted, fails the test. Any other command returns
// empty output and a nil error.
func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator {
	t.Helper()
	orch := buildOrchestratorWithStubs(t, cfg, nil)
	index := 0
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(joined, "get nodes -o json") {
			return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil
		}

		// Classify the command. The uncordon case must precede the cordon
		// case: "uncordon titan-07" contains "cordon titan-07" as a substring.
		actual := ""
		switch {
		case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
			actual = "health"
		case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
			actual = "uncordon"
		case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
			actual = "cordon"
		case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
			actual = "restart"
		case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
			actual = "wait"
		default:
			return "", nil
		}

		if index >= len(actions) {
			t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined)
		}
		action := actions[index]
		index++
		if action.cmd != actual {
			t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined)
		}
		if action.sideEffect != nil {
			action.sideEffect()
		}
		return action.out, action.err
	}
	// The same dispatcher is installed for both override slots.
	orch.SetCommandOverrides(dispatch, dispatch)
	return orch
}

// TestKubeletProxyHealthAndRepairableErrorHelpers covers kubeletProxyHealthy
// when the command fails without output, and isRepairableKubeletProxyErr for
// nil, repairable, and non-repairable errors.
// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T).
// Why: helper branches decide whether Ananke restarts a node agent, so both
// positive and negative cases need direct coverage.
func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) {
	t.Run("health error without output is preserved", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"),
				err:   errors.New("network timeout"),
			},
		})
		healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07")
		if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") {
			t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err)
		}
	})

	for _, tc := range []struct {
		name string
		err  error
		want bool
	}{
		{name: "nil", err: nil, want: false},
		{name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true},
		{name: "not repairable", err: errors.New("forbidden"), want: false},
	} {
		t.Run(tc.name, func(t *testing.T) {
			if got := isRepairableKubeletProxyErr(tc.err); got != tc.want {
				t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want)
			}
		})
	}
}