diff --git a/internal/cluster/orchestrator_autorepair.go b/internal/cluster/orchestrator_autorepair.go index 7be7d91..8f392f3 100644 --- a/internal/cluster/orchestrator_autorepair.go +++ b/internal/cluster/orchestrator_autorepair.go @@ -275,11 +275,7 @@ func (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, err } healthy, checkErr = o.kubeletProxyHealthy(ctx, node.Name) if !healthy { - if checkErr != nil { - errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", node.Name, checkErr)) - } else { - errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart", node.Name)) - } + errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", node.Name, checkErr)) continue } if !node.Unschedulable { diff --git a/internal/cluster/orchestrator_autorepair_test.go b/internal/cluster/orchestrator_autorepair_test.go index db78b5a..86a2655 100644 --- a/internal/cluster/orchestrator_autorepair_test.go +++ b/internal/cluster/orchestrator_autorepair_test.go @@ -4,6 +4,7 @@ import ( "context" "encoding/base64" "errors" + "fmt" "io" "log" "path/filepath" @@ -305,6 +306,219 @@ func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) { } } +// TestRepairBrokenKubeletProxiesFailureBranches runs one orchestration or CLI step. +// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T). +// Why: auto-repair must report exact blockers without broadening into unsafe +// node restarts for unrelated kubectl or access failures. +func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) { + t.Run("dry run skips", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, nil) + orch.runner.DryRun = true + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if err != nil || repaired != 0 { + t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err) + } + }) + + t.Run("node query error surfaces", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ + {match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")}, + }) + _, err := orch.repairBrokenKubeletProxies(context.Background()) + if err == nil || !strings.Contains(err.Error(), "query nodes") { + t.Fatalf("expected node query error, got %v", err) + } + }) + + t.Run("node json decode error surfaces", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ + {match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`}, + }) + _, err := orch.readyNodeCandidates(context.Background()) + if err == nil || !strings.Contains(err.Error(), "decode nodes") { + t.Fatalf("expected decode error, got %v", err) + } + }) + + t.Run("non repairable health error is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "forbidden", err: errors.New("forbidden")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") { + t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("unmanaged broken node is reported", func(t *testing.T) { + cfg := config.Config{SSHManagedNodes: []string{"other"}} + orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") { + t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("cordon failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon", err: errors.New("cordon denied")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") { + t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("restart failure uncordons schedulable node", func(t *testing.T) { + uncordoned := false + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart", err: errors.New("sudo rejected")}, + {cmd: "uncordon", sideEffect: func() { uncordoned = true }}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "restart k3s-agent") { + t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err) + } + if !uncordoned { + t.Fatalf("expected best-effort uncordon after restart failure") + } + }) + + t.Run("wait failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart"}, + {cmd: "wait", err: errors.New("not ready")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") { + t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("post restart health failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart"}, + {cmd: "wait"}, + {cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") { + t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err) + } + }) + + t.Run("uncordon failure is reported", func(t *testing.T) { + orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{ + {cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}, + {cmd: "cordon"}, + {cmd: "restart"}, + {cmd: "wait"}, + {cmd: "health", out: "ok"}, + {cmd: "uncordon", err: errors.New("uncordon denied")}, + }) + repaired, err := orch.repairBrokenKubeletProxies(context.Background()) + if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") { + t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err) + } + }) +} + +type kubeletProxyRepairAction struct { + cmd string + out string + err error + sideEffect func() +} + +// kubeletProxyRepairStub runs one orchestration or CLI step. +// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator. +// Why: keeps the kubelet proxy repair branch tests readable while preserving +// strict command order for safety-sensitive node operations. +func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator { + t.Helper() + orch := buildOrchestratorWithStubs(t, cfg, nil) + index := 0 + dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { + joined := strings.Join(args, " ") + if name == "kubectl" && strings.Contains(joined, "get nodes -o json") { + return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil + } + actual := "" + switch { + case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"): + actual = "health" + case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"): + actual = "uncordon" + case name == "kubectl" && strings.Contains(joined, "cordon titan-07"): + actual = "cordon" + case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"): + actual = "restart" + case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"): + actual = "wait" + default: + return "", nil + } + if index >= len(actions) { + t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined) + } + action := actions[index] + index++ + if action.cmd != actual { + t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined) + } + if action.sideEffect != nil { + action.sideEffect() + } + return action.out, action.err + } + orch.SetCommandOverrides(dispatch, dispatch) + return orch +} + +// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step. +// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T). +// Why: helper branches decide whether Ananke restarts a node agent, so both +// positive and negative cases need direct coverage. +func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) { + t.Run("health error without output is preserved", func(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{ + { + match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"), + err: errors.New("network timeout"), + }, + }) + healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07") + if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") { + t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err) + } + }) + + for _, tc := range []struct { + name string + err error + want bool + }{ + {name: "nil", err: nil, want: false}, + {name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true}, + {name: "not repairable", err: errors.New("forbidden"), want: false}, + } { + t.Run(tc.name, func(t *testing.T) { + if got := isRepairableKubeletProxyErr(tc.err); got != tc.want { + t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want) + } + }) + } +} + // TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step. // Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T). // Why: proves the daemon reports each failed sub-repair together instead of