test: cover kubelet proxy autoheal

This commit is contained in:
codex 2026-05-17 04:40:17 -03:00
parent 0b4b05233e
commit e3afc9ea7b
2 changed files with 215 additions and 5 deletions

View File

@ -275,11 +275,7 @@ func (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, err
} }
healthy, checkErr = o.kubeletProxyHealthy(ctx, node.Name) healthy, checkErr = o.kubeletProxyHealthy(ctx, node.Name)
if !healthy { if !healthy {
if checkErr != nil { errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", node.Name, checkErr))
errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", node.Name, checkErr))
} else {
errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart", node.Name))
}
continue continue
} }
if !node.Unschedulable { if !node.Unschedulable {

View File

@ -4,6 +4,7 @@ import (
"context" "context"
"encoding/base64" "encoding/base64"
"errors" "errors"
"fmt"
"io" "io"
"log" "log"
"path/filepath" "path/filepath"
@ -305,6 +306,219 @@ func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) {
} }
} }
// TestRepairBrokenKubeletProxiesFailureBranches exercises the skip and failure
// paths of repairBrokenKubeletProxies, one subtest per branch.
// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T).
// Why: auto-repair must report exact blockers without broadening into unsafe
// node restarts for unrelated kubectl or access failures.
func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) {
	ctx := context.Background()

	t.Run("dry run skips", func(t *testing.T) {
		o := buildOrchestratorWithStubs(t, config.Config{}, nil)
		o.runner.DryRun = true
		count, err := o.repairBrokenKubeletProxies(ctx)
		if count == 0 && err == nil {
			return
		}
		t.Fatalf("expected dry-run skip, got repaired=%d err=%v", count, err)
	})

	t.Run("node query error surfaces", func(t *testing.T) {
		stubs := []commandStub{{
			match: matchContains("kubectl", "get nodes -o json"),
			err:   errors.New("api down"),
		}}
		o := buildOrchestratorWithStubs(t, config.Config{}, stubs)
		_, err := o.repairBrokenKubeletProxies(ctx)
		if err == nil || !strings.Contains(err.Error(), "query nodes") {
			t.Fatalf("expected node query error, got %v", err)
		}
	})

	t.Run("node json decode error surfaces", func(t *testing.T) {
		stubs := []commandStub{{
			match: matchContains("kubectl", "get nodes -o json"),
			out:   `{bad json`,
		}}
		o := buildOrchestratorWithStubs(t, config.Config{}, stubs)
		_, err := o.readyNodeCandidates(ctx)
		if err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})

	// proxyDown scripts a health probe that answers with "502 bad gateway"
	// output and a non-nil exit error; every scripted repair below starts
	// with it. A fresh value is built per use so no error is shared.
	proxyDown := func() kubeletProxyRepairAction {
		return kubeletProxyRepairAction{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")}
	}

	// uncordoned is flipped only by the restart-failure script and inspected
	// only by that case's verify hook.
	uncordoned := false

	// Each case runs the repair against a scripted command sequence and
	// expects zero repaired nodes plus an error containing wantErr. failFmt
	// is the failure message format; verify, when set, runs extra checks
	// after the error assertion passes.
	cases := []struct {
		name    string
		cfg     config.Config
		script  []kubeletProxyRepairAction
		wantErr string
		failFmt string
		verify  func(t *testing.T)
	}{
		{
			name: "non repairable health error is reported",
			script: []kubeletProxyRepairAction{
				{cmd: "health", out: "forbidden", err: errors.New("forbidden")},
			},
			wantErr: "proxy health check",
			failFmt: "expected health check error, repaired=%d err=%v",
		},
		{
			name: "unmanaged broken node is reported",
			cfg:  config.Config{SSHManagedNodes: []string{"other"}},
			script: []kubeletProxyRepairAction{
				proxyDown(),
			},
			wantErr: "not SSH-managed",
			failFmt: "expected unmanaged node error, repaired=%d err=%v",
		},
		{
			name: "cordon failure is reported",
			script: []kubeletProxyRepairAction{
				proxyDown(),
				{cmd: "cordon", err: errors.New("cordon denied")},
			},
			wantErr: "cordon before kubelet restart",
			failFmt: "expected cordon error, repaired=%d err=%v",
		},
		{
			name: "restart failure uncordons schedulable node",
			script: []kubeletProxyRepairAction{
				proxyDown(),
				{cmd: "cordon"},
				{cmd: "restart", err: errors.New("sudo rejected")},
				{cmd: "uncordon", sideEffect: func() { uncordoned = true }},
			},
			wantErr: "restart k3s-agent",
			failFmt: "expected restart error, repaired=%d err=%v",
			verify: func(t *testing.T) {
				if !uncordoned {
					t.Fatalf("expected best-effort uncordon after restart failure")
				}
			},
		},
		{
			name: "wait failure is reported",
			script: []kubeletProxyRepairAction{
				proxyDown(),
				{cmd: "cordon"},
				{cmd: "restart"},
				{cmd: "wait", err: errors.New("not ready")},
			},
			wantErr: "wait Ready",
			failFmt: "expected wait error, repaired=%d err=%v",
		},
		{
			name: "post restart health failure is reported",
			script: []kubeletProxyRepairAction{
				proxyDown(),
				{cmd: "cordon"},
				{cmd: "restart"},
				{cmd: "wait"},
				{cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")},
			},
			wantErr: "proxy still broken",
			failFmt: "expected post-restart health error, repaired=%d err=%v",
		},
		{
			name: "uncordon failure is reported",
			script: []kubeletProxyRepairAction{
				proxyDown(),
				{cmd: "cordon"},
				{cmd: "restart"},
				{cmd: "wait"},
				{cmd: "health", out: "ok"},
				{cmd: "uncordon", err: errors.New("uncordon denied")},
			},
			wantErr: "uncordon after kubelet proxy repair",
			failFmt: "expected uncordon error, repaired=%d err=%v",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// The stub is built with the subtest's t so script violations
			// fail this subtest rather than the parent.
			o := kubeletProxyRepairStub(t, tc.cfg, false, tc.script)
			count, err := o.repairBrokenKubeletProxies(ctx)
			if count != 0 || err == nil || !strings.Contains(err.Error(), tc.wantErr) {
				t.Fatalf(tc.failFmt, count, err)
			}
			if tc.verify != nil {
				tc.verify(t)
			}
		})
	}
}
// kubeletProxyRepairAction scripts one expected command in a kubelet proxy
// repair sequence together with the canned result returned for it.
type kubeletProxyRepairAction struct {
	// cmd is the label of the command expected at this position: "health",
	// "cordon", "restart", "wait", or "uncordon".
	cmd string
	// out is the command output handed back to the caller.
	out string
	// err is the error handed back to the caller; nil scripts a success.
	err error
	// sideEffect, when non-nil, is invoked just before the result is returned.
	sideEffect func()
}
// kubeletProxyRepairStub builds an Orchestrator whose commands are answered by
// a strictly ordered script of kubelet proxy repair actions.
// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator.
// Why: keeps the kubelet proxy repair branch tests readable while preserving
// strict command order for safety-sensitive node operations.
//
// The node listing always reports a single Ready node named titan-07 whose
// spec.unschedulable value is the unschedulable argument. Health, cordon,
// restart, wait, and uncordon commands for that node must arrive in the order
// given by actions; any other command succeeds with empty output. The test
// fails when a scripted command arrives out of order, arrives after the script
// is exhausted, or when an otherwise passing test leaves scripted actions
// unconsumed.
func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator {
	t.Helper()
	orch := buildOrchestratorWithStubs(t, cfg, nil)
	index := 0

	// A partially consumed script means the repair flow stopped issuing
	// commands earlier than the test expects. Skip the check when the test
	// already failed so the original failure is not buried in noise.
	t.Cleanup(func() {
		if t.Failed() || index >= len(actions) {
			return
		}
		t.Errorf("%d scripted action(s) never ran; next expected %s", len(actions)-index, actions[index].cmd)
	})

	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := strings.Join(args, " ")
		// The node listing is answered unconditionally and does not consume a
		// scripted action.
		if name == "kubectl" && strings.Contains(joined, "get nodes -o json") {
			return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil
		}

		actual := ""
		switch {
		case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
			actual = "health"
		// "uncordon titan-07" contains "cordon titan-07", so the uncordon
		// case must be tested first.
		case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
			actual = "uncordon"
		case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
			actual = "cordon"
		case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
			actual = "restart"
		case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
			actual = "wait"
		default:
			// Commands outside the repair script succeed silently.
			return "", nil
		}

		if index >= len(actions) {
			t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined)
		}
		action := actions[index]
		index++
		if action.cmd != actual {
			t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined)
		}
		if action.sideEffect != nil {
			action.sideEffect()
		}
		return action.out, action.err
	}

	// The same dispatcher is installed for both override slots.
	orch.SetCommandOverrides(dispatch, dispatch)
	return orch
}
// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step.
// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T).
// Why: helper branches decide whether Ananke restarts a node agent, so both
// positive and negative cases need direct coverage.
func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) {
t.Run("health error without output is preserved", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"),
err: errors.New("network timeout"),
},
})
healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07")
if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") {
t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err)
}
})
for _, tc := range []struct {
name string
err error
want bool
}{
{name: "nil", err: nil, want: false},
{name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true},
{name: "not repairable", err: errors.New("forbidden"), want: false},
} {
t.Run(tc.name, func(t *testing.T) {
if got := isRepairableKubeletProxyErr(tc.err); got != tc.want {
t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want)
}
})
}
}
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step. // TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T). // Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of // Why: proves the daemon reports each failed sub-repair together instead of