test: split kubelet proxy autoheal coverage
This commit is contained in:
parent
e3afc9ea7b
commit
3b5cacdc34
340
internal/cluster/orchestrator_autorepair_proxy_test.go
Normal file
340
internal/cluster/orchestrator_autorepair_proxy_test.go
Normal file
@ -0,0 +1,340 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
)
|
||||
|
||||
// TestRepairBrokenKubeletProxiesRestartsSchedulableNode runs one orchestration or CLI step.
|
||||
// Signature: TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T).
|
||||
// Why: Jenkins agents depend on apiserver-to-kubelet exec; a Ready node with a
|
||||
// broken proxy needs a narrow k3s-agent restart, not a broad cluster restart.
|
||||
func TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T) {
|
||||
cfg := config.Config{SSHManagedNodes: []string{"titan-07"}}
|
||||
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||
|
||||
healthChecks := 0
|
||||
cordoned := false
|
||||
restarted := false
|
||||
waited := false
|
||||
uncordoned := false
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
joined := strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
|
||||
return `{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":false},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`, nil
|
||||
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
|
||||
healthChecks++
|
||||
if healthChecks == 1 {
|
||||
return "proxy error from 127.0.0.1:6443 while dialing 192.168.22.33:10250, code 502: 502 Bad Gateway", errors.New("exit status 1")
|
||||
}
|
||||
return "ok", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
|
||||
uncordoned = true
|
||||
return "", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
|
||||
cordoned = true
|
||||
return "", nil
|
||||
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
|
||||
restarted = true
|
||||
return "", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
|
||||
waited = true
|
||||
return "", nil
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
|
||||
}
|
||||
if repaired != 1 {
|
||||
t.Fatalf("expected one repaired node, got %d", repaired)
|
||||
}
|
||||
for name, ok := range map[string]bool{
|
||||
"cordoned": cordoned,
|
||||
"restarted": restarted,
|
||||
"waited": waited,
|
||||
"uncordoned": uncordoned,
|
||||
} {
|
||||
if !ok {
|
||||
t.Fatalf("expected %s action", name)
|
||||
}
|
||||
}
|
||||
if healthChecks != 2 {
|
||||
t.Fatalf("expected health check before and after repair, got %d", healthChecks)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRepairBrokenKubeletProxiesPreservesExistingCordon runs one orchestration or CLI step.
|
||||
// Signature: TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T).
|
||||
// Why: nodes intentionally kept out of service must not be accidentally
|
||||
// uncordoned just because Ananke repaired their kubelet proxy.
|
||||
func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) {
|
||||
cfg := config.Config{SSHManagedNodes: []string{"titan-18"}}
|
||||
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||
|
||||
healthChecks := 0
|
||||
cordonTouched := false
|
||||
restarted := false
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
joined := strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
|
||||
return `{"items":[{"metadata":{"name":"titan-18"},"spec":{"unschedulable":true},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
||||
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-18/proxy/healthz"):
|
||||
healthChecks++
|
||||
if healthChecks == 1 {
|
||||
return "error trying to reach service: proxy error from 127.0.0.1:6443 while dialing 192.168.22.46:10250", errors.New("exit status 1")
|
||||
}
|
||||
return "ok", nil
|
||||
case name == "kubectl" && (strings.Contains(joined, "cordon titan-18") || strings.Contains(joined, "uncordon titan-18")):
|
||||
cordonTouched = true
|
||||
return "", nil
|
||||
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
|
||||
restarted = true
|
||||
return "", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "wait node/titan-18 --for=condition=Ready --timeout=120s"):
|
||||
return "", nil
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
|
||||
}
|
||||
if repaired != 1 {
|
||||
t.Fatalf("expected one repaired node, got %d", repaired)
|
||||
}
|
||||
if !restarted {
|
||||
t.Fatalf("expected k3s-agent restart")
|
||||
}
|
||||
if cordonTouched {
|
||||
t.Fatalf("did not expect cordon state to change for already-unschedulable node")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRepairBrokenKubeletProxiesFailureBranches runs one orchestration or CLI step.
|
||||
// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T).
|
||||
// Why: auto-repair must report exact blockers without broadening into unsafe
|
||||
// node restarts for unrelated kubectl or access failures.
|
||||
func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) {
|
||||
t.Run("dry run skips", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||
orch.runner.DryRun = true
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err != nil || repaired != 0 {
|
||||
t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("node query error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")},
|
||||
})
|
||||
_, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err == nil || !strings.Contains(err.Error(), "query nodes") {
|
||||
t.Fatalf("expected node query error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("node json decode error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
|
||||
})
|
||||
_, err := orch.readyNodeCandidates(context.Background())
|
||||
if err == nil || !strings.Contains(err.Error(), "decode nodes") {
|
||||
t.Fatalf("expected decode error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("non repairable health error is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "forbidden", err: errors.New("forbidden")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") {
|
||||
t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("unmanaged broken node is reported", func(t *testing.T) {
|
||||
cfg := config.Config{SSHManagedNodes: []string{"other"}}
|
||||
orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") {
|
||||
t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("cordon failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon", err: errors.New("cordon denied")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") {
|
||||
t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("restart failure uncordons schedulable node", func(t *testing.T) {
|
||||
uncordoned := false
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart", err: errors.New("sudo rejected")},
|
||||
{cmd: "uncordon", sideEffect: func() { uncordoned = true }},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "restart k3s-agent") {
|
||||
t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
if !uncordoned {
|
||||
t.Fatalf("expected best-effort uncordon after restart failure")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wait failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart"},
|
||||
{cmd: "wait", err: errors.New("not ready")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") {
|
||||
t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("post restart health failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart"},
|
||||
{cmd: "wait"},
|
||||
{cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") {
|
||||
t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("uncordon failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart"},
|
||||
{cmd: "wait"},
|
||||
{cmd: "health", out: "ok"},
|
||||
{cmd: "uncordon", err: errors.New("uncordon denied")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") {
|
||||
t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// kubeletProxyRepairAction is one scripted step consumed by
// kubeletProxyRepairStub.
type kubeletProxyRepairAction struct {
	// cmd is the label of the command expected at this step; the stub
	// recognizes "health", "cordon", "uncordon", "restart" and "wait".
	cmd string
	// out is the command output handed back to the caller.
	out string
	// err is the command error handed back to the caller (nil for success).
	err error
	// sideEffect, when non-nil, is invoked right before the result is returned.
	sideEffect func()
}
|
||||
|
||||
// kubeletProxyRepairStub runs one orchestration or CLI step.
|
||||
// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator.
|
||||
// Why: keeps the kubelet proxy repair branch tests readable while preserving
|
||||
// strict command order for safety-sensitive node operations.
|
||||
func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator {
|
||||
t.Helper()
|
||||
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||
index := 0
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
joined := strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(joined, "get nodes -o json") {
|
||||
return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil
|
||||
}
|
||||
actual := ""
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
|
||||
actual = "health"
|
||||
case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
|
||||
actual = "uncordon"
|
||||
case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
|
||||
actual = "cordon"
|
||||
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
|
||||
actual = "restart"
|
||||
case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
|
||||
actual = "wait"
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
if index >= len(actions) {
|
||||
t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined)
|
||||
}
|
||||
action := actions[index]
|
||||
index++
|
||||
if action.cmd != actual {
|
||||
t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined)
|
||||
}
|
||||
if action.sideEffect != nil {
|
||||
action.sideEffect()
|
||||
}
|
||||
return action.out, action.err
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
return orch
|
||||
}
|
||||
|
||||
// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step.
|
||||
// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T).
|
||||
// Why: helper branches decide whether Ananke restarts a node agent, so both
|
||||
// positive and negative cases need direct coverage.
|
||||
func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) {
|
||||
t.Run("health error without output is preserved", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"),
|
||||
err: errors.New("network timeout"),
|
||||
},
|
||||
})
|
||||
healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07")
|
||||
if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") {
|
||||
t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err)
|
||||
}
|
||||
})
|
||||
|
||||
for _, tc := range []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{name: "nil", err: nil, want: false},
|
||||
{name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true},
|
||||
{name: "not repairable", err: errors.New("forbidden"), want: false},
|
||||
} {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := isRepairableKubeletProxyErr(tc.err); got != tc.want {
|
||||
t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"path/filepath"
|
||||
@ -191,334 +190,6 @@ func TestRunPostStartAutoHealDryRun(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRepairBrokenKubeletProxiesRestartsSchedulableNode runs one orchestration or CLI step.
|
||||
// Signature: TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T).
|
||||
// Why: Jenkins agents depend on apiserver-to-kubelet exec; a Ready node with a
|
||||
// broken proxy needs a narrow k3s-agent restart, not a broad cluster restart.
|
||||
func TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T) {
|
||||
cfg := config.Config{SSHManagedNodes: []string{"titan-07"}}
|
||||
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||
|
||||
healthChecks := 0
|
||||
cordoned := false
|
||||
restarted := false
|
||||
waited := false
|
||||
uncordoned := false
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
joined := strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
|
||||
return `{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":false},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`, nil
|
||||
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
|
||||
healthChecks++
|
||||
if healthChecks == 1 {
|
||||
return "proxy error from 127.0.0.1:6443 while dialing 192.168.22.33:10250, code 502: 502 Bad Gateway", errors.New("exit status 1")
|
||||
}
|
||||
return "ok", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
|
||||
uncordoned = true
|
||||
return "", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
|
||||
cordoned = true
|
||||
return "", nil
|
||||
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
|
||||
restarted = true
|
||||
return "", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
|
||||
waited = true
|
||||
return "", nil
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
|
||||
}
|
||||
if repaired != 1 {
|
||||
t.Fatalf("expected one repaired node, got %d", repaired)
|
||||
}
|
||||
for name, ok := range map[string]bool{
|
||||
"cordoned": cordoned,
|
||||
"restarted": restarted,
|
||||
"waited": waited,
|
||||
"uncordoned": uncordoned,
|
||||
} {
|
||||
if !ok {
|
||||
t.Fatalf("expected %s action", name)
|
||||
}
|
||||
}
|
||||
if healthChecks != 2 {
|
||||
t.Fatalf("expected health check before and after repair, got %d", healthChecks)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRepairBrokenKubeletProxiesPreservesExistingCordon runs one orchestration or CLI step.
|
||||
// Signature: TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T).
|
||||
// Why: nodes intentionally kept out of service must not be accidentally
|
||||
// uncordoned just because Ananke repaired their kubelet proxy.
|
||||
func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) {
|
||||
cfg := config.Config{SSHManagedNodes: []string{"titan-18"}}
|
||||
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||
|
||||
healthChecks := 0
|
||||
cordonTouched := false
|
||||
restarted := false
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
joined := strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
|
||||
return `{"items":[{"metadata":{"name":"titan-18"},"spec":{"unschedulable":true},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
||||
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-18/proxy/healthz"):
|
||||
healthChecks++
|
||||
if healthChecks == 1 {
|
||||
return "error trying to reach service: proxy error from 127.0.0.1:6443 while dialing 192.168.22.46:10250", errors.New("exit status 1")
|
||||
}
|
||||
return "ok", nil
|
||||
case name == "kubectl" && (strings.Contains(joined, "cordon titan-18") || strings.Contains(joined, "uncordon titan-18")):
|
||||
cordonTouched = true
|
||||
return "", nil
|
||||
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
|
||||
restarted = true
|
||||
return "", nil
|
||||
case name == "kubectl" && strings.Contains(joined, "wait node/titan-18 --for=condition=Ready --timeout=120s"):
|
||||
return "", nil
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
|
||||
}
|
||||
if repaired != 1 {
|
||||
t.Fatalf("expected one repaired node, got %d", repaired)
|
||||
}
|
||||
if !restarted {
|
||||
t.Fatalf("expected k3s-agent restart")
|
||||
}
|
||||
if cordonTouched {
|
||||
t.Fatalf("did not expect cordon state to change for already-unschedulable node")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRepairBrokenKubeletProxiesFailureBranches runs one orchestration or CLI step.
|
||||
// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T).
|
||||
// Why: auto-repair must report exact blockers without broadening into unsafe
|
||||
// node restarts for unrelated kubectl or access failures.
|
||||
func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) {
|
||||
t.Run("dry run skips", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||
orch.runner.DryRun = true
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err != nil || repaired != 0 {
|
||||
t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("node query error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")},
|
||||
})
|
||||
_, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if err == nil || !strings.Contains(err.Error(), "query nodes") {
|
||||
t.Fatalf("expected node query error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("node json decode error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
|
||||
})
|
||||
_, err := orch.readyNodeCandidates(context.Background())
|
||||
if err == nil || !strings.Contains(err.Error(), "decode nodes") {
|
||||
t.Fatalf("expected decode error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("non repairable health error is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "forbidden", err: errors.New("forbidden")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") {
|
||||
t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("unmanaged broken node is reported", func(t *testing.T) {
|
||||
cfg := config.Config{SSHManagedNodes: []string{"other"}}
|
||||
orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") {
|
||||
t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("cordon failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon", err: errors.New("cordon denied")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") {
|
||||
t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("restart failure uncordons schedulable node", func(t *testing.T) {
|
||||
uncordoned := false
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart", err: errors.New("sudo rejected")},
|
||||
{cmd: "uncordon", sideEffect: func() { uncordoned = true }},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "restart k3s-agent") {
|
||||
t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
if !uncordoned {
|
||||
t.Fatalf("expected best-effort uncordon after restart failure")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wait failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart"},
|
||||
{cmd: "wait", err: errors.New("not ready")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") {
|
||||
t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("post restart health failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart"},
|
||||
{cmd: "wait"},
|
||||
{cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") {
|
||||
t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("uncordon failure is reported", func(t *testing.T) {
|
||||
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
|
||||
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
|
||||
{cmd: "cordon"},
|
||||
{cmd: "restart"},
|
||||
{cmd: "wait"},
|
||||
{cmd: "health", out: "ok"},
|
||||
{cmd: "uncordon", err: errors.New("uncordon denied")},
|
||||
})
|
||||
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
|
||||
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") {
|
||||
t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// kubeletProxyRepairAction is one scripted step consumed by
// kubeletProxyRepairStub.
type kubeletProxyRepairAction struct {
	// cmd is the label of the command expected at this step; the stub
	// recognizes "health", "cordon", "uncordon", "restart" and "wait".
	cmd string
	// out is the command output handed back to the caller.
	out string
	// err is the command error handed back to the caller (nil for success).
	err error
	// sideEffect, when non-nil, is invoked right before the result is returned.
	sideEffect func()
}
|
||||
|
||||
// kubeletProxyRepairStub runs one orchestration or CLI step.
|
||||
// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator.
|
||||
// Why: keeps the kubelet proxy repair branch tests readable while preserving
|
||||
// strict command order for safety-sensitive node operations.
|
||||
func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator {
|
||||
t.Helper()
|
||||
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||
index := 0
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
joined := strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(joined, "get nodes -o json") {
|
||||
return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil
|
||||
}
|
||||
actual := ""
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
|
||||
actual = "health"
|
||||
case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
|
||||
actual = "uncordon"
|
||||
case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
|
||||
actual = "cordon"
|
||||
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
|
||||
actual = "restart"
|
||||
case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
|
||||
actual = "wait"
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
if index >= len(actions) {
|
||||
t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined)
|
||||
}
|
||||
action := actions[index]
|
||||
index++
|
||||
if action.cmd != actual {
|
||||
t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined)
|
||||
}
|
||||
if action.sideEffect != nil {
|
||||
action.sideEffect()
|
||||
}
|
||||
return action.out, action.err
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
return orch
|
||||
}
|
||||
|
||||
// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step.
|
||||
// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T).
|
||||
// Why: helper branches decide whether Ananke restarts a node agent, so both
|
||||
// positive and negative cases need direct coverage.
|
||||
func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) {
|
||||
t.Run("health error without output is preserved", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"),
|
||||
err: errors.New("network timeout"),
|
||||
},
|
||||
})
|
||||
healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07")
|
||||
if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") {
|
||||
t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err)
|
||||
}
|
||||
})
|
||||
|
||||
for _, tc := range []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{name: "nil", err: nil, want: false},
|
||||
{name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true},
|
||||
{name: "not repairable", err: errors.New("forbidden"), want: false},
|
||||
} {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := isRepairableKubeletProxyErr(tc.err); got != tc.want {
|
||||
t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
|
||||
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
|
||||
// Why: proves the daemon reports each failed sub-repair together instead of
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user