// Source: ananke/internal/cluster/orchestrator_autorepair_test.go (712 lines, 27 KiB, Go).

package cluster
import (
"context"
"encoding/base64"
"errors"
// Revision timestamp carried over from the history view: 2026-05-17 04:40:17 -03:00
"fmt"
"io"
"log"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes drives postStartAutoHeal
// against a stubbed cluster whose Vault reports sealed and whose node titan-22
// has an Unknown Ready condition while still holding a long-terminating pod.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
	cfg := config.Config{
		Startup: config.Startup{
			DeadNodeCleanupGraceSeconds: 300,
			RequiredNodeLabels: map[string]map[string]string{
				"titan-07": {"node-role.kubernetes.io/worker": "true"},
			},
		},
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
		log:    log.New(io.Discard, "", 0),
	}

	// Both pods carry a deletion timestamp ten minutes old, which is past the
	// 300 second grace configured above.
	oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
	unsealCalls := 0
	jobCreated := false
	reconciled := false
	deleted := map[string]bool{}
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		if name != "kubectl" {
			return "", nil
		}
		joined := strings.Join(args, " ")
		switch {
		case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
			return "", nil
		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			return "Running", nil
		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
			// Sealed until the first unseal call has been observed.
			if unsealCalls == 0 {
				return `{"initialized":true,"sealed":true}`, nil
			}
			return `{"initialized":true,"sealed":false}`, nil
		case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
			return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
		case strings.Contains(joined, "vault operator unseal"):
			unsealCalls++
			return "", nil
		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
			jobCreated = true
			return "", nil
		case strings.Contains(joined, "get nodes -o json"):
			return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
		case strings.Contains(joined, "get pods -A -o json"):
			// stale-pod sits on titan-22 (Ready=Unknown); healthy-node-pod sits on
			// titan-18, which the node list above does not report as unavailable.
			return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
		case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
			deleted["maintenance/stale-pod"] = true
			return "", nil
		case strings.Contains(joined, "-n logging delete pod healthy-node-pod"):
			// Record this deletion too; otherwise the negative assertion at the
			// end of the test could never fail.
			deleted["logging/healthy-node-pod"] = true
			return "", nil
		case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
			reconciled = true
			return "", nil
		case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
			return "", nil
		case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)

	if err := orch.postStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("postStartAutoHeal failed: %v", err)
	}
	if unsealCalls != 1 {
		t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
	}
	if !jobCreated {
		t.Fatalf("expected vault k8s auth config job to be created")
	}
	if !deleted["maintenance/stale-pod"] {
		t.Fatalf("expected stale unavailable-node pod to be deleted")
	}
	if !reconciled {
		t.Fatalf("expected flux reconcile request after repairs")
	}
	if deleted["logging/healthy-node-pod"] {
		t.Fatalf("did not expect terminating pod on healthy node to be deleted")
	}
}
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
unsealCalls := 0
jobCreated := false
reconciled := false
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[]}`, nil
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 0 {
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
}
if jobCreated {
t.Fatalf("did not expect vault auth config job creation")
}
if reconciled {
t.Fatalf("did not expect flux reconcile request for healthy cluster")
}
}
// TestRunPostStartAutoHealDryRun calls RunPostStartAutoHeal on an orchestrator
// whose runner has DryRun set and expects a nil error.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
	orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
	orch.runner.DryRun = true

	err := orch.RunPostStartAutoHeal(context.Background())
	if err != nil {
		t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
	}
}
// TestRepairBrokenKubeletProxiesRestartsSchedulableNode runs
// repairBrokenKubeletProxies against a stub where titan-07 is Ready and
// schedulable but its first kubelet proxy health probe fails with a 502.
// Signature: TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T).
// Why: Jenkins agents depend on apiserver-to-kubelet exec; a Ready node with a
// broken proxy needs a narrow k3s-agent restart, not a broad cluster restart.
func TestRepairBrokenKubeletProxiesRestartsSchedulableNode(t *testing.T) {
	cfg := config.Config{SSHManagedNodes: []string{"titan-07"}}
	orch := buildOrchestratorWithStubs(t, cfg, nil)
	healthChecks := 0
	cordoned := false
	restarted := false
	waited := false
	uncordoned := false
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
			return `{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":false},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
			// Only the first probe fails; later probes report a healthy proxy.
			healthChecks++
			if healthChecks == 1 {
				return "proxy error from 127.0.0.1:6443 while dialing 192.168.22.33:10250, code 502: 502 Bad Gateway", errors.New("exit status 1")
			}
			return "ok", nil
		// "uncordon" must be matched before "cordon": the former contains the latter.
		case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
			uncordoned = true
			return "", nil
		case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
			cordoned = true
			return "", nil
		case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
			restarted = true
			return "", nil
		case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
			waited = true
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)

	repaired, err := orch.repairBrokenKubeletProxies(context.Background())
	if err != nil {
		t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
	}
	if repaired != 1 {
		t.Fatalf("expected one repaired node, got %d", repaired)
	}

	// An ordered slice (not a map) keeps failure output deterministic, and
	// t.Errorf reports every missing action instead of an arbitrary first one.
	steps := []struct {
		name string
		done bool
	}{
		{"cordoned", cordoned},
		{"restarted", restarted},
		{"waited", waited},
		{"uncordoned", uncordoned},
	}
	for _, step := range steps {
		if !step.done {
			t.Errorf("expected %s action", step.name)
		}
	}
	if healthChecks != 2 {
		t.Fatalf("expected health check before and after repair, got %d", healthChecks)
	}
}
// TestRepairBrokenKubeletProxiesPreservesExistingCordon runs one orchestration or CLI step.
// Signature: TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T).
// Why: nodes intentionally kept out of service must not be accidentally
// uncordoned just because Ananke repaired their kubelet proxy.
func TestRepairBrokenKubeletProxiesPreservesExistingCordon(t *testing.T) {
cfg := config.Config{SSHManagedNodes: []string{"titan-18"}}
orch := buildOrchestratorWithStubs(t, cfg, nil)
healthChecks := 0
cordonTouched := false
restarted := false
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
joined := strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-18"},"spec":{"unschedulable":true},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-18/proxy/healthz"):
healthChecks++
if healthChecks == 1 {
return "error trying to reach service: proxy error from 127.0.0.1:6443 while dialing 192.168.22.46:10250", errors.New("exit status 1")
}
return "ok", nil
case name == "kubectl" && (strings.Contains(joined, "cordon titan-18") || strings.Contains(joined, "uncordon titan-18")):
cordonTouched = true
return "", nil
case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
restarted = true
return "", nil
case name == "kubectl" && strings.Contains(joined, "wait node/titan-18 --for=condition=Ready --timeout=120s"):
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if err != nil {
t.Fatalf("repairBrokenKubeletProxies failed: %v", err)
}
if repaired != 1 {
t.Fatalf("expected one repaired node, got %d", repaired)
}
if !restarted {
t.Fatalf("expected k3s-agent restart")
}
if cordonTouched {
t.Fatalf("did not expect cordon state to change for already-unschedulable node")
}
}
// Revision timestamp carried over from the history view: 2026-05-17 04:40:17 -03:00
// TestRepairBrokenKubeletProxiesFailureBranches runs one orchestration or CLI step.
// Signature: TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T).
// Why: auto-repair must report exact blockers without broadening into unsafe
// node restarts for unrelated kubectl or access failures.
func TestRepairBrokenKubeletProxiesFailureBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if err != nil || repaired != 0 {
t.Fatalf("expected dry-run skip, got repaired=%d err=%v", repaired, err)
}
})
t.Run("node query error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("kubectl", "get nodes -o json"), err: errors.New("api down")},
})
_, err := orch.repairBrokenKubeletProxies(context.Background())
if err == nil || !strings.Contains(err.Error(), "query nodes") {
t.Fatalf("expected node query error, got %v", err)
}
})
t.Run("node json decode error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
})
_, err := orch.readyNodeCandidates(context.Background())
if err == nil || !strings.Contains(err.Error(), "decode nodes") {
t.Fatalf("expected decode error, got %v", err)
}
})
t.Run("non repairable health error is reported", func(t *testing.T) {
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
{cmd: "health", out: "forbidden", err: errors.New("forbidden")},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy health check") {
t.Fatalf("expected health check error, repaired=%d err=%v", repaired, err)
}
})
t.Run("unmanaged broken node is reported", func(t *testing.T) {
cfg := config.Config{SSHManagedNodes: []string{"other"}}
orch := kubeletProxyRepairStub(t, cfg, false, []kubeletProxyRepairAction{
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "not SSH-managed") {
t.Fatalf("expected unmanaged node error, repaired=%d err=%v", repaired, err)
}
})
t.Run("cordon failure is reported", func(t *testing.T) {
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
{cmd: "cordon", err: errors.New("cordon denied")},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "cordon before kubelet restart") {
t.Fatalf("expected cordon error, repaired=%d err=%v", repaired, err)
}
})
t.Run("restart failure uncordons schedulable node", func(t *testing.T) {
uncordoned := false
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
{cmd: "cordon"},
{cmd: "restart", err: errors.New("sudo rejected")},
{cmd: "uncordon", sideEffect: func() { uncordoned = true }},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "restart k3s-agent") {
t.Fatalf("expected restart error, repaired=%d err=%v", repaired, err)
}
if !uncordoned {
t.Fatalf("expected best-effort uncordon after restart failure")
}
})
t.Run("wait failure is reported", func(t *testing.T) {
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
{cmd: "cordon"},
{cmd: "restart"},
{cmd: "wait", err: errors.New("not ready")},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "wait Ready") {
t.Fatalf("expected wait error, repaired=%d err=%v", repaired, err)
}
})
t.Run("post restart health failure is reported", func(t *testing.T) {
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
{cmd: "cordon"},
{cmd: "restart"},
{cmd: "wait"},
{cmd: "health", out: "failed to find Session for client", err: errors.New("exit status 1")},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "proxy still broken") {
t.Fatalf("expected post-restart health error, repaired=%d err=%v", repaired, err)
}
})
t.Run("uncordon failure is reported", func(t *testing.T) {
orch := kubeletProxyRepairStub(t, config.Config{}, false, []kubeletProxyRepairAction{
{cmd: "health", out: "502 bad gateway", err: errors.New("exit status 1")},
{cmd: "cordon"},
{cmd: "restart"},
{cmd: "wait"},
{cmd: "health", out: "ok"},
{cmd: "uncordon", err: errors.New("uncordon denied")},
})
repaired, err := orch.repairBrokenKubeletProxies(context.Background())
if repaired != 0 || err == nil || !strings.Contains(err.Error(), "uncordon after kubelet proxy repair") {
t.Fatalf("expected uncordon error, repaired=%d err=%v", repaired, err)
}
})
}
// kubeletProxyRepairAction is one scripted step for the dispatcher built by
// kubeletProxyRepairStub: the command the stub expects next and the result it
// should hand back for it.
type kubeletProxyRepairAction struct {
// cmd is the expected command label; the stub classifies commands as
// "health", "cordon", "uncordon", "restart", or "wait".
cmd string
// out is the output string returned for the command.
out string
// err is the error returned for the command; nil means success.
err error
// sideEffect, when non-nil, is called just before the result is returned.
sideEffect func()
}
// kubeletProxyRepairStub builds an Orchestrator whose command overrides replay
// a fixed script of kubelet proxy repair commands for node titan-07.
// Signature: kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator.
// Why: keeps the kubelet proxy repair branch tests readable while preserving
// strict command order for safety-sensitive node operations.
//
// The node list query is always answered with a single Ready titan-07 whose
// spec.unschedulable is set from unschedulable. Health, cordon, uncordon,
// restart, and wait commands must arrive in exactly the order given by actions;
// a mismatch or an extra command fails the test immediately. Commands the stub
// does not recognise succeed with empty output. When the test finishes without
// another failure, any scripted action that was never issued is reported too.
func kubeletProxyRepairStub(t *testing.T, cfg config.Config, unschedulable bool, actions []kubeletProxyRepairAction) *Orchestrator {
	t.Helper()
	orch := buildOrchestratorWithStubs(t, cfg, nil)
	index := 0

	// Strict ordering also means nothing scripted may be skipped. Without this
	// check a repair that stops early would pass as long as the commands it did
	// issue were in order.
	t.Cleanup(func() {
		if !t.Failed() && index < len(actions) {
			t.Errorf("expected %s command was never issued (%d of %d scripted actions consumed)", actions[index].cmd, index, len(actions))
		}
	})

	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(joined, "get nodes -o json") {
			return fmt.Sprintf(`{"items":[{"metadata":{"name":"titan-07"},"spec":{"unschedulable":%t},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, unschedulable), nil
		}
		actual := ""
		switch {
		case name == "kubectl" && strings.Contains(joined, "get --raw /api/v1/nodes/titan-07/proxy/healthz"):
			actual = "health"
		// "uncordon" must be matched before "cordon": the former contains the latter.
		case name == "kubectl" && strings.Contains(joined, "uncordon titan-07"):
			actual = "uncordon"
		case name == "kubectl" && strings.Contains(joined, "cordon titan-07"):
			actual = "cordon"
		case name == "ssh" && strings.Contains(joined, "sudo -n systemctl restart k3s-agent"):
			actual = "restart"
		case name == "kubectl" && strings.Contains(joined, "wait node/titan-07 --for=condition=Ready --timeout=120s"):
			actual = "wait"
		default:
			return "", nil
		}
		if index >= len(actions) {
			t.Fatalf("unexpected %s command after actions exhausted: %s %s", actual, name, joined)
		}
		action := actions[index]
		index++
		if action.cmd != actual {
			t.Fatalf("expected %s command, got %s (%s %s)", action.cmd, actual, name, joined)
		}
		if action.sideEffect != nil {
			action.sideEffect()
		}
		return action.out, action.err
	}
	orch.SetCommandOverrides(dispatch, dispatch)
	return orch
}
// TestKubeletProxyHealthAndRepairableErrorHelpers runs one orchestration or CLI step.
// Signature: TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T).
// Why: helper branches decide whether Ananke restarts a node agent, so both
// positive and negative cases need direct coverage.
func TestKubeletProxyHealthAndRepairableErrorHelpers(t *testing.T) {
t.Run("health error without output is preserved", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get --raw /api/v1/nodes/titan-07/proxy/healthz"),
err: errors.New("network timeout"),
},
})
healthy, err := orch.kubeletProxyHealthy(context.Background(), "titan-07")
if healthy || err == nil || !strings.Contains(err.Error(), "network timeout") {
t.Fatalf("expected raw health error, healthy=%v err=%v", healthy, err)
}
})
for _, tc := range []struct {
name string
err error
want bool
}{
{name: "nil", err: nil, want: false},
{name: "repairable", err: errors.New("502 Bad Gateway while dialing 10250"), want: true},
{name: "not repairable", err: errors.New("forbidden"), want: false},
} {
t.Run(tc.name, func(t *testing.T) {
if got := isRepairableKubeletProxyErr(tc.err); got != tc.want {
t.Fatalf("isRepairableKubeletProxyErr()=%v want %v", got, tc.want)
}
})
}
}
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
err: errors.New("label failed"),
},
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault phase failed"),
},
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("node query failed"),
},
})
err := orch.postStartAutoHeal(context.Background())
if err == nil {
t.Fatalf("expected aggregated error")
}
msg := err.Error()
for _, want := range []string{
"required node labels:",
"vault auto-recovery:",
"dead-node terminating pod cleanup:",
} {
if !strings.Contains(msg, want) {
t.Fatalf("expected %q in %q", want, msg)
}
}
}
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("pod missing is quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault-0 not found"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("phase check error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("phase check failed"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("non-running pod defers", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Pending",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("status parse failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: "garbage",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("already unsealed stays quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":false}`,
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("unseal failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":true}`,
},
{
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
},
{
match: matchContains("kubectl", "vault operator unseal"),
err: errors.New("exec boom"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
}
})
}
// TestRerunVaultK8sAuthConfigJobBranches calls rerunVaultK8sAuthConfigJob once
// with the runner in dry-run mode and once with the job creation command
// stubbed to fail.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
	ctx := context.Background()

	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true

		err := orch.rerunVaultK8sAuthConfigJob(ctx)
		if err != nil {
			t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
		}
	})

	t.Run("create error surfaces", func(t *testing.T) {
		createFails := commandStub{
			match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
			err:   errors.New("create failed"),
		}
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{createFails})

		err := orch.rerunVaultK8sAuthConfigJob(ctx)
		if err != nil && strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
			return
		}
		t.Fatalf("expected create-job error, got %v", err)
	})
}