package cluster

import (
	"context"
	"encoding/base64"
	"errors"
	"io"
	"log"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
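
// All tests in this file stub command execution the same way: a dispatch
// func with the execx command signature is installed via
// SetCommandOverrides, and each fake kubectl call is selected by
// substring-matching the space-joined argument list. The later tests lean
// on the package's commandStub/matchContains/buildOrchestratorWithStubs
// helpers, which wrap the same idea. The shape, as a minimal sketch
// (illustrative only, not the real helper code):
//
//	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
//		if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get nodes -o json") {
//			return `{"items":[]}`, nil
//		}
//		return "", nil // anything unmatched succeeds silently
//	}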

// TestPostStartAutoHealRepairsVaultAndUnavailableNodes exercises the full
// post-start repair pass against a faked unhealthy cluster.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
	cfg := config.Config{
		Startup: config.Startup{
			DeadNodeCleanupGraceSeconds: 300,
			RequiredNodeLabels: map[string]map[string]string{
				"titan-07": {"node-role.kubernetes.io/worker": "true"},
			},
		},
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
		log:    log.New(io.Discard, "", 0),
	}

	// Ten minutes old: comfortably past the 300-second cleanup grace period.
	oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
	unsealCalls := 0
	jobCreated := false
	reconciled := false
	deleted := map[string]bool{}
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		if name != "kubectl" {
			return "", nil
		}
		joined := strings.Join(args, " ")
		switch {
		case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
			return "", nil
		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			return "Running", nil
		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
			// Report sealed until the first unseal attempt lands, then unsealed.
			if unsealCalls == 0 {
				return `{"initialized":true,"sealed":true}`, nil
			}
			return `{"initialized":true,"sealed":false}`, nil
		case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
			return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
		case strings.Contains(joined, "vault operator unseal"):
			unsealCalls++
			return "", nil
		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
			jobCreated = true
			return "", nil
		case strings.Contains(joined, "get nodes -o json"):
			// titan-22 is unavailable (Ready=Unknown); titan-07 is healthy.
			return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
		case strings.Contains(joined, "get pods -A -o json"):
			return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
		case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
			deleted["maintenance/stale-pod"] = true
			return "", nil
		case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
			reconciled = true
			return "", nil
		case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
			return "", nil
		case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)

	if err := orch.postStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("postStartAutoHeal failed: %v", err)
	}
	if unsealCalls != 1 {
		t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
	}
	if !jobCreated {
		t.Fatalf("expected vault k8s auth config job to be created")
	}
	if !deleted["maintenance/stale-pod"] {
		t.Fatalf("expected stale unavailable-node pod to be deleted")
	}
	if !reconciled {
		t.Fatalf("expected flux reconcile request after repairs")
	}
	if deleted["logging/healthy-node-pod"] {
		t.Fatalf("did not expect terminating pod on healthy node to be deleted")
	}
}
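
// The fixture above pins the cleanup contract: a terminating pod is only
// force-deleted when its deletionTimestamp is older than the configured
// grace window AND its node is unavailable. A sketch of the presumed cutoff
// check (illustrative names, not the production code):
//
//	cutoff := time.Now().UTC().Add(-time.Duration(cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second)
//	if deletedAt.Before(cutoff) && unavailableNodes[pod.Spec.NodeName] {
//		// kubectl -n <ns> delete pod <name> --grace-period=0 --force --wait=false
//	}
//
// That is why healthy-node-pod survives despite carrying the same stale
// deletionTimestamp: titan-18 never appears as unavailable.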

// TestPostStartAutoHealSkipsWhenClusterIsHealthy verifies the repair pass
// stays idle when nothing is broken.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
	cfg := config.Config{
		Startup: config.Startup{
			DeadNodeCleanupGraceSeconds: 300,
		},
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
		log:    log.New(io.Discard, "", 0),
	}

	unsealCalls := 0
	jobCreated := false
	reconciled := false
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		if name != "kubectl" {
			return "", nil
		}
		joined := strings.Join(args, " ")
		switch {
		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			return "Running", nil
		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
			return `{"initialized":true,"sealed":false}`, nil
		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
			jobCreated = true
			return "", nil
		case strings.Contains(joined, "vault operator unseal"):
			unsealCalls++
			return "", nil
		case strings.Contains(joined, "get nodes -o json"):
			return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
		case strings.Contains(joined, "get pods -A -o json"):
			return `{"items":[]}`, nil
		case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
			reconciled = true
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)

	if err := orch.postStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("postStartAutoHeal failed: %v", err)
	}
	if unsealCalls != 0 {
		t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
	}
	if jobCreated {
		t.Fatalf("did not expect vault auth config job creation")
	}
	if reconciled {
		t.Fatalf("did not expect flux reconcile request for healthy cluster")
	}
}

// TestRunPostStartAutoHealDryRun verifies the exported wrapper under dry-run.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
	orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
	orch.runner.DryRun = true

	if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
	}
}
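
// No stubs are registered above on purpose: with runner.DryRun set, the
// exported wrapper is expected to hit the top-level dry-run guard and
// return nil before it ever needs a real (or faked) kubectl.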

// TestPostStartAutoHealAggregatesErrors verifies failed sub-repairs are
// reported together.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
	cfg := config.Config{
		Startup: config.Startup{
			DeadNodeCleanupGraceSeconds: 300,
			RequiredNodeLabels: map[string]map[string]string{
				"titan-07": {"node-role.kubernetes.io/worker": "true"},
			},
		},
	}
	orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
		{
			match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
			err:   errors.New("label failed"),
		},
		{
			match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
			err:   errors.New("vault phase failed"),
		},
		{
			match: matchContains("kubectl", "get nodes -o json"),
			err:   errors.New("node query failed"),
		},
	})

	err := orch.postStartAutoHeal(context.Background())
	if err == nil {
		t.Fatalf("expected aggregated error")
	}
	msg := err.Error()
	for _, want := range []string{
		"required node labels:",
		"vault auto-recovery:",
		"dead-node terminating pod cleanup:",
	} {
		if !strings.Contains(msg, want) {
			t.Fatalf("expected %q in %q", want, msg)
		}
	}
}
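
// The three prefixes asserted above imply each sub-repair wraps its failure
// with a stage label before the results are combined, along these lines
// (an illustrative sketch with hypothetical names, not the actual code):
//
//	var errs []error
//	if err := ensureRequiredNodeLabels(ctx); err != nil {
//		errs = append(errs, fmt.Errorf("required node labels: %w", err))
//	}
//	// ... same for vault auto-recovery and dead-node cleanup ...
//	return errors.Join(errs...)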

// TestAutoRecoverSealedVaultBranches covers every branch of the sealed-Vault
// recovery helper.
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
		}
	})

	t.Run("pod missing is quiet", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				err:   errors.New("vault-0 not found"),
			},
		})

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
		}
	})

	t.Run("phase check error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				err:   errors.New("phase check failed"),
			},
		})

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
			t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
		}
	})

	t.Run("non-running pod defers", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Pending",
			},
		})

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
		}
	})

	t.Run("status parse failure surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Running",
			},
			{
				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
				out:   "garbage",
			},
		})

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
			t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
		}
	})

	t.Run("already unsealed stays quiet", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Running",
			},
			{
				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
				out:   `{"sealed":false}`,
			},
		})

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
		}
	})

	t.Run("unseal failure surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Running",
			},
			{
				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
				out:   `{"sealed":true}`,
			},
			{
				match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
				out:   base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
			},
			{
				match: matchContains("kubectl", "vault operator unseal"),
				err:   errors.New("exec boom"),
			},
		})

		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
			t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
		}
	})
}
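
// Read together, the branches above pin down the helper's check order:
// dry-run guard, vault-0 pod phase, status fetch and parse, sealed flag,
// unseal-key secret, then the unseal attempt itself. Steps with nothing to
// do return (false, nil). Note that a "not found" phase-check error skips
// quietly while other phase-check failures surface, which suggests the
// helper matches on the error text (an inference from the two subtests,
// not a documented contract).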

// TestRerunVaultK8sAuthConfigJobBranches covers the dry-run and create-error
// branches of the post-unseal auth job rerun.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
			t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
		}
	})

	t.Run("create error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
				err:   errors.New("create failed"),
			},
		})
		err := orch.rerunVaultK8sAuthConfigJob(context.Background())
		if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
			t.Fatalf("expected create-job error, got %v", err)
		}
	})
}