ananke/internal/cluster/orchestrator_autorepair_cleanup_test.go

package cluster

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
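
// The stub scaffolding used throughout this file (buildOrchestratorWithStubs,
// commandStub, and matchContains) is defined elsewhere in the package. A rough
// sketch of the assumed shape, for orientation only:
//
//	// commandStub answers one external command with canned output or an error.
//	type commandStub struct {
//		match func(name string, args ...string) bool
//		out   string
//		err   error
//	}
//
// matchContains(binary, fragment) is assumed to build a matcher that fires when the
// command is binary and its joined arguments contain fragment, and
// buildOrchestratorWithStubs(t, cfg, stubs) is assumed to return an *Orchestrator
// whose runner serves commands from stubs instead of executing them.
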
// TestCleanupTerminatingPodsOnUnavailableNodesBranches covers every branch of
// cleanupTerminatingPodsOnUnavailableNodes: dry run, selective deletion with
// not-found tolerance, node query and pod decode failures, and hard delete errors.
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil || count != 0 {
			t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
		}
	})
	t.Run("selective cleanup tolerates not found", func(t *testing.T) {
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
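		// With DeadNodeCleanupGraceSeconds set to 300 below, oldDelete falls well
		// outside the grace window while recentDelete is still inside it, so only
		// the pod stamped with oldDelete should qualify for forced deletion.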
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out: `{"items":[` +
					`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
					`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("pod old-stale not found"),
			},
		})
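		// Expected outcome per fixture pod: old-stale is past the grace window on an
		// unavailable node and is force-deleted, with the stubbed "not found" error
		// tolerated; fresh-stale is still inside the grace window; healthy-node does
		// not sit on an unavailable node; no-delete has no deletionTimestamp. Only
		// old-stale should be counted.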
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil {
			t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
		}
		if count != 1 {
			t.Fatalf("expected one cleaned pod, got %d", count)
		}
	})
	t.Run("query and decode errors surface", func(t *testing.T) {
		queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				err:   errors.New("nodes failed"),
			},
		})
		if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
			t.Fatalf("expected node query error, got %v", err)
		}
		decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{bad json`,
			},
		})
		if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
			t.Fatalf("expected pod decode error, got %v", err)
		}
	})
	t.Run("delete hard error surfaces", func(t *testing.T) {
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("delete failed"),
			},
		})
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
			t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
		}
	})
}
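
// The subtests above fix the cleanup rule as observed from outside: a pod is
// force-deleted only when its node is unavailable, it carries a deletionTimestamp,
// and that timestamp is older than Startup.DeadNodeCleanupGraceSeconds. An
// illustrative sketch of that predicate (not the production code):
//
//	stale := pod.DeletionTimestamp != "" &&
//		time.Since(deletedAt) > time.Duration(grace)*time.Second
//	if stale && unavailable[pod.Spec.NodeName] {
//		// kubectl -n <namespace> delete pod <name> --grace-period=0 --force --wait=false
//	}
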
// TestUnavailableNodeSetBranches covers the decode-error and missing-Ready-condition
// branches of unavailableNodeSet.
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
	t.Run("decode error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
		})
		if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})
	t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
		})
		nodes, err := orch.unavailableNodeSet(context.Background())
		if err != nil {
			t.Fatalf("unavailableNodeSet failed: %v", err)
		}
		if _, ok := nodes["titan-22"]; !ok {
			t.Fatalf("expected titan-22 to be treated as unavailable")
		}
		if _, ok := nodes["titan-07"]; ok {
			t.Fatalf("did not expect titan-07 to be treated as unavailable")
		}
	})
}
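
// Combined with the cleanup tests, the availability rule asserted here is: a node is
// available only when it reports a Ready condition with status "True"; a missing Ready
// condition, or Ready reported as "Unknown" or "False", marks it unavailable. A minimal
// sketch of that check, assuming the decoded node shape used in these fixtures:
//
//	available := false
//	for _, cond := range node.Status.Conditions {
//		if cond.Type == "Ready" && cond.Status == "True" {
//			available = true
//		}
//	}
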
// TestRequestFluxReconcileBranches covers the dry-run, annotate-failure, and
// best-effort branches of requestFluxReconcile.
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
		}
	})
	t.Run("git source annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
			t.Fatalf("expected gitrepository annotate error, got %v", err)
		}
	})
	t.Run("kustomization annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				out:   "",
			},
			{
				match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
			t.Fatalf("expected kustomization annotate error, got %v", err)
		}
	})
	t.Run("helm annotate warning and flux command path", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")
		kubectlPath := filepath.Join(tmpDir, "kubectl")
		fluxPath := filepath.Join(tmpDir, "flux")
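		// The fake binaries below shadow the real ones via PATH: the kubectl script
		// records every invocation and fails only the helmreleases annotate call,
		// while the flux script records its invocation and exits 0, so the helm
		// failure must be absorbed as a warning rather than returned as an error.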
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"case \"$*\" in\n" +
			" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
			"esac\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
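		// The Orchestrator is assembled by hand here rather than via
		// buildOrchestratorWithStubs so that its execx.Runner shells out for real and
		// resolves kubectl and flux to the fake binaries placed on PATH above.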
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		logText := string(calls)
		if !strings.Contains(logText, "annotate gitrepository flux-system") {
			t.Fatalf("expected gitrepository annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
			t.Fatalf("expected kustomization annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected flux reconcile command, got %q", logText)
		}
	})
	t.Run("flux command failure is tolerated", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")
		kubectlPath := filepath.Join(tmpDir, "kubectl")
		fluxPath := filepath.Join(tmpDir, "flux")
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 1\n"
		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
		}
	})
}
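
// Taken together, these subtests pin down requestFluxReconcile's externally visible
// behavior: annotating the flux-system GitRepository and the Kustomizations with
// reconcile.fluxcd.io/requestedAt= must succeed, while the HelmRelease annotation and
// the `flux reconcile source git flux-system -n flux-system --timeout=60s` refresh are
// best-effort and may fail without failing the call.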