package cluster

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestCleanupTerminatingPodsOnUnavailableNodesBranches covers the branch paths of
// cleanupTerminatingPodsOnUnavailableNodes: dry-run skip, selective deletion, and error surfacing.
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil || count != 0 {
			t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
		}
	})

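	// Fixture timing: old-stale was deleted 10 minutes ago, well past the 300s
	// grace window configured below, while fresh-stale is only 2 minutes old,
	// so only old-stale should qualify for force deletion.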
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
|
||
|
|
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||
|
|
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
|
||
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
||
|
|
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
|
||
|
|
}, []commandStub{
|
||
|
|
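			// titan-22 reports Ready=Unknown and titan-07 is healthy, so only
			// titan-22 lands in the unavailable-node set; healthy-node below is
			// pinned to titan-18, which is absent from that set, so it should be
			// skipped despite its old deletion timestamp.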
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out: `{"items":[` +
					`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
					`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
			},
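			// The force delete reports "not found"; an already-gone pod should
			// count as cleaned rather than fail the sweep.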
			{
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("pod old-stale not found"),
			},
		})

		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil {
			t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
		}
		if count != 1 {
			t.Fatalf("expected one cleaned pod, got %d", count)
		}
	})

t.Run("query and decode errors surface", func(t *testing.T) {
|
||
|
|
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "get nodes -o json"),
|
||
|
|
err: errors.New("nodes failed"),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
|
||
|
|
t.Fatalf("expected node query error, got %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
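		// A valid node payload followed by malformed pod JSON should surface a
		// pod decode error rather than a node error.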
		decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{bad json`,
			},
		})
		if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
			t.Fatalf("expected pod decode error, got %v", err)
		}
	})

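	// Unlike the "not found" case above, a genuine delete failure must
	// propagate and leave the cleaned count at zero.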
t.Run("delete hard error surfaces", func(t *testing.T) {
|
||
|
|
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
||
|
|
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
|
||
|
|
}, []commandStub{
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "get nodes -o json"),
|
||
|
|
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "get pods -A -o json"),
|
||
|
|
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
|
||
|
|
err: errors.New("delete failed"),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
|
||
|
|
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
||
|
|
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
|
||
|
|
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestUnavailableNodeSetBranches covers the Ready-condition parsing behind
// unavailableNodeSet.
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
	t.Run("decode error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
		})
		if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})

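	// A node whose conditions carry no Ready entry cannot be confirmed healthy,
	// so it should be treated as unavailable.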
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
|
||
|
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "get nodes -o json"),
|
||
|
|
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
|
||
|
|
},
|
||
|
|
})
|
||
|
|
nodes, err := orch.unavailableNodeSet(context.Background())
|
||
|
|
if err != nil {
|
||
|
|
t.Fatalf("unavailableNodeSet failed: %v", err)
|
||
|
|
}
|
||
|
|
if _, ok := nodes["titan-22"]; !ok {
|
||
|
|
t.Fatalf("expected titan-22 to be treated as unavailable")
|
||
|
|
}
|
||
|
|
if _, ok := nodes["titan-07"]; ok {
|
||
|
|
t.Fatalf("did not expect titan-07 to be treated as unavailable")
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestRequestFluxReconcileBranches covers the annotate and flux CLI paths of
// requestFluxReconcile.
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
		}
	})

t.Run("git source annotate error surfaces", func(t *testing.T) {
|
||
|
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
|
||
|
|
err: errors.New("annotate failed"),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
|
||
|
|
t.Fatalf("expected gitrepository annotate error, got %v", err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
|
||
|
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
|
||
|
|
out: "",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
|
||
|
|
err: errors.New("annotate failed"),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
|
||
|
|
t.Fatalf("expected kustomization annotate error, got %v", err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
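	// This subtest swaps fake kubectl/flux shell scripts onto PATH so the real
	// runner is exercised end to end. The fake kubectl fails only the
	// helmrelease annotate call, which should be tolerated as a warning.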
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
|
||
|
|
tmpDir := t.TempDir()
|
||
|
|
callLog := filepath.Join(tmpDir, "calls.log")
|
||
|
|
kubectlPath := filepath.Join(tmpDir, "kubectl")
|
||
|
|
fluxPath := filepath.Join(tmpDir, "flux")
|
||
|
|
|
||
|
|
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"case \"$*\" in\n" +
			" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
			"esac\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"

		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
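		// Prepending tmpDir to PATH lets the fake binaries shadow any real
		// kubectl/flux for the rest of this test.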
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))

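		// Build a real Orchestrator (no command stubs) so requestFluxReconcile
		// presumably shells out to the fake binaries through the default runner.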
		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}

		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		logText := string(calls)
		if !strings.Contains(logText, "annotate gitrepository flux-system") {
			t.Fatalf("expected gitrepository annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
			t.Fatalf("expected kustomization annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected flux reconcile command, got %q", logText)
		}
	})

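	// The flux CLI refresh is best-effort: the kubectl annotations succeed, the
	// fake flux exits non-zero, and requestFluxReconcile should still return nil.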
t.Run("flux command failure is tolerated", func(t *testing.T) {
|
||
|
|
tmpDir := t.TempDir()
|
||
|
|
callLog := filepath.Join(tmpDir, "calls.log")
|
||
|
|
kubectlPath := filepath.Join(tmpDir, "kubectl")
|
||
|
|
fluxPath := filepath.Join(tmpDir, "flux")
|
||
|
|
|
||
|
|
kubectlScript := "#!/bin/sh\n" +
|
||
|
|
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
||
|
|
"exit 0\n"
|
||
|
|
fluxScript := "#!/bin/sh\n" +
|
||
|
|
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
||
|
|
"exit 1\n"
|
||
|
|
|
||
|
|
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
|
||
|
|
t.Fatalf("write fake kubectl: %v", err)
|
||
|
|
}
|
||
|
|
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
|
||
|
|
t.Fatalf("write fake flux: %v", err)
|
||
|
|
}
|
||
|
|
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
|
||
|
|
|
||
|
|
		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}

		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
		}
	})
}