ananke/internal/cluster/orchestrator_autorepair_cleanup_test.go

package cluster

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
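
// The stub scaffolding used throughout this file (buildOrchestratorWithStubs,
// commandStub, and matchContains) is defined elsewhere in the package. A rough
// sketch of the assumed shape, for orientation only:
//
//	// commandStub answers one external command with canned output or an error.
//	type commandStub struct {
//		match func(name string, args ...string) bool
//		out   string
//		err   error
//	}
//
// matchContains(binary, fragment) is assumed to build a matcher that fires when the
// command is binary and its joined arguments contain fragment, and
// buildOrchestratorWithStubs(t, cfg, stubs) is assumed to return an *Orchestrator
// whose runner serves commands from stubs instead of executing them.
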
// TestCleanupTerminatingPodsOnUnavailableNodesBranches covers every branch of
// cleanupTerminatingPodsOnUnavailableNodes: dry run, selective deletion with
// not-found tolerance, node query and pod decode failures, and hard delete errors.
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil || count != 0 {
			t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
		}
	})
	t.Run("selective cleanup tolerates not found", func(t *testing.T) {
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
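		// With DeadNodeCleanupGraceSeconds set to 300 below, oldDelete falls well
		// outside the grace window while recentDelete is still inside it, so only
		// the pod stamped with oldDelete should qualify for forced deletion.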
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out: `{"items":[` +
					`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
					`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("pod old-stale not found"),
			},
		})
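		// Expected outcome per fixture pod: old-stale is past the grace window on an
		// unavailable node and is force-deleted, with the stubbed "not found" error
		// tolerated; fresh-stale is still inside the grace window; healthy-node does
		// not sit on an unavailable node; no-delete has no deletionTimestamp. Only
		// old-stale should be counted.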
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil {
			t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
		}
		if count != 1 {
			t.Fatalf("expected one cleaned pod, got %d", count)
		}
	})
	t.Run("query and decode errors surface", func(t *testing.T) {
		queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				err:   errors.New("nodes failed"),
			},
		})
		if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
			t.Fatalf("expected node query error, got %v", err)
		}
		decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{bad json`,
			},
		})
		if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
			t.Fatalf("expected pod decode error, got %v", err)
		}
	})
	t.Run("delete hard error surfaces", func(t *testing.T) {
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("delete failed"),
			},
		})
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
			t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
		}
	})
}
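
// The subtests above fix the cleanup rule as observed from outside: a pod is
// force-deleted only when its node is unavailable, it carries a deletionTimestamp,
// and that timestamp is older than Startup.DeadNodeCleanupGraceSeconds. An
// illustrative sketch of that predicate (not the production code):
//
//	stale := pod.DeletionTimestamp != "" &&
//		time.Since(deletedAt) > time.Duration(grace)*time.Second
//	if stale && unavailable[pod.Spec.NodeName] {
//		// kubectl -n <namespace> delete pod <name> --grace-period=0 --force --wait=false
//	}
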
// TestUnavailableNodeSetBranches covers the decode-error and missing-Ready-condition
// branches of unavailableNodeSet.
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
	t.Run("decode error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
		})
		if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})
	t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
		})
		nodes, err := orch.unavailableNodeSet(context.Background())
		if err != nil {
			t.Fatalf("unavailableNodeSet failed: %v", err)
		}
		if _, ok := nodes["titan-22"]; !ok {
			t.Fatalf("expected titan-22 to be treated as unavailable")
		}
		if _, ok := nodes["titan-07"]; ok {
			t.Fatalf("did not expect titan-07 to be treated as unavailable")
		}
	})
}
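
// Combined with the cleanup tests, the availability rule asserted here is: a node is
// available only when it reports a Ready condition with status "True"; a missing Ready
// condition, or Ready reported as "Unknown" or "False", marks it unavailable. A minimal
// sketch of that check, assuming the decoded node shape used in these fixtures:
//
//	available := false
//	for _, cond := range node.Status.Conditions {
//		if cond.Type == "Ready" && cond.Status == "True" {
//			available = true
//		}
//	}
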
// TestRequestFluxReconcileBranches covers the dry-run, annotate-failure, and
// best-effort branches of requestFluxReconcile.
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
		}
	})
	t.Run("git source annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
			t.Fatalf("expected gitrepository annotate error, got %v", err)
		}
	})
	t.Run("kustomization annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				out:   "",
			},
			{
				match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
			t.Fatalf("expected kustomization annotate error, got %v", err)
		}
	})
	t.Run("helm annotate warning and flux command path", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")
		kubectlPath := filepath.Join(tmpDir, "kubectl")
		fluxPath := filepath.Join(tmpDir, "flux")
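		// The fake binaries below shadow the real ones via PATH: the kubectl script
		// records every invocation and fails only the helmreleases annotate call,
		// while the flux script records its invocation and exits 0, so the helm
		// failure must be absorbed as a warning rather than returned as an error.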
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"case \"$*\" in\n" +
			" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
			"esac\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
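		// The Orchestrator is assembled by hand here rather than via
		// buildOrchestratorWithStubs so that its execx.Runner shells out for real and
		// resolves kubectl and flux to the fake binaries placed on PATH above.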
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		logText := string(calls)
		if !strings.Contains(logText, "annotate gitrepository flux-system") {
			t.Fatalf("expected gitrepository annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
			t.Fatalf("expected kustomization annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected flux reconcile command, got %q", logText)
		}
	})
	t.Run("flux command failure is tolerated", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")
		kubectlPath := filepath.Join(tmpDir, "kubectl")
		fluxPath := filepath.Join(tmpDir, "flux")
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 1\n"
		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
		}
	})
}
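
// Taken together, these subtests pin down requestFluxReconcile's externally visible
// behavior: annotating the flux-system GitRepository and the Kustomizations with
// reconcile.fluxcd.io/requestedAt= must succeed, while the HelmRelease annotation and
// the `flux reconcile source git flux-system -n flux-system --timeout=60s` refresh are
// best-effort and may fail without failing the call.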