// File: ananke/testing/orchestrator/hooks_gap_matrix_part2_test.go
package orchestrator
import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module.
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
	t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
		// Each entry pairs a systemd ExecStart line with the endpoint value the
		// parser is expected to extract ("" means no usable endpoint flag).
		tests := []struct {
			line string
			want string
		}{
			{"ExecStart=/usr/local/bin/k3s server", ""},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://db:5432/k3s", "postgres://db:5432/k3s"},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint='postgres://db:5432/k3s' \\", "postgres://db:5432/k3s"},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint = \"postgres://db:5432/k3s\" \\", "="},
			{"X --datastore-endpoint= \"postgres://db:5432/k3s\" ", "postgres://db:5432/k3s"},
		}
		for _, tt := range tests {
			if got := cluster.TestHookParseDatastoreEndpoint(tt.line); got != tt.want {
				t.Fatalf("parseDatastoreEndpoint(%q)=%q want %q", tt.line, got, tt.want)
			}
		}
	})
	t.Run("wait-for-time-sync-strict-timeout", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncMode = "strict"
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		// Every NTPSynchronized probe (local "sh" or remote "ssh") answers "no",
		// so the strict-mode wait must exhaust its budget and return an error.
		neverSynced := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if (name == "sh" || name == "ssh") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value") {
				return "no", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, neverSynced, neverSynced)
		err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict time-sync timeout branch, got %v", err)
		}
	})
	t.Run("wait-for-time-sync-quorum-success", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.ControlPlanes = []string{"titan-db", "titan-23"}
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		cfg.Startup.TimeSyncWaitSeconds = 2
		cfg.Startup.TimeSyncPollSeconds = 1
		// The local node and titan-db report synced while titan-23 never does;
		// with quorum=1 the wait must still succeed.
		probe := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-db") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "no", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, probe, probe)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum time-sync success, got %v", err)
		}
	})
	t.Run("startup-stability-failure-matrix", func(t *testing.T) {
		// Base config disables every optional stability gate; each scenario
		// below re-enables one (or injects failing pod data) to hit its branch.
		baseCfg := lifecycleConfig(t)
		baseCfg.Startup.RequireIngressChecklist = false
		baseCfg.Startup.RequireServiceChecklist = false
		baseCfg.Startup.RequireWorkloadConvergence = false
		baseCfg.Startup.RequireFluxHealth = false

		// Scenario 1: a CrashLoopBackOff container trips the pod-failure gate.
		crashPods := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[{"metadata":{"namespace":"default","name":"bad-pod"},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchCrash, _ := newHookOrchestrator(t, baseCfg, crashPods, crashPods)
		if err := orchCrash.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "pods in crash/image-pull failures") {
			t.Fatalf("expected crashloop stability failure, got %v", err)
		}

		// Scenario 2: flux health required but the kustomization is Ready=False.
		cfgFlux := baseCfg
		cfgFlux.Startup.RequireFluxHealth = true
		fluxNotReady := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"syncing"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchFlux, _ := newHookOrchestrator(t, cfgFlux, fluxNotReady, fluxNotReady)
		if err := orchFlux.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "flux not ready") {
			t.Fatalf("expected flux-not-ready stability failure, got %v", err)
		}

		// Scenario 3: workload convergence required but a deployment is 0/1 ready.
		cfgWork := baseCfg
		cfgWork.Startup.RequireWorkloadConvergence = true
		unconverged := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"default","name":"app"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchWork, _ := newHookOrchestrator(t, cfgWork, unconverged, unconverged)
		if err := orchWork.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "workloads not converged") {
			t.Fatalf("expected workload convergence stability failure, got %v", err)
		}

		// Scenario 4: the service checklist points at an unroutable port so the
		// external-service probe must report unhealthy.
		cfgService := baseCfg
		cfgService.Startup.RequireServiceChecklist = true
		cfgService.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "api", URL: "http://127.0.0.1:1/health", AcceptedStatuses: []int{200}, TimeoutSeconds: 1},
		}
		healthyPods := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchService, _ := newHookOrchestrator(t, cfgService, healthyPods, healthyPods)
		if err := orchService.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "external services not healthy") {
			t.Fatalf("expected service checklist stability failure, got %v", err)
		}
	})
}
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers.
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
	t.Run("flux-helper-matrix", func(t *testing.T) {
		// Pure helper assertions first: immutable-error matching, flux ownership
		// detection by kustomize label, and job failure classification.
		if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
			t.Fatalf("expected immutable matcher true for uppercase+job variant")
		}
		if cluster.TestHookLooksLikeImmutableJobError("totally unrelated error") {
			t.Fatalf("expected immutable matcher false")
		}
		if !cluster.TestHookJobLooksFluxManaged("flux-system", "job-a", map[string]string{"kustomize.toolkit.fluxcd.io/name": "services"}, nil) {
			t.Fatalf("expected flux-managed job by kustomize label")
		}
		if !cluster.TestHookJobFailed(1, 0, []string{"Failed"}, []string{"True"}) {
			t.Fatalf("expected Failed=True to mark job failed")
		}
		// End-to-end: one failed flux-managed job is listed, deleted, and the
		// kustomization then reports Ready=True so the adaptive wait succeeds.
		cfg := lifecycleConfig(t)
		healDispatch := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"job-a","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "delete job -n flux-system job-a"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, healDispatch, healDispatch)
		healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
		if err != nil || !healed {
			t.Fatalf("expected immutable flux job heal success, healed=%t err=%v", healed, err)
		}
		if _, _, err := orch.TestHookAdaptiveFluxHealthWait(context.Background(), 2*time.Second); err != nil {
			t.Fatalf("expected adaptive flux wait success, got %v", err)
		}
	})
	t.Run("scaling-snapshot-branch-matrix", func(t *testing.T) {
		// Restoring with no snapshot recorded is a no-op success.
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected empty snapshot restore success, got %v", err)
		}
		// Point State.Dir at a regular file so creating the state directory
		// fails and the writer surfaces the "ensure state dir" error.
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("write state-file: %v", err)
		}
		cfgWriteErr := lifecycleConfig(t)
		cfgWriteErr.State.Dir = stateFile
		orchWriteErr := cluster.New(cfgWriteErr, &execx.Runner{DryRun: false}, state.New(cfgWriteErr.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orchWriteErr.SetCommandOverrides(dispatch, dispatch)
		if err := orchWriteErr.TestHookWriteScaledWorkloadSnapshot(nil); err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot mkdir failure, got %v", err)
		}
	})
	t.Run("report-sanitize-and-checklist-host-parsers", func(t *testing.T) {
		// Sanitized report names must be non-empty and free of spaces/slashes.
		sanitized := cluster.TestHookSanitizeReportFileName(" Startup / Drill : Night#2 ")
		if sanitized == "" || strings.Contains(sanitized, " ") || strings.Contains(sanitized, "/") {
			t.Fatalf("unexpected sanitized report filename: %q", sanitized)
		}
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		host := orch.TestHookChecklistFailureHost("metrics.bstein.dev: GET https://metrics.bstein.dev/: EOF")
		if host != "metrics.bstein.dev" {
			t.Fatalf("expected checklist failure host extraction, got %q", host)
		}
	})
}
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests.
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
	t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
		// An empty unseal-key path must be rejected with a "path is empty" error.
		cfg := lifecycleConfig(t)
		cfg.Startup.VaultUnsealKeyFile = ""
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookWriteVaultUnsealKeyFile("abc"); err == nil || !strings.Contains(err.Error(), "path is empty") {
			t.Fatalf("expected empty vault key path error")
		}
		if cluster.TestHookIsNotFoundErr("") {
			t.Fatalf("expected nil/notfound helper false on empty input")
		}
		if !cluster.TestHookIsNotFoundErr("resource not found") {
			t.Fatalf("expected notfound helper true for notfound text")
		}
		// A vault-0 pod reported as Pending must block the unseal flow with a
		// pod-phase error.
		pendingPhase := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
				return "Pending", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchPhase, _ := newHookOrchestrator(t, lifecycleConfig(t), pendingPhase, pendingPhase)
		if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
			t.Fatalf("expected vault phase gate error, got %v", err)
		}
	})
	t.Run("coordination-peers-and-snapshot-stat-error", func(t *testing.T) {
		// Peer normalization: whitespace trimmed, blanks dropped, duplicates
		// collapsed, order preserved.
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{" titan-24 ", "titan-24", " ", "titan-jh"}
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		peers := orch.TestHookCoordinationPeers()
		if len(peers) != 2 || peers[0] != "titan-24" || peers[1] != "titan-jh" {
			t.Fatalf("unexpected normalized peers: %v", peers)
		}
		// A failing remote size stat must surface through snapshot verification.
		statFails := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "stat -c %s") {
				return "", errors.New("stat failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchErr, _ := newHookOrchestrator(t, lifecycleConfig(t), statFails, statFails)
		if err := orchErr.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/snap/path"); err == nil || !strings.Contains(err.Error(), "stat failed") {
			t.Fatalf("expected snapshot stat error branch, got %v", err)
		}
	})
}
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification.
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
	t.Run("ignored-node-helper-matrix", func(t *testing.T) {
		// Exercise selector-host, affinity-host, and pod-level ignore matching,
		// including the negative (no ignored host targeted) branches.
		if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
			t.Fatalf("expected selector-host ignored match")
		}
		if cluster.TestHookWorkloadTargetsIgnoredNodes("titan-23", []string{"titan-24"}, []string{"titan-22"}) {
			t.Fatalf("expected workload targets ignored false when no ignored host targeted")
		}
		if !cluster.TestHookWorkloadTargetsIgnoredNodes("", []string{"titan-22"}, []string{"titan-22"}) {
			t.Fatalf("expected affinity host ignored match")
		}
		if !cluster.TestHookPodTargetsIgnoredNode("titan-22", []string{"titan-22"}) {
			t.Fatalf("expected pod ignored-node match")
		}
		if cluster.TestHookPodTargetsIgnoredNode("titan-23", []string{"titan-22"}) {
			t.Fatalf("expected pod ignored-node mismatch")
		}
	})
	t.Run("startup-failure-pods-decode-error-and-success", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Malformed kubectl JSON must surface as a decode error.
		badJSON := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return "{bad json", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchBad, _ := newHookOrchestrator(t, cfg, badJSON, badJSON)
		if _, err := orchBad.TestHookStartupFailurePods(context.Background()); err == nil {
			t.Fatalf("expected startupFailurePods decode error")
		}
		// A single running pod must yield an empty failure list and no error.
		goodJSON := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[{"metadata":{"namespace":"default","name":"ok-pod"},"status":{"containerStatuses":[{"state":{"running":{}}}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchOK, _ := newHookOrchestrator(t, cfg, goodJSON, goodJSON)
		failures, err := orchOK.TestHookStartupFailurePods(context.Background())
		if err != nil || len(failures) != 0 {
			t.Fatalf("expected no startup failures, failures=%v err=%v", failures, err)
		}
	})
	t.Run("stuck-vault-init-reason-matrix", func(t *testing.T) {
		// Running pod with zero elapsed time: no stuck reason expected.
		if got := cluster.TestHookStuckVaultInitReason("Running", true, 0, 10*time.Second); got != "" {
			t.Fatalf("expected no stuck init reason without running init, got %q", got)
		}
		// Pending pod past the threshold must report VaultInitStuck.
		if got := cluster.TestHookStuckVaultInitReason("Pending", true, 30*time.Second, 10*time.Second); !strings.Contains(got, "VaultInitStuck") {
			t.Fatalf("expected stuck vault init reason, got %q", got)
		}
	})
}