package orchestrator

import (
	"context"
	"errors"
	"io"
	"log"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookGapMatrixPart7CoverageClosure exercises one orchestration or CLI
// step per subtest.
// Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in convergence, storage, access,
// flux, lifecycle, and sensitive command wrappers.
func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
|
|
t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
|
cfg.Startup.WorkloadConvergencePollSeconds = 1
|
|
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
|
|
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
|
|
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/skip-me"}
|
|
cfg.Startup.IgnoreFluxKustomizations = []string{"flux-system/ignored"}
|
|
|
|
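		// runReady stubs the command runner: the controller query returns one
		// canned payload covering the ready, ignored-node, ignored-namespace,
		// ignored-kustomization, and ignored-workload branches in a single
		// pass; all other commands fall through to the shared dispatcher.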
		runReady := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
{"kind":"DaemonSet","metadata":{"namespace":"monitoring","name":"node-exporter"},"spec":{"template":{"spec":{"nodeName":"titan-22"}}},"status":{"desiredNumberScheduled":2,"numberReady":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"ignore-ns"},"spec":{"replicas":1},"status":{"readyReplicas":0}},
{"kind":"Deployment","metadata":{"namespace":"flux-system","name":"ignored"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"skip-me"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchReady, _ := newHookOrchestrator(t, cfg, runReady, runReady)
		ready, detail, err := orchReady.TestHookWorkloadConvergenceReady(context.Background())
		if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
			t.Fatalf("expected workload convergence ready branch, ready=%v detail=%q err=%v", ready, detail, err)
		}

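		// A syntactically invalid JSON payload from the controller query must
		// surface as a decode error rather than a readiness verdict.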
		runDecodeErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json") {
				return "{bad-json", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchDecodeErr, _ := newHookOrchestrator(t, cfg, runDecodeErr, runDecodeErr)
		if _, _, err := orchDecodeErr.TestHookWorkloadConvergenceReady(context.Background()); err == nil {
			t.Fatalf("expected workload convergence decode error")
		}

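		// runPending reports one deployment that never becomes ready, so the
		// one-second convergence window configured above must elapse and the
		// wait must fail with "workload convergence not satisfied".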
		runPending := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPending, _ := newHookOrchestrator(t, cfg, runPending, runPending)
		if err := orchPending.TestHookWaitForWorkloadConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "workload convergence not satisfied") {
			t.Fatalf("expected workload convergence timeout branch, got %v", err)
		}

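		// An already-canceled context must short-circuit the wait loop with
		// context.Canceled instead of the timeout error above.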
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		if err := orchPending.TestHookWaitForWorkloadConvergence(cancelCtx); !errors.Is(err, context.Canceled) {
			t.Fatalf("expected canceled convergence wait, got %v", err)
		}

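		// Table-driven negatives: each case flips exactly one precondition of
		// the stuck-vault-init heuristic (wrong phase, injection disabled,
		// different init container, still inside the grace window, container
		// not running), so no stuck reason may be reported. The one input
		// satisfying all preconditions follows and must yield VaultInitStuck.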
		cases := []cluster.TestHookStuckVaultInitInput{
			{Phase: "Running", Inject: true, InitContainerName: "vault-agent-init", Running: true, StartedAtOffsetSec: 600, GraceSeconds: 60},
			{Phase: "Pending", Inject: false, InitContainerName: "vault-agent-init", Running: true, StartedAtOffsetSec: 600, GraceSeconds: 60},
			{Phase: "Pending", Inject: true, InitContainerName: "other-init", Running: true, StartedAtOffsetSec: 600, GraceSeconds: 60},
			{Phase: "Pending", Inject: true, InitContainerName: "vault-agent-init", Running: true, StartedAtOffsetSec: 0, GraceSeconds: 60},
			{Phase: "Pending", Inject: true, InitContainerName: "vault-agent-init", Running: false, StartedAtOffsetSec: 600, GraceSeconds: 60},
		}
		for _, in := range cases {
			if got := cluster.TestHookStuckVaultInitReasonRaw(in); got != "" {
				t.Fatalf("expected no stuck reason for %+v, got %q", in, got)
			}
		}
		if got := cluster.TestHookStuckVaultInitReasonRaw(cluster.TestHookStuckVaultInitInput{
			Phase:              "Pending",
			Inject:             true,
			InitContainerName:  "vault-agent-init",
			Running:            true,
			StartedAtOffsetSec: 600,
			GraceSeconds:       60,
		}); got != "VaultInitStuck" {
			t.Fatalf("expected VaultInitStuck branch, got %q", got)
		}
	})

t.Run("storage-access-and-reachability-branches", func(t *testing.T) {
|
|
t.Run("storage-ready-invalid-entry-and-query-error", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Startup.StorageCriticalPVCs = []string{"bad-entry"}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "get nodes.longhorn.io") {
|
|
return "a:True:True\nb:True:True\n", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
if _, _, err := orch.TestHookStorageReady(context.Background()); err == nil {
|
|
t.Fatalf("expected invalid storage_critical_pvcs entry error")
|
|
}
|
|
|
|
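			// With a well-formed "namespace/name" entry, a failure of the PVC
			// phase query itself must propagate as an error.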
			cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana"}
			runPVCError := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "get nodes.longhorn.io"):
					return "a:True:True\nb:True:True\n", nil
				case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana -o jsonpath={.status.phase}"):
					return "", errors.New("query pvc failed")
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchPVCError, _ := newHookOrchestrator(t, cfg, runPVCError, runPVCError)
			if _, _, err := orchPVCError.TestHookStorageReady(context.Background()); err == nil {
				t.Fatalf("expected pvc query error branch")
			}
		})

t.Run("wait-for-node-ssh-auth-and-inventory-timeouts", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Startup.RequireNodeSSHAuth = true
|
|
cfg.Startup.NodeSSHAuthWaitSeconds = 1
|
|
cfg.Startup.NodeSSHAuthPollSeconds = 1
|
|
cfg.Startup.RequireNodeInventoryReach = true
|
|
cfg.Startup.NodeInventoryReachWaitSeconds = 1
|
|
cfg.Startup.NodeInventoryReachPollSeconds = 1
|
|
|
|
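			// SSH probes that answer with anything other than the expected
			// __ANANKE_ marker output never count as passing, so both
			// one-second waits configured above must time out with
			// "did not pass".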
			runUnexpected := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "ssh" && strings.Contains(command, "__ANANKE_") {
					return "unexpected", nil
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orch, _ := newHookOrchestrator(t, cfg, runUnexpected, runUnexpected)
			if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "did not pass") {
				t.Fatalf("expected ssh-auth timeout on unexpected output, got %v", err)
			}
			if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err == nil || !strings.Contains(err.Error(), "did not pass") {
				t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
			}
		})
	})

t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
|
|
t.Run("sensitive-run-error-shapes", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
|
if _, err := orch.TestHookRunSensitive(context.Background(), 3*time.Second, "sh", "-lc", "exit 1"); err == nil {
|
|
t.Fatalf("expected runSensitive failure without output")
|
|
}
|
|
out, err := orch.TestHookRunSensitive(context.Background(), 3*time.Second, "sh", "-lc", "echo boom; exit 1")
|
|
if err == nil || strings.TrimSpace(out) != "boom" {
|
|
t.Fatalf("expected runSensitive failure with captured output, out=%q err=%v", out, err)
|
|
}
|
|
})
|
|
|
|
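		// The immutable-job healer should delete only the job whose Failed
		// condition is True (job-b) and report that it healed something;
		// job-a, with Complete=False, is left alone.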
t.Run("flux-health-helper-branches", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
|
|
return `{"items":[
|
|
{"metadata":{"namespace":"flux-system","name":"job-a","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Complete","status":"False"}]}},
|
|
{"metadata":{"namespace":"flux-system","name":"job-b","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}}
|
|
]}`, nil
|
|
case name == "kubectl" && strings.Contains(command, "-n flux-system delete job job-b"):
|
|
return "", nil
|
|
default:
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
|
|
if err != nil || !healed {
|
|
t.Fatalf("expected immutable-job heal branch, healed=%v err=%v", healed, err)
|
|
}
|
|
})
|
|
|
|
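		// With a dry-run runner, EtcdRestore must complete without executing
		// anything; separately, Shutdown must reject the removed "poweroff"
		// mode with an explicit error.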
t.Run("lifecycle-etcd-restore-dryrun-and-shutdown-mode", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
dry := cluster.New(cfg, &execx.Runner{DryRun: true}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
|
if err := dry.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{}); err != nil {
|
|
t.Fatalf("expected dry-run etcd restore path, got %v", err)
|
|
}
|
|
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "drill", Mode: "poweroff"})
|
|
if err == nil || !strings.Contains(err.Error(), "has been removed") {
|
|
t.Fatalf("expected removed poweroff mode error, got %v", err)
|
|
}
|
|
})
|
|
|
|
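		// Failing the deployment listing must abort scale-down with a
		// "collect deployments" error, while restoring with no recorded
		// scale-down snapshot is a no-op that succeeds.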
t.Run("scale-down-list-error-and-restore-no-snapshot", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
runListErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath=") {
|
|
return "", errors.New("list deployments failed")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orchListErr, _ := newHookOrchestrator(t, cfg, runListErr, runListErr)
|
|
if err := orchListErr.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "collect deployments") {
|
|
t.Fatalf("expected scaleDownApps list error, got %v", err)
|
|
}
|
|
|
|
orchNoSnapshot, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
|
|
if err := orchNoSnapshot.TestHookRestoreScaledApps(context.Background()); err != nil {
|
|
t.Fatalf("expected restore with missing snapshot to succeed, got %v", err)
|
|
}
|
|
})
|
|
})
|
|
}
|