// File: ananke/testing/orchestrator/hooks_gap_matrix_part4_test.go (450 lines, 19 KiB, Go)
package orchestrator
import (
"context"
"errors"
"io"
"log"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts.
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
	t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Coordination.PeerHosts = []string{"titan-24"}
		conf.SSHManagedNodes = append(conf.SSHManagedNodes, "titan-24")
		conf.SSHNodeHosts["titan-24"] = "titan-24"
		fallback := lifecycleDispatcher(&commandRecorder{})
		stamp := time.Now().UTC().Format(time.RFC3339)
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name != "ssh" || !strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml") {
				return fallback(ctx, timeout, name, args...)
			}
			// Peer reports a shutdown_complete intent stamped "now".
			return "__ANANKE_BOOTSTRAP_IDLE__\nintent=shutdown_complete reason=\"recent\" source=peer updated_at=" + stamp + "\n", nil
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		err := orch.TestHookGuardPeerStartupIntents(context.Background())
		if err == nil || !strings.Contains(err.Error(), "completed shutdown too recently") {
			t.Fatalf("expected shutdown-complete cooldown block, got %v", err)
		}
	})
	t.Run("peer-startup-stale-auto-clears-when-bootstrap-idle", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Coordination.PeerHosts = []string{"titan-24"}
		conf.SSHManagedNodes = append(conf.SSHManagedNodes, "titan-24")
		conf.SSHNodeHosts["titan-24"] = "titan-24"
		conf.Coordination.StartupGuardMaxAgeSec = 30
		staleStamp := time.Now().UTC().Add(-3 * time.Hour).Format(time.RFC3339)
		fallback := lifecycleDispatcher(&commandRecorder{})
		clears := 0
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "ssh" {
				// Order matters: the "--set normal" clear command also matches the
				// broader intent-read substring, so check it first.
				if strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml --set normal") {
					clears++
					return "ok", nil
				}
				if strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml") {
					return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + staleStamp + "\n", nil
				}
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil {
			t.Fatalf("expected stale peer startup intent to auto-clear, got %v", err)
		}
		if clears == 0 {
			t.Fatalf("expected remote stale-intent clear call")
		}
	})
	t.Run("read-peer-parse-error-api-timeout-and-snapshot-size-parse", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Coordination.PeerHosts = []string{"titan-24"}
		conf.SSHManagedNodes = append(conf.SSHManagedNodes, "titan-24")
		conf.SSHNodeHosts["titan-24"] = "titan-24"
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml") {
				// Deliberately malformed intent payload to hit the parse-error path.
				return "__ANANKE_BOOTSTRAP_IDLE__\nnot-an-intent-payload\n", nil
			}
			if name == "kubectl" && strings.Contains(joined, "version --request-timeout=5s") {
				return "", errors.New("api unreachable")
			}
			if name == "ssh" && strings.Contains(joined, "stat -c %s") {
				return "not-a-size", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		if _, err := orch.TestHookReadRemotePeerStatus(context.Background(), "titan-24"); err == nil {
			t.Fatalf("expected parse failure for remote peer intent output")
		}
		if err := orch.TestHookWaitForAPI(context.Background(), 1, 0); err == nil {
			t.Fatalf("expected waitForAPI timeout error")
		}
		err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown")
		if err == nil || !strings.Contains(err.Error(), "parse size") {
			t.Fatalf("expected snapshot size parse error, got %v", err)
		}
	})
	t.Run("inventory-reachability-times-out-on-unexpected-output", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Startup.RequireNodeInventoryReach = true
		conf.Startup.NodeInventoryReachWaitSeconds = 1
		conf.Startup.NodeInventoryReachPollSeconds = 1
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name != "ssh" || !strings.Contains(joined, "__ANANKE_NODE_REACHABLE__") {
				return fallback(ctx, timeout, name, args...)
			}
			// Never emit the reachability sentinel so the poll times out.
			return "unexpected", nil
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "unexpected output") {
			t.Fatalf("expected unexpected-output timeout branch, got %v", err)
		}
	})
}
// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests.
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
	t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},
		}
		conf.Startup.IngressChecklistIgnoreHosts = []string{"ignore.bstein.dev"}
		fallback := lifecycleDispatcher(&commandRecorder{})
		scaled := 0
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "kubectl" {
				if strings.Contains(joined, "get ingress -A -o json") {
					return `{"items":[{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"rules":[{"host":"metrics.bstein.dev"},{"host":"ignore.bstein.dev"}]}}]}`, nil
				}
				if strings.Contains(joined, "get deploy,statefulset -A -o json") {
					return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0},"status":{"readyReplicas":0}}]}`, nil
				}
				if strings.Contains(joined, " scale deployment grafana --replicas=1") {
					scaled++
					return "", nil
				}
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		last := time.Time{}
		// Two back-to-back heal attempts; the cooldown gate should admit only one scale.
		orch.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &last, "metrics: unexpected status code=502")
		orch.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &last, "metrics: still bad")
		if scaled != 1 {
			t.Fatalf("expected one scale call due cooldown gate, got %d", scaled)
		}
		if got := orch.TestHookChecklistFailureHost("metrics: request failed"); got != "metrics.bstein.dev" {
			t.Fatalf("expected mapped host metrics.bstein.dev, got %q", got)
		}
		if got := orch.TestHookChecklistFailureHost("not-a-host detail"); got != "" {
			t.Fatalf("expected empty host for unknown failure prefix, got %q", got)
		}
		if !cluster.TestHookChecklistContains("hello \n world", "HELLO WORLD") {
			t.Fatalf("expected compact checklist matcher branch")
		}
	})
	t.Run("service-check-body-notcontains-and-poststart-timeout", func(t *testing.T) {
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("metrics ready marker"))
		}))
		defer srv.Close()
		conf := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, conf, nil, nil)
		ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
			Name:             "forbidden-marker",
			URL:              srv.URL,
			AcceptedStatuses: []int{200},
			BodyNotContains:  "marker",
			TimeoutSeconds:   2,
		})
		if ok || !strings.Contains(detail, "forbidden marker") {
			t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
		}
		conf = lifecycleConfig(t)
		conf.Startup.PostStartProbeWaitSeconds = 1
		conf.Startup.PostStartProbePollSeconds = 1
		conf.Startup.PostStartProbes = []string{"https://metrics.bstein.dev/health"}
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			probe := name == "curl" || (name == "kubectl" && strings.Contains(joined, "curl"))
			if !probe {
				return fallback(ctx, timeout, name, args...)
			}
			return "500", nil
		}
		orch, _ = newHookOrchestrator(t, conf, override, override)
		err := orch.TestHookWaitForPostStartProbes(context.Background())
		if err == nil || !strings.Contains(err.Error(), "post-start probes did not pass") {
			t.Fatalf("expected post-start timeout branch, got %v", err)
		}
	})
	t.Run("hostname-heuristic-negative-cases", func(t *testing.T) {
		for _, sample := range []string{"", "not-a-host", "metrics bstein dev", "metrics.bstein.dev/path"} {
			if cluster.TestHookIsLikelyHostname(sample) {
				t.Fatalf("expected %q to be treated as non-hostname", sample)
			}
		}
	})
}
// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills.
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
	t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Point ReportsDir at a regular file so any mkdir beneath it must fail.
		reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
		if err := os.WriteFile(reportsFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("create reports file: %v", err)
		}
		cfg.State.ReportsDir = reportsFile
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.TestHookWriteRunRecordArtifact(state.RunRecord{
			ID:        "shutdown-record",
			Action:    "shutdown",
			Reason:    "drill",
			StartedAt: time.Now().UTC(),
			EndedAt:   time.Now().UTC(),
		})
		if err == nil {
			t.Fatalf("expected report archive dir error")
		}
		if err := orch.TestHookWriteStartupReportFile(filepath.Join(reportsFile, "startup.json"), "running"); err == nil {
			t.Fatalf("expected startup report path mkdir error")
		}
		// Progress/report helpers are best-effort; exercise them for coverage only.
		cfg2 := lifecycleConfig(t)
		cfg2.State.Dir = filepath.Join(t.TempDir(), "state")
		cfg2.State.ReportsDir = reportsFile
		orch2, _ := newHookOrchestrator(t, cfg2, nil, nil)
		orch2.TestHookPersistStartupProgress("running")
		orch2.TestHookBeginStartupReport("drill")
		orch2.TestHookFinalizeStartupReport(errors.New("boom"))
	})
	t.Run("scaled-workload-snapshot-write-and-read-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("create state file: %v", err)
		}
		// Nesting the state dir under a regular file makes directory creation fail.
		cfg.State.Dir = filepath.Join(stateFile, "nested")
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orch.SetCommandOverrides(dispatch, dispatch)
		entries, err := orch.TestHookListScalableWorkloads(context.Background())
		if err != nil {
			t.Fatalf("list scalable workloads: %v", err)
		}
		// Guard before slicing: entries[:1] would panic with an opaque
		// index-out-of-range if the dispatcher returned no workloads, masking
		// the real failure mode.
		if len(entries) == 0 {
			t.Fatalf("expected at least one scalable workload entry")
		}
		err = orch.TestHookWriteScaledWorkloadSnapshot(entries[:1])
		if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot state-dir failure, got %v", err)
		}
		cfg2 := lifecycleConfig(t)
		orch2, _ := newHookOrchestrator(t, cfg2, nil, nil)
		snapshotPath := filepath.Join(cfg2.State.Dir, "scaled-workloads.json")
		if err := os.WriteFile(snapshotPath, []byte("{bad"), 0o644); err != nil {
			t.Fatalf("write corrupt snapshot: %v", err)
		}
		if _, err := orch2.TestHookReadScaledWorkloadSnapshot(); err == nil {
			t.Fatalf("expected corrupt snapshot decode error")
		}
	})
	t.Run("storage-and-drain-failure-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.StorageReadyWaitSeconds = 1
		cfg.Startup.StorageReadyPollSeconds = 1
		cfg.Startup.StorageMinReadyNodes = 3
		cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
		cfg.Shutdown.DrainParallelism = 1
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
				// One ready node against a three-node requirement keeps storage not-ready.
				return "titan-23:True:True\ntitan-24:False:False\n", nil
			case name == "kubectl" && strings.Contains(command, " drain titan-23 "):
				return "", errors.New("drain blocked")
			case name == "kubectl" && strings.Contains(command, "--field-selector spec.nodeName=titan-23"):
				return "monitoring grafana-0 Running ReplicaSet\n", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if _, _, err := orch.TestHookStorageReady(context.Background()); err != nil {
			t.Fatalf("expected storageReady non-error not-ready branch, got %v", err)
		}
		if err := orch.TestHookWaitForStorageReady(context.Background()); err == nil {
			t.Fatalf("expected storage readiness timeout")
		}
		err := orch.TestHookDrainWorkers(context.Background(), []string{"titan-23"})
		if err == nil || !strings.Contains(err.Error(), "details:") {
			t.Fatalf("expected drain diagnostics branch, got %v", err)
		}
	})
}
// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices.
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
	t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Startup.TimeSyncMode = "quorum"
		conf.Startup.TimeSyncQuorum = 1
		conf.Startup.TimeSyncWaitSeconds = 1
		conf.Startup.TimeSyncPollSeconds = 1
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			syncQuery := strings.Contains(joined, "timedatectl show -p NTPSynchronized")
			// Local host and titan-db report synced; titan-23 never syncs.
			if name == "sh" && syncQuery {
				return "yes", nil
			}
			if name == "ssh" && syncQuery && strings.Contains(joined, "titan-db") {
				return "yes", nil
			}
			if name == "ssh" && syncQuery && strings.Contains(joined, "titan-23") {
				return "no", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum-mode success, got %v", err)
		}
		confStrict := lifecycleConfig(t)
		confStrict.Startup.TimeSyncMode = "strict"
		confStrict.Startup.TimeSyncWaitSeconds = 1
		confStrict.Startup.TimeSyncPollSeconds = 1
		strictOverride := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if (name == "sh" || name == "ssh") && strings.Contains(joined, "timedatectl show -p NTPSynchronized") {
				return "no", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		orchStrict, _ := newHookOrchestrator(t, confStrict, strictOverride, strictOverride)
		err := orchStrict.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict-mode timesync failure, got %v", err)
		}
	})
	t.Run("validate-inventory-and-access-guard-branches", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.SSHPort = 70000
		conf.SSHManagedNodes = []string{"titan-db"}
		conf.Workers = []string{"titan-23"}
		conf.SSHNodeUsers = map[string]string{}
		conf.SSHNodeHosts["titan-db"] = "bad/host"
		conf.SSHNodeUsers["titan-db"] = "bad user"
		orch, _ := newHookOrchestrator(t, conf, nil, nil)
		if err := orch.TestHookValidateNodeInventory(); err == nil {
			t.Fatalf("expected inventory validation failure")
		}
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(joined, "/usr/bin/systemctl --version") && strings.Contains(joined, "titan-23") {
				return "", errors.New("permission denied")
			}
			if name == "kubectl" && strings.Contains(joined, "jsonpath={.spec.ref.branch}") {
				return "feature/sso", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		conf2 := lifecycleConfig(t)
		orch2, _ := newHookOrchestrator(t, conf2, override, override)
		if err := orch2.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil {
			t.Fatalf("expected reconcileNodeAccess aggregated error")
		}
		if err := orch2.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil {
			t.Fatalf("expected ensureFluxBranch mismatch guard")
		}
	})
	t.Run("lifecycle-restore-and-mode-guard-branches", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.ControlPlanes = []string{"titan-db"}
		conf.SSHManagedNodes = []string{"titan-db"}
		orch, _ := newHookOrchestrator(t, conf, nil, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "not-a-cp"}); err == nil {
			t.Fatalf("expected etcd restore control-plane membership guard")
		}
		confDry := lifecycleConfig(t)
		confDry.ControlPlanes = []string{"titan-db"}
		confDry.SSHManagedNodes = []string{"titan-db"}
		orchDry := newDryRunHookOrchestrator(t, confDry, nil)
		if err := orchDry.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err != nil {
			t.Fatalf("expected dry-run etcd restore success, got %v", err)
		}
		confMode := lifecycleConfig(t)
		orchMode, _ := newHookOrchestrator(t, confMode, nil, nil)
		if err := orchMode.Shutdown(context.Background(), cluster.ShutdownOptions{Mode: "poweroff"}); err == nil {
			t.Fatalf("expected removed shutdown mode guard")
		}
	})
}