// ananke/testing/orchestrator/hooks_wrappers_coverage_test.go

package orchestrator
import (
"context"
"io"
"log"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// wrapperCoverageDispatcher builds the command-override function used by the
// hook-wrapper coverage tests.
// Signature: wrapperCoverageDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error).
// Why: centralizes deterministic command output so hook-wrapper tests can execute all exported test hooks without live cluster access.
func wrapperCoverageDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error) {
	base := lifecycleDispatcher(recorder)
	return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		// reply records the matched command on the recorder and returns the
		// canned output, removing the record/return duplication every switch
		// arm previously carried.
		reply := func(out string) (string, error) {
			recorder.record(name, args)
			return out, nil
		}
		switch {
		case name == "curl":
			return reply("200")
		case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"):
			return reply("Name Size Created Location\npre-shutdown 4.2M now \"file:///var/lib/rancher/k3s/server/db/snapshots/pre-shutdown\"\n")
		case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot save"):
			return reply("snapshot saved")
		case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
			return reply(`{"items":[{"metadata":{"namespace":"monitoring"},"spec":{"rules":[{"host":"metrics.bstein.dev"}]}}]}`)
		case name == "kubectl" && strings.Contains(command, "get endpoints victoria-metrics-single-server"):
			return reply("10.42.0.10\n")
		case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
			return reply(`{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"titan-23"}}}},"status":{"readyReplicas":1}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}},{"kind":"DaemonSet","metadata":{"namespace":"monitoring","name":"node-exporter"},"spec":{"template":{"spec":{}}},"status":{"desiredNumberScheduled":1,"numberReady":1}}]}`)
		case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
			return reply(`{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0,"template":{"spec":{}}},"status":{"readyReplicas":0}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`)
		case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
			return reply(`{"items":[{"metadata":{"namespace":"vault","name":"vault-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"vault"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"vault-agent-init","state":{"running":{"startedAt":"2020-01-01T00:00:00Z"}}}]}}]}`)
		case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
			return reply("monitoring\tgrafana\t1\nflux-system\tsource-controller\t1\ngitea\tgitea\t1\n")
		case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
			return reply("monitoring\tvictoria-metrics-single-server\t1\nvault\tvault\t1\npostgres\tpostgres\t1\n")
		case name == "kubectl" && strings.Contains(command, "get nodes -o custom-columns="):
			return reply("titan-23 <none> <none>\n")
		case name == "kubectl" && strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath="):
			return reply("services\n")
		case name == "kubectl" && strings.Contains(command, "get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="):
			return reply("monitoring/grafana\n")
		// NOTE(review): a former dedicated arm for
		// "get statefulset vault -o jsonpath={.status.readyReplicas}" was
		// removed — the generic readyReplicas arm below matches the same
		// commands and records/replies identically, so behavior is unchanged.
		case name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}"):
			return reply("1")
		case name == "kubectl" && strings.Contains(command, "rollout status"):
			return reply("rolled out")
		case name == "kubectl" && strings.Contains(command, "scale "):
			return reply("")
		case name == "kubectl" && strings.Contains(command, "delete pod"):
			return reply("")
		case name == "kubectl" && strings.Contains(command, "patch "):
			return reply("")
		default:
			// Anything not modeled here falls through to the shared lifecycle
			// dispatcher so existing fixtures keep working.
			return base(ctx, timeout, name, args...)
		}
	}
}
// TestHookLifecycleWrappersCoverage runs one orchestration or CLI step.
// Signature: TestHookLifecycleWrappersCoverage(t *testing.T).
// Why: executes lifecycle hook wrappers so test-hook files and their orchestration backends stay covered from the top-level testing module.
func TestHookLifecycleWrappersCoverage(t *testing.T) {
	cfg := lifecycleConfig(t)
	// All wait/poll windows are pinned to one second so every polling code
	// path runs at least once without slowing the suite down.
	cfg.Startup.PostStartProbes = []string{"https://metrics.bstein.dev/healthz"}
	cfg.Startup.PostStartProbeWaitSeconds = 1
	cfg.Startup.PostStartProbePollSeconds = 1
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.ServiceChecklistStabilitySec = 1
	cfg.Startup.IngressChecklistWaitSeconds = 1
	cfg.Startup.IngressChecklistPollSeconds = 1
	cfg.Startup.ServiceChecklistWaitSeconds = 1
	cfg.Startup.ServiceChecklistPollSeconds = 1
	cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/victoria-metrics-single-server"}
	cfg.Startup.CriticalServiceEndpointWaitSec = 1
	cfg.Startup.CriticalServiceEndpointPollSec = 1
	// Local HTTP stub stands in for the service-checklist target.
	svc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"database":"ok"}`))
	}))
	defer svc.Close()
	cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{
			Name:             "grafana",
			URL:              svc.URL,
			AcceptedStatuses: []int{200},
			BodyContains:     `"database":"ok"`,
			TimeoutSeconds:   2,
		},
	}
	recorder := &commandRecorder{}
	dispatch := wrapperCoverageDispatcher(recorder)
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(dispatch, dispatch)
	ctx := context.Background()
	if err := orch.TestHookWaitForPostStartProbes(ctx); err != nil {
		t.Fatalf("wait for post-start probes: %v", err)
	}
	// Coverage-only invocation: the result is intentionally discarded.
	// (Was previously wrapped in an `if ...; true {}` no-op condition.)
	_, _ = orch.TestHookPostStartProbesReady(ctx)
	if _, err := orch.TestHookHTTPProbe(ctx, "https://metrics.bstein.dev/healthz"); err != nil {
		t.Fatalf("http probe: %v", err)
	}
	if err := orch.TestHookResumeFluxAndReconcile(ctx); err != nil {
		t.Fatalf("resume flux and reconcile: %v", err)
	}
	if _, err := orch.TestHookSSHWithTimeout(ctx, "titan-db", "echo ok", time.Second); err != nil {
		t.Fatalf("ssh with timeout: %v", err)
	}
	if _, err := orch.TestHookRunSensitive(ctx, time.Second, "kubectl", "version", "--request-timeout=5s"); err != nil {
		t.Fatalf("runSensitive wrapper: %v", err)
	}
	if err := orch.TestHookWaitForStartupConvergence(ctx); err != nil {
		t.Fatalf("wait for startup convergence: %v", err)
	}
	if err := orch.TestHookWaitForServiceChecklist(ctx); err != nil {
		t.Fatalf("wait for service checklist: %v", err)
	}
	if ok, detail := orch.TestHookServiceChecklistReady(ctx); !ok || detail == "" {
		t.Fatalf("service checklist ready result unexpected: ok=%v detail=%q", ok, detail)
	}
	if ok, detail := orch.TestHookServiceCheckReady(ctx, cfg.Startup.ServiceChecklist[0]); !ok || detail == "" {
		t.Fatalf("service check ready result unexpected: ok=%v detail=%q", ok, detail)
	}
	if status, body, err := orch.TestHookHTTPChecklistProbe(ctx, cfg.Startup.ServiceChecklist[0]); err != nil || status != 200 || body == "" {
		t.Fatalf("http checklist probe unexpected result status=%d body=%q err=%v", status, body, err)
	}
	if err := orch.TestHookWaitForStabilityWindow(ctx); err != nil {
		t.Fatalf("wait for stability window: %v", err)
	}
	if err := orch.TestHookStartupStabilityHealthy(ctx); err != nil {
		t.Fatalf("startup stability healthy: %v", err)
	}
	// Exercise the cancellation path of the ingress checklist wait.
	cancelCtx, cancel := context.WithCancel(ctx)
	cancel()
	_ = orch.TestHookWaitForIngressChecklist(cancelCtx)
	// Coverage-only invocation: the result is intentionally discarded.
	_, _ = orch.TestHookIngressChecklistReady(ctx)
	if hosts, err := orch.TestHookDiscoverIngressHosts(ctx); err != nil || len(hosts) == 0 {
		t.Fatalf("discover ingress hosts failed: hosts=%v err=%v", hosts, err)
	}
	if namespaces, err := orch.TestHookDiscoverIngressNamespacesForHost(ctx, "metrics.bstein.dev"); err != nil || len(namespaces) == 0 {
		t.Fatalf("discover ingress namespaces failed: ns=%v err=%v", namespaces, err)
	}
	last := time.Time{}
	orch.TestHookMaybeAutoHealIngressHostBackends(ctx, &last, "metrics.bstein.dev: status=503")
	if got := orch.TestHookChecklistFailureHost("metrics.bstein.dev: status=503"); got != "metrics.bstein.dev" {
		t.Fatalf("unexpected checklistFailureHost parse: %q", got)
	}
	if got := cluster.TestHookHostFromURL("https://metrics.bstein.dev/api/health"); got != "metrics.bstein.dev" {
		t.Fatalf("unexpected host parse: %q", got)
	}
}
// TestHookOpsAndWorkloadWrappersCoverage runs one orchestration or CLI step.
// Signature: TestHookOpsAndWorkloadWrappersCoverage(t *testing.T).
// Why: executes ops/workload wrappers so all exposed hook files remain covered and regression-safe.
func TestHookOpsAndWorkloadWrappersCoverage(t *testing.T) {
	conf := lifecycleConfig(t)
	// One-second windows keep every wait/poll path fast but exercised.
	conf.Startup.WorkloadConvergenceWaitSeconds = 1
	conf.Startup.WorkloadConvergencePollSeconds = 1
	conf.Startup.CriticalServiceEndpointWaitSec = 1
	conf.Startup.CriticalServiceEndpointPollSec = 1
	conf.Startup.AutoRecycleStuckPods = true
	conf.Startup.StuckPodGraceSeconds = 1
	// Local HTTP stub backs the single service-checklist entry.
	backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"status":"ok"}`))
	}))
	defer backend.Close()
	conf.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{Name: "grafana", URL: backend.URL, AcceptedStatuses: []int{200}, BodyContains: `"status":"ok"`, TimeoutSeconds: 2},
	}
	conf.Startup.ServiceChecklistWaitSeconds = 1
	conf.Startup.ServiceChecklistPollSeconds = 1
	rec := &commandRecorder{}
	run := wrapperCoverageDispatcher(rec)
	o := cluster.New(conf, &execx.Runner{DryRun: false}, state.New(conf.State.RunHistoryPath), log.New(io.Discard, "", 0))
	o.SetCommandOverrides(run, run)
	ctx := context.Background()
	if err := o.TestHookDrainWorkers(ctx, []string{"titan-23"}); err != nil {
		t.Fatalf("drain workers: %v", err)
	}
	if err := o.TestHookUncordonWorkers(ctx, []string{"titan-23"}); err != nil {
		t.Fatalf("uncordon workers: %v", err)
	}
	if err := o.TestHookTakeEtcdSnapshot(ctx, "titan-db"); err != nil {
		t.Fatalf("take etcd snapshot: %v", err)
	}
	if snapshot, err := o.TestHookLatestEtcdSnapshotPath(ctx, "titan-db"); err != nil || snapshot == "" {
		t.Fatalf("latest snapshot path failed: snapshot=%q err=%v", snapshot, err)
	}
	if workers, err := o.TestHookEffectiveWorkers(ctx); err != nil || len(workers) == 0 {
		t.Fatalf("effective workers failed: workers=%v err=%v", workers, err)
	}
	if workers, err := o.TestHookDiscoverWorkers(ctx); err != nil || len(workers) == 0 {
		t.Fatalf("discover workers failed: workers=%v err=%v", workers, err)
	}
	if err := o.TestHookPatchFluxSuspendAll(ctx, true); err != nil {
		t.Fatalf("patch flux suspend: %v", err)
	}
	// Scale/snapshot round trip: list the scalable workloads once, then feed
	// the same entries through scale, snapshot write, and snapshot read.
	workloads, err := o.TestHookListScalableWorkloads(ctx)
	if err != nil {
		t.Fatalf("list scalable workloads: %v", err)
	}
	if err := o.TestHookScaleWorkloads(ctx, workloads, 0, 1); err != nil {
		t.Fatalf("scale workloads forced: %v", err)
	}
	if err := o.TestHookWriteScaledWorkloadSnapshot(workloads); err != nil {
		t.Fatalf("write scaled workload snapshot: %v", err)
	}
	if _, err := o.TestHookReadScaledWorkloadSnapshot(); err != nil {
		t.Fatalf("read scaled workload snapshot: %v", err)
	}
	if err := o.TestHookScaleDownApps(ctx); err != nil {
		t.Fatalf("scale down apps: %v", err)
	}
	if err := o.TestHookRestoreScaledApps(ctx); err != nil {
		t.Fatalf("restore scaled apps: %v", err)
	}
	if err := o.TestHookWaitForCriticalServiceEndpoints(ctx); err != nil {
		t.Fatalf("wait for critical service endpoints: %v", err)
	}
	if ok, _, _, _, err := o.TestHookCriticalServiceEndpointsReady(ctx); err != nil || !ok {
		t.Fatalf("critical endpoints ready failed: ok=%v err=%v", ok, err)
	}
	if err := o.TestHookWaitForWorkloadConvergence(ctx); err != nil {
		t.Fatalf("wait for workload convergence: %v", err)
	}
	if ok, detail, err := o.TestHookWorkloadConvergenceReady(ctx); err != nil || !ok || detail == "" {
		t.Fatalf("workload convergence ready failed: ok=%v detail=%q err=%v", ok, detail, err)
	}
	if err := o.TestHookRecycleStuckControllerPods(ctx); err != nil {
		t.Fatalf("recycle stuck controller pods: %v", err)
	}
	// Auto-heal helpers only need a zero last-run timestamp to fire.
	recycleAt := time.Time{}
	o.TestHookMaybeAutoRecycleStuckPods(ctx, &recycleAt)
	healAt := time.Time{}
	o.TestHookMaybeAutoHealCriticalWorkloadReplicas(ctx, &healAt)
	if _, err := o.TestHookHealCriticalWorkloadReplicas(ctx); err != nil {
		t.Fatalf("heal critical workload replicas: %v", err)
	}
	if _, err := o.TestHookStartupFailurePods(ctx); err != nil {
		t.Fatalf("startup failure pods: %v", err)
	}
	if err := o.TestHookEnsureCriticalStartupWorkloads(ctx); err != nil {
		t.Fatalf("ensure critical startup workloads: %v", err)
	}
	if _, err := o.TestHookMissingCriticalStartupWorkloads(ctx); err != nil {
		t.Fatalf("missing critical startup workloads: %v", err)
	}
	if err := o.TestHookWaitForServiceChecklistAlias(ctx); err != nil {
		t.Fatalf("wait for service checklist alias: %v", err)
	}
}