package orchestrator

import (
	"context"
	"io"
	"log"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookFluxHealthAndStorageBranches exercises the flux-health and storage
// readiness helpers against canned kubectl output: the first kustomization
// poll reports Ready=False, subsequent polls report Ready=True, and the
// Longhorn node and critical-PVC probes report a healthy store throughout.
func TestHookFluxHealthAndStorageBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	// Short wait/poll windows keep the polling helpers fast in tests.
	cfg.Startup.FluxHealthWaitSeconds = 2
	cfg.Startup.FluxHealthPollSeconds = 1
	cfg.Startup.IgnoreFluxKustomizations = []string{"flux-system/skip-me"}
	cfg.Startup.StorageReadyWaitSeconds = 2
	cfg.Startup.StorageReadyPollSeconds = 1
	cfg.Startup.StorageMinReadyNodes = 1
	cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}

	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	fluxCalls := 0
	// run intercepts the commands under test and delegates everything else to
	// the shared lifecycle dispatcher.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
			recorder.record(name, args)
			fluxCalls++
			// First poll: not ready (drives the not-ready assertion below);
			// later polls: ready (lets waitForFluxHealth converge).
			if fluxCalls <= 1 {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"Unknown","message":"waiting"}]}}]}`, nil
			}
			return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"ok"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
			recorder.record(name, args)
			// No jobs at all, so the immutable-job healer has nothing to do.
			return `{"items":[]}`, nil
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			recorder.record(name, args)
			// One Longhorn node, ready+schedulable: satisfies StorageMinReadyNodes=1.
			return "lh-a:True:True\n", nil
		case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
			recorder.record(name, args)
			// The single critical PVC reports Bound.
			return "Bound", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)

	// First direct readiness check consumes the Ready=False fixture.
	ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
	if err != nil {
		t.Fatalf("fluxHealthReady error: %v", err)
	}
	if ok {
		t.Fatalf("expected first fluxHealthReady call to be not-ready: %s", detail)
	}
	healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
	if err != nil {
		t.Fatalf("healImmutableFluxJobs error: %v", err)
	}
	if healed {
		t.Fatalf("expected no immutable job heal action in this fixture")
	}
	// The wait loop should now see the Ready=True fixture and succeed within
	// the 2-second window configured above.
	if err := orch.TestHookWaitForFluxHealth(context.Background()); err != nil {
		t.Fatalf("waitForFluxHealth: %v", err)
	}

	ready, reason, readyErr := orch.TestHookStorageReady(context.Background())
	if readyErr != nil {
		t.Fatalf("storageReady error: %v", readyErr)
	}
	if !ready {
		t.Fatalf("expected storage ready, reason=%s", reason)
	}
	if err := orch.TestHookWaitForStorageReady(context.Background()); err != nil {
		t.Fatalf("waitForStorageReady: %v", err)
	}
}
|
|
|
|
// TestHookTimeSyncAndDatastoreBranches runs one orchestration or CLI step.
|
|
// Signature: TestHookTimeSyncAndDatastoreBranches(t *testing.T).
|
|
// Why: covers time-sync gate and datastore preflight helpers, including parser and TCP helper paths.
|
|
func TestHookTimeSyncAndDatastoreBranches(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.ControlPlanes = []string{"titan-db", "titan-23"}
|
|
cfg.Workers = []string{"titan-24"}
|
|
cfg.SSHManagedNodes = []string{"titan-db", "titan-23", "titan-24"}
|
|
cfg.SSHNodeHosts["titan-23"] = "titan-23"
|
|
cfg.SSHNodeHosts["titan-24"] = "titan-24"
|
|
cfg.Startup.TimeSyncMode = "quorum"
|
|
cfg.Startup.TimeSyncQuorum = 1
|
|
cfg.Startup.TimeSyncWaitSeconds = 2
|
|
cfg.Startup.TimeSyncPollSeconds = 1
|
|
|
|
recorder := &commandRecorder{}
|
|
base := lifecycleDispatcher(recorder)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
|
|
recorder.record(name, args)
|
|
return "yes", nil
|
|
case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
|
|
recorder.record(name, args)
|
|
if strings.Contains(command, "titan-db") {
|
|
return "yes", nil
|
|
}
|
|
return "no", nil
|
|
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
|
|
recorder.record(name, args)
|
|
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:6543/k3s", nil
|
|
default:
|
|
return base(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
|
orch.SetCommandOverrides(run, run)
|
|
|
|
if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
|
|
t.Fatalf("waitForTimeSync: %v", err)
|
|
}
|
|
if got := cluster.TestHookParseDatastoreEndpoint(`ExecStart=/usr/local/bin/k3s server --datastore-endpoint="postgres://db:5432/k3s"`); !strings.Contains(got, "postgres://db:5432/k3s") {
|
|
t.Fatalf("unexpected datastore endpoint parse: %q", got)
|
|
}
|
|
if got := orch.TestHookNodeNameForHost("titan-23"); got != "titan-23" {
|
|
t.Fatalf("unexpected nodeNameForHost direct match: %q", got)
|
|
}
|
|
if err := orch.TestHookValidateNodeInventory(); err != nil {
|
|
t.Fatalf("validateNodeInventory: %v", err)
|
|
}
|
|
|
|
ln, err := net.Listen("tcp", "127.0.0.1:0")
|
|
if err != nil {
|
|
t.Fatalf("listen for tcpReachable test: %v", err)
|
|
}
|
|
addr := ln.Addr().String()
|
|
if !orch.TestHookTCPReachable(addr, time.Second) {
|
|
t.Fatalf("expected tcpReachable=true for listener %s", addr)
|
|
}
|
|
_ = ln.Close()
|
|
if orch.TestHookTCPReachable(addr, 100*time.Millisecond) {
|
|
t.Fatalf("expected tcpReachable=false after listener close")
|
|
}
|
|
}
|
|
|
|
// TestHookChecklistAndStabilityBranches runs one orchestration or CLI step.
|
|
// Signature: TestHookChecklistAndStabilityBranches(t *testing.T).
|
|
// Why: covers checklist helper methods and startup stability window internals.
|
|
func TestHookChecklistAndStabilityBranches(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
|
{
|
|
Name: "grafana",
|
|
URL: "https://metrics.bstein.dev/api/health",
|
|
AcceptedStatuses: []int{200},
|
|
BodyContains: `"database":"ok"`,
|
|
TimeoutSeconds: 5,
|
|
},
|
|
}
|
|
cfg.Startup.ServiceChecklistWaitSeconds = 1
|
|
cfg.Startup.ServiceChecklistPollSeconds = 1
|
|
cfg.Startup.ServiceChecklistStabilitySec = 1
|
|
cfg.Startup.RequireWorkloadConvergence = false
|
|
|
|
recorder := &commandRecorder{}
|
|
base := lifecycleDispatcher(recorder)
|
|
serviceCalls := 0
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
recorder.record(name, args)
|
|
return `{"items":[]}`, nil
|
|
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
recorder.record(name, args)
|
|
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
|
|
case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
|
|
recorder.record(name, args)
|
|
return "monitoring\tgrafana\t1\n", nil
|
|
case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
|
|
recorder.record(name, args)
|
|
return "", nil
|
|
case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
|
|
recorder.record(name, args)
|
|
return `{"items":[]}`, nil
|
|
case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
|
|
recorder.record(name, args)
|
|
return "lh-a:True:True\n", nil
|
|
case name == "curl":
|
|
recorder.record(name, args)
|
|
serviceCalls++
|
|
if serviceCalls == 1 {
|
|
return "503", nil
|
|
}
|
|
return "200", nil
|
|
default:
|
|
return base(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
|
orch.SetCommandOverrides(run, run)
|
|
|
|
ok, detail := orch.TestHookPostStartProbesReady(context.Background())
|
|
if !ok || !strings.Contains(detail, "no probes configured") {
|
|
t.Fatalf("expected no-probes ready branch, got ok=%v detail=%q", ok, detail)
|
|
}
|
|
code, err := orch.TestHookHTTPProbe(context.Background(), "https://example.invalid")
|
|
if err != nil {
|
|
t.Fatalf("unexpected HTTP probe error with recorder override: %v", err)
|
|
}
|
|
if code != 503 {
|
|
t.Fatalf("expected first synthetic HTTP probe code=503, got %d", code)
|
|
}
|
|
|
|
// Direct checklist readiness path should always return a non-empty status detail.
|
|
_, checkDetail := orch.TestHookServiceChecklistReady(context.Background())
|
|
if checkDetail == "" {
|
|
t.Fatalf("expected service checklist detail to be populated")
|
|
}
|
|
|
|
// Force stability helper path through synthetic kubectl outputs.
|
|
if err := orch.TestHookWaitForStabilityWindow(context.Background()); err != nil && !strings.Contains(err.Error(), "stability") {
|
|
t.Fatalf("unexpected stability window error: %v", err)
|
|
}
|
|
}
|