ananke/testing/orchestrator/hooks_health_storage_test.go

package orchestrator

import (
	"context"
	"io"
	"log"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookFluxHealthAndStorageBranches drives the flux-health and storage readiness hooks against canned kubectl output.
// Signature: TestHookFluxHealthAndStorageBranches(t *testing.T).
// Why: exercises flux-health and storage readiness helpers directly for coverage and behavioral safety.
func TestHookFluxHealthAndStorageBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.FluxHealthWaitSeconds = 2
	cfg.Startup.FluxHealthPollSeconds = 1
	cfg.Startup.IgnoreFluxKustomizations = []string{"flux-system/skip-me"}
	cfg.Startup.StorageReadyWaitSeconds = 2
	cfg.Startup.StorageReadyPollSeconds = 1
	cfg.Startup.StorageMinReadyNodes = 1
	cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	fluxCalls := 0
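	// Fake command runner: intercept the kubectl invocations the hooks
	// issue and return canned JSON. The first kustomization query reports
	// Ready=False, so the flux-health wait loop has to poll at least once
	// more before it sees Ready=True.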
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
			recorder.record(name, args)
			fluxCalls++
			if fluxCalls <= 1 {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"Unknown","message":"waiting"}]}}]}`, nil
			}
			return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"ok"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
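		// Longhorn node fixture: a node name plus two "True" fields,
		// presumably the ready/schedulable columns the storage helper
		// extracts from its custom kubectl output format.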
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			recorder.record(name, args)
			return "lh-a:True:True\n", nil
		case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
			recorder.record(name, args)
			return "Bound", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
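	// The first direct readiness call consumes the not-ready fixture.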
	ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
	if err != nil {
		t.Fatalf("fluxHealthReady error: %v", err)
	}
	if ok {
		t.Fatalf("expected first fluxHealthReady call to be not-ready: %s", detail)
	}
	healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
	if err != nil {
		t.Fatalf("healImmutableFluxJobs error: %v", err)
	}
	if healed {
		t.Fatalf("expected no immutable job heal action in this fixture")
	}
	if err := orch.TestHookWaitForFluxHealth(context.Background()); err != nil {
		t.Fatalf("waitForFluxHealth: %v", err)
	}
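	// Storage readiness consumes the Longhorn node and PVC fixtures above:
	// one ready node satisfies StorageMinReadyNodes=1 and the critical PVC
	// reports Bound.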
	ready, reason, readyErr := orch.TestHookStorageReady(context.Background())
	if readyErr != nil {
		t.Fatalf("storageReady error: %v", readyErr)
	}
	if !ready {
		t.Fatalf("expected storage ready, reason=%s", reason)
	}
	if err := orch.TestHookWaitForStorageReady(context.Background()); err != nil {
		t.Fatalf("waitForStorageReady: %v", err)
	}
}
// TestHookTimeSyncAndDatastoreBranches drives the time-sync gate, datastore preflight, and small helper paths against canned command output.
// Signature: TestHookTimeSyncAndDatastoreBranches(t *testing.T).
// Why: covers time-sync gate and datastore preflight helpers, including parser and TCP helper paths.
func TestHookTimeSyncAndDatastoreBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.ControlPlanes = []string{"titan-db", "titan-23"}
	cfg.Workers = []string{"titan-24"}
	cfg.SSHManagedNodes = []string{"titan-db", "titan-23", "titan-24"}
	cfg.SSHNodeHosts["titan-23"] = "titan-23"
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	cfg.Startup.TimeSyncMode = "quorum"
	cfg.Startup.TimeSyncQuorum = 1
	cfg.Startup.TimeSyncWaitSeconds = 2
	cfg.Startup.TimeSyncPollSeconds = 1
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
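	// Fake runner: the local timedatectl probe (via sh) and the ssh probe
	// against titan-db both report synchronized, while every other node
	// reports "no"; with TimeSyncQuorum=1 a single synced node is enough.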
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
			recorder.record(name, args)
			return "yes", nil
		case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
			recorder.record(name, args)
			if strings.Contains(command, "titan-db") {
				return "yes", nil
			}
			return "no", nil
		case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
			recorder.record(name, args)
			return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:6543/k3s", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
	if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
		t.Fatalf("waitForTimeSync: %v", err)
	}
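	// The parser should unwrap the quoted --datastore-endpoint value from a
	// systemd ExecStart line.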
	if got := cluster.TestHookParseDatastoreEndpoint(`ExecStart=/usr/local/bin/k3s server --datastore-endpoint="postgres://db:5432/k3s"`); !strings.Contains(got, "postgres://db:5432/k3s") {
		t.Fatalf("unexpected datastore endpoint parse: %q", got)
	}
	if got := orch.TestHookNodeNameForHost("titan-23"); got != "titan-23" {
		t.Fatalf("unexpected nodeNameForHost direct match: %q", got)
	}
	if err := orch.TestHookValidateNodeInventory(); err != nil {
		t.Fatalf("validateNodeInventory: %v", err)
	}
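	// Exercise the TCP helper against a real loopback listener: reachable
	// while it is open, unreachable (within a short timeout) once closed.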
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen for tcpReachable test: %v", err)
	}
	addr := ln.Addr().String()
	if !orch.TestHookTCPReachable(addr, time.Second) {
		t.Fatalf("expected tcpReachable=true for listener %s", addr)
	}
	_ = ln.Close()
	if orch.TestHookTCPReachable(addr, 100*time.Millisecond) {
		t.Fatalf("expected tcpReachable=false after listener close")
	}
}
// TestHookChecklistAndStabilityBranches drives the service checklist and stability-window hooks against canned command output.
// Signature: TestHookChecklistAndStabilityBranches(t *testing.T).
// Why: covers checklist helper methods and startup stability window internals.
func TestHookChecklistAndStabilityBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{
			Name:             "grafana",
			URL:              "https://metrics.bstein.dev/api/health",
			AcceptedStatuses: []int{200},
			BodyContains:     `"database":"ok"`,
			TimeoutSeconds:   5,
		},
	}
	cfg.Startup.ServiceChecklistWaitSeconds = 1
	cfg.Startup.ServiceChecklistPollSeconds = 1
	cfg.Startup.ServiceChecklistStabilitySec = 1
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	serviceCalls := 0
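	// Fake runner: kubectl queries get minimal converged fixtures, and the
	// curl stub returns 503 on its first call and 200 afterwards so both
	// the failing and passing checklist branches are reachable.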
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
		case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
			recorder.record(name, args)
			return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
			recorder.record(name, args)
			return "monitoring\tgrafana\t1\n", nil
		case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
			recorder.record(name, args)
			return "", nil
		case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			recorder.record(name, args)
			return "lh-a:True:True\n", nil
		case name == "curl":
			recorder.record(name, args)
			serviceCalls++
			if serviceCalls == 1 {
				return "503", nil
			}
			return "200", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
	ok, detail := orch.TestHookPostStartProbesReady(context.Background())
	if !ok || !strings.Contains(detail, "no probes configured") {
		t.Fatalf("expected no-probes ready branch, got ok=%v detail=%q", ok, detail)
	}
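	// The raw HTTP probe goes through the curl stub, so the first call
	// surfaces the synthetic 503 without an error.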
	code, err := orch.TestHookHTTPProbe(context.Background(), "https://example.invalid")
	if err != nil {
		t.Fatalf("unexpected HTTP probe error with recorder override: %v", err)
	}
	if code != 503 {
		t.Fatalf("expected first synthetic HTTP probe code=503, got %d", code)
	}
	// Direct checklist readiness path should always return a non-empty status detail.
	_, checkDetail := orch.TestHookServiceChecklistReady(context.Background())
	if checkDetail == "" {
		t.Fatalf("expected service checklist detail to be populated")
	}
	// Force stability helper path through synthetic kubectl outputs.
	if err := orch.TestHookWaitForStabilityWindow(context.Background()); err != nil && !strings.Contains(err.Error(), "stability") {
		t.Fatalf("unexpected stability window error: %v", err)
	}
}