ananke/testing/orchestrator/hooks_health_storage_test.go

package orchestrator

import (
	"context"
	"io"
	"log"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookFluxHealthAndStorageBranches drives the flux-health and storage readiness hooks against canned kubectl output.
// Signature: TestHookFluxHealthAndStorageBranches(t *testing.T).
// Why: exercises flux-health and storage readiness helpers directly for coverage and behavioral safety.
func TestHookFluxHealthAndStorageBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.FluxHealthWaitSeconds = 2
	cfg.Startup.FluxHealthPollSeconds = 1
	cfg.Startup.IgnoreFluxKustomizations = []string{"flux-system/skip-me"}
	cfg.Startup.StorageReadyWaitSeconds = 2
	cfg.Startup.StorageReadyPollSeconds = 1
	cfg.Startup.StorageMinReadyNodes = 1
	cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	fluxCalls := 0
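	// Fake command runner: intercept the kubectl invocations the hooks
	// issue and return canned JSON. The first kustomization query reports
	// Ready=False, so the flux-health wait loop has to poll at least once
	// more before it sees Ready=True.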
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
			recorder.record(name, args)
			fluxCalls++
			if fluxCalls <= 1 {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"Unknown","message":"waiting"}]}}]}`, nil
			}
			return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"ok"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
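		// Longhorn node fixture: a node name plus two "True" fields,
		// presumably the ready/schedulable columns the storage helper
		// extracts from its custom kubectl output format.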
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			recorder.record(name, args)
			return "lh-a:True:True\n", nil
		case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
			recorder.record(name, args)
			return "Bound", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
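	// The first direct readiness call consumes the not-ready fixture.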
	ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
	if err != nil {
		t.Fatalf("fluxHealthReady error: %v", err)
	}
	if ok {
		t.Fatalf("expected first fluxHealthReady call to be not-ready: %s", detail)
	}
	healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
	if err != nil {
		t.Fatalf("healImmutableFluxJobs error: %v", err)
	}
	if healed {
		t.Fatalf("expected no immutable job heal action in this fixture")
	}
	if err := orch.TestHookWaitForFluxHealth(context.Background()); err != nil {
		t.Fatalf("waitForFluxHealth: %v", err)
	}
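	// Storage readiness consumes the Longhorn node and PVC fixtures above:
	// one ready node satisfies StorageMinReadyNodes=1 and the critical PVC
	// reports Bound.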
	ready, reason, readyErr := orch.TestHookStorageReady(context.Background())
	if readyErr != nil {
		t.Fatalf("storageReady error: %v", readyErr)
	}
	if !ready {
		t.Fatalf("expected storage ready, reason=%s", reason)
	}
	if err := orch.TestHookWaitForStorageReady(context.Background()); err != nil {
		t.Fatalf("waitForStorageReady: %v", err)
	}
}
// TestHookTimeSyncAndDatastoreBranches drives the time-sync gate, datastore preflight, and small helper paths against canned command output.
// Signature: TestHookTimeSyncAndDatastoreBranches(t *testing.T).
// Why: covers time-sync gate and datastore preflight helpers, including parser and TCP helper paths.
func TestHookTimeSyncAndDatastoreBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.ControlPlanes = []string{"titan-db", "titan-23"}
	cfg.Workers = []string{"titan-24"}
	cfg.SSHManagedNodes = []string{"titan-db", "titan-23", "titan-24"}
	cfg.SSHNodeHosts["titan-23"] = "titan-23"
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	cfg.Startup.TimeSyncMode = "quorum"
	cfg.Startup.TimeSyncQuorum = 1
	cfg.Startup.TimeSyncWaitSeconds = 2
	cfg.Startup.TimeSyncPollSeconds = 1
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
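	// Fake runner: the local timedatectl probe (via sh) and the ssh probe
	// against titan-db both report synchronized, while every other node
	// reports "no"; with TimeSyncQuorum=1 a single synced node is enough.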
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
			recorder.record(name, args)
			return "yes", nil
		case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
			recorder.record(name, args)
			if strings.Contains(command, "titan-db") {
				return "yes", nil
			}
			return "no", nil
		case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
			recorder.record(name, args)
			return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:6543/k3s", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
	if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
		t.Fatalf("waitForTimeSync: %v", err)
	}
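	// The parser should unwrap the quoted --datastore-endpoint value from a
	// systemd ExecStart line.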
	if got := cluster.TestHookParseDatastoreEndpoint(`ExecStart=/usr/local/bin/k3s server --datastore-endpoint="postgres://db:5432/k3s"`); !strings.Contains(got, "postgres://db:5432/k3s") {
		t.Fatalf("unexpected datastore endpoint parse: %q", got)
	}
	if got := orch.TestHookNodeNameForHost("titan-23"); got != "titan-23" {
		t.Fatalf("unexpected nodeNameForHost direct match: %q", got)
	}
	if err := orch.TestHookValidateNodeInventory(); err != nil {
		t.Fatalf("validateNodeInventory: %v", err)
	}
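	// Exercise the TCP helper against a real loopback listener: reachable
	// while it is open, unreachable (within a short timeout) once closed.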
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen for tcpReachable test: %v", err)
	}
	addr := ln.Addr().String()
	if !orch.TestHookTCPReachable(addr, time.Second) {
		t.Fatalf("expected tcpReachable=true for listener %s", addr)
	}
	_ = ln.Close()
	if orch.TestHookTCPReachable(addr, 100*time.Millisecond) {
		t.Fatalf("expected tcpReachable=false after listener close")
	}
}
// TestHookChecklistAndStabilityBranches drives the service checklist and stability-window hooks against canned command output.
// Signature: TestHookChecklistAndStabilityBranches(t *testing.T).
// Why: covers checklist helper methods and startup stability window internals.
func TestHookChecklistAndStabilityBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{
			Name:             "grafana",
			URL:              "https://metrics.bstein.dev/api/health",
			AcceptedStatuses: []int{200},
			BodyContains:     `"database":"ok"`,
			TimeoutSeconds:   5,
		},
	}
	cfg.Startup.ServiceChecklistWaitSeconds = 1
	cfg.Startup.ServiceChecklistPollSeconds = 1
	cfg.Startup.ServiceChecklistStabilitySec = 1
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	serviceCalls := 0
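	// Fake runner: kubectl queries get minimal converged fixtures, and the
	// curl stub returns 503 on its first call and 200 afterwards so both
	// the failing and passing checklist branches are reachable.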
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
		case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
			recorder.record(name, args)
			return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
			recorder.record(name, args)
			return "monitoring\tgrafana\t1\n", nil
		case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
			recorder.record(name, args)
			return "", nil
		case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			recorder.record(name, args)
			return "lh-a:True:True\n", nil
		case name == "curl":
			recorder.record(name, args)
			serviceCalls++
			if serviceCalls == 1 {
				return "503", nil
			}
			return "200", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
	ok, detail := orch.TestHookPostStartProbesReady(context.Background())
	if !ok || !strings.Contains(detail, "no probes configured") {
		t.Fatalf("expected no-probes ready branch, got ok=%v detail=%q", ok, detail)
	}
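	// The raw HTTP probe goes through the curl stub, so the first call
	// surfaces the synthetic 503 without an error.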
	code, err := orch.TestHookHTTPProbe(context.Background(), "https://example.invalid")
	if err != nil {
		t.Fatalf("unexpected HTTP probe error with recorder override: %v", err)
	}
	if code != 503 {
		t.Fatalf("expected first synthetic HTTP probe code=503, got %d", code)
	}
	// Direct checklist readiness path should always return a non-empty status detail.
	_, checkDetail := orch.TestHookServiceChecklistReady(context.Background())
	if checkDetail == "" {
		t.Fatalf("expected service checklist detail to be populated")
	}
	// Force stability helper path through synthetic kubectl outputs.
	if err := orch.TestHookWaitForStabilityWindow(context.Background()); err != nil && !strings.Contains(err.Error(), "stability") {
		t.Fatalf("unexpected stability window error: %v", err)
	}
}