ananke/testing/orchestrator/hooks_ops_timesync_matrix_test.go

package orchestrator

import (
	"context"
	"errors"
	"fmt"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
)

// TestHookTimeSyncInventoryMatrix exercises the time-sync wait, the external
// datastore preflight, and the node-inventory helpers across their branches.
// Why: closes branch-heavy time-sync and datastore preflight gaps that appear
// during cold bootstraps and network jitter.
func TestHookTimeSyncInventoryMatrix(t *testing.T) {
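	// Each subtest wires a fake command runner into newHookOrchestrator:
	// recognized sh/ssh probes return canned output, and everything else
	// falls through to the shared lifecycleDispatcher stub.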
t.Run("wait-for-time-sync-strict-and-quorum", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.TimeSyncWaitSeconds = 1
cfg.Startup.TimeSyncPollSeconds = 1
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
return "yes", nil
case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
if strings.Contains(command, "titan-23") {
return "no", nil
}
return "yes", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"}); err != nil {
t.Fatalf("strict timesync success expected, got %v", err)
}
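		// Quorum mode tolerates stragglers: with TimeSyncQuorum=1, titan-db's
		// "yes" satisfies the wait even though titan-23 keeps answering "no".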
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		orchQuorum, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orchQuorum.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("quorum timesync success expected, got %v", err)
		}
	})
t.Run("wait-for-time-sync-timeout-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.TimeSyncWaitSeconds = 1
cfg.Startup.TimeSyncPollSeconds = 1
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
return "no", nil
case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
return "no", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
t.Fatalf("expected strict timesync timeout error, got %v", err)
}
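		// A context canceled before the wait begins must surface
		// context.Canceled instead of the "not ready" timeout error.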
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		if err := orch.TestHookWaitForTimeSync(cancelCtx, []string{"titan-db"}); !errors.Is(err, context.Canceled) {
			t.Fatalf("expected canceled timesync wait, got %v", err)
		}
	})
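	// The external-datastore preflight should no-op (return nil) whenever
	// there is nothing it can usefully check; each case below hits one of
	// those early-return branches.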
t.Run("preflight-external-datastore-noop-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.ControlPlanes = nil
orchNoCP, _ := newHookOrchestrator(t, cfg, nil, nil)
if err := orchNoCP.TestHookPreflightExternalDatastore(context.Background()); err != nil {
t.Fatalf("expected nil when no control planes configured, got %v", err)
}
cfgUnmanaged := lifecycleConfig(t)
cfgUnmanaged.ControlPlanes = []string{"titan-24"}
cfgUnmanaged.SSHManagedNodes = []string{"titan-db"}
orchUnmanaged, _ := newHookOrchestrator(t, cfgUnmanaged, nil, nil)
if err := orchUnmanaged.TestHookPreflightExternalDatastore(context.Background()); err != nil {
t.Fatalf("expected nil when control plane is unmanaged, got %v", err)
}
cfgInspectErr := lifecycleConfig(t)
runInspectErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
return "", errors.New("permission denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchInspectErr, _ := newHookOrchestrator(t, cfgInspectErr, runInspectErr, runInspectErr)
if err := orchInspectErr.TestHookPreflightExternalDatastore(context.Background()); err != nil {
t.Fatalf("expected nil when k3s unit inspect fails, got %v", err)
}
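		// An ExecStart whose --datastore-endpoint value is not a parseable
		// URL should be tolerated rather than fail the preflight.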
		cfgBadEndpoint := lifecycleConfig(t)
		runBadEndpoint := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=::bad-endpoint::", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchBadEndpoint, _ := newHookOrchestrator(t, cfgBadEndpoint, runBadEndpoint, runBadEndpoint)
		if err := orchBadEndpoint.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected nil when datastore endpoint cannot be parsed, got %v", err)
		}
	})
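	// These cases bind real 127.0.0.1 listeners so the preflight's
	// reachability check (presumably a TCP dial) sees a live endpoint, or a
	// dead one that comes back after a restart.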
t.Run("preflight-external-datastore-reachable-and-recovery", func(t *testing.T) {
listener, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("listen for reachable datastore branch: %v", err)
}
reachablePort := listener.Addr().(*net.TCPAddr).Port
defer listener.Close()
cfgReachable := lifecycleConfig(t)
runReachable := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
return fmt.Sprintf(
"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:%d/k3s",
reachablePort,
), nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchReachable, _ := newHookOrchestrator(t, cfgReachable, runReachable, runReachable)
if err := orchReachable.TestHookPreflightExternalDatastore(context.Background()); err != nil {
t.Fatalf("expected reachable datastore preflight success, got %v", err)
}
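		// Recovery branch: grab a free port and close it so the endpoint
		// starts out down, then let the stubbed "systemctl restart postgresql"
		// bring a listener back up so a later reachability probe succeeds.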
		recoverySeed, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("seed recovery port: %v", err)
		}
		recoveryPort := recoverySeed.Addr().(*net.TCPAddr).Port
		_ = recoverySeed.Close()
		cfgRecovery := lifecycleConfig(t)
		cfgRecovery.SSHManagedNodes = append(cfgRecovery.SSHManagedNodes, "db-postgres")
		cfgRecovery.SSHNodeHosts["db-postgres"] = "127.0.0.1"
		restarted := false
		runRecovery := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return fmt.Sprintf(
					"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:%d/k3s",
					recoveryPort,
				), nil
			case name == "ssh" && strings.Contains(command, "systemctl restart postgresql"):
				restarted = true
				go func() {
					time.Sleep(500 * time.Millisecond)
					l, listenErr := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", recoveryPort))
					if listenErr != nil {
						return
					}
					time.Sleep(4 * time.Second)
					_ = l.Close()
				}()
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchRecovery, _ := newHookOrchestrator(t, cfgRecovery, runRecovery, runRecovery)
		if err := orchRecovery.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected datastore recovery branch success, got %v", err)
		}
		if !restarted {
			t.Fatalf("expected datastore restart branch to run")
		}
	})
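	// With the endpoint pointing at a port nothing listens on and the context
	// canceled up front, the preflight should bail out with context.Canceled
	// rather than keep probing.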
t.Run("preflight-external-datastore-canceled", func(t *testing.T) {
cfg := lifecycleConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
cancelCtx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookPreflightExternalDatastore(cancelCtx); !errors.Is(err, context.Canceled) {
t.Fatalf("expected canceled datastore preflight, got %v", err)
}
})
t.Run("helpers-parse-endpoint-node-lookup-and-inventory-validate", func(t *testing.T) {
if got := cluster.TestHookParseDatastoreEndpoint(`ExecStart=/usr/local/bin/k3s server --datastore-endpoint="postgres://db.internal:5432/k3s"`); got == "" {
t.Fatalf("expected regex endpoint parse to return value")
}
if got := cluster.TestHookParseDatastoreEndpoint("ExecStart=/usr/local/bin/k3s server \\\n --datastore-endpoint=postgres://db.internal:5432/k3s \\\n"); got == "" {
t.Fatalf("expected line-scan endpoint parse to return value")
}
if got := cluster.TestHookParseDatastoreEndpoint("ExecStart=/usr/local/bin/k3s server"); got != "" {
t.Fatalf("expected empty parse result for missing endpoint, got %q", got)
}
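		// Host lookup should resolve both directions: a node name maps to
		// itself, a configured host/IP maps back to its node, and the empty
		// host stays empty.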
		cfg := lifecycleConfig(t)
		cfg.SSHNodeHosts["db-postgres"] = "10.42.0.10"
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if got := orch.TestHookNodeNameForHost("db-postgres"); got != "db-postgres" {
			t.Fatalf("expected direct node lookup, got %q", got)
		}
		if got := orch.TestHookNodeNameForHost("10.42.0.10"); got != "db-postgres" {
			t.Fatalf("expected reverse host lookup to resolve db-postgres, got %q", got)
		}
		if got := orch.TestHookNodeNameForHost(""); got != "" {
			t.Fatalf("expected empty host lookup to be empty, got %q", got)
		}
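		// Inventory validation should aggregate every problem into one error:
		// the out-of-range ssh_port, the illegal host, and the illegal user
		// planted below must all be mentioned.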
		cfgBad := lifecycleConfig(t)
		cfgBad.SSHPort = 70000
		cfgBad.Workers = []string{"titan-23", "titan-24"}
		cfgBad.SSHManagedNodes = []string{"titan-db", "titan-23"}
		cfgBad.SSHNodeHosts["titan-24"] = "bad/host"
		cfgBad.SSHNodeUsers = map[string]string{"titan-24": "bad@user"}
		orchBad, _ := newHookOrchestrator(t, cfgBad, nil, nil)
		err := orchBad.TestHookValidateNodeInventory()
		if err == nil {
			t.Fatalf("expected invalid inventory error")
		}
		if !strings.Contains(err.Error(), "ssh_port") || !strings.Contains(err.Error(), "invalid ssh host") || !strings.Contains(err.Error(), "invalid ssh user") {
			t.Fatalf("unexpected inventory validation error: %v", err)
		}
	})
}