package orchestrator

import (
	"context"
	"errors"
	"fmt"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
)

// TestHookTimeSyncInventoryMatrix exercises the time-sync wait loop, the
// external-datastore preflight, and the endpoint/inventory helper branches
// that only surface during cold bootstraps and network jitter.
func TestHookTimeSyncInventoryMatrix(t *testing.T) {
	t.Run("wait-for-time-sync-strict-and-quorum", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		// Local timedatectl reports synced; over ssh, titan-23 reports
		// unsynced so the quorum path sees one passing and one failing node.
		runner := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(cmdLine, "timedatectl show -p NTPSynchronized"):
				return "yes", nil
			case name == "ssh" && strings.Contains(cmdLine, "timedatectl show -p NTPSynchronized"):
				if strings.Contains(cmdLine, "titan-23") {
					return "no", nil
				}
				return "yes", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchStrict, _ := newHookOrchestrator(t, cfg, runner, runner)
		if err := orchStrict.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"}); err != nil {
			t.Fatalf("strict timesync success expected, got %v", err)
		}
		// A quorum of one must tolerate titan-23 staying unsynced.
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		orchQuorum, _ := newHookOrchestrator(t, cfg, runner, runner)
		if err := orchQuorum.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("quorum timesync success expected, got %v", err)
		}
	})

	t.Run("wait-for-time-sync-timeout-and-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		// Every node, local or remote, reports unsynced so the wait expires.
		runner := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(cmdLine, "timedatectl show -p NTPSynchronized"):
				return "no", nil
			case name == "ssh" && strings.Contains(cmdLine, "timedatectl show -p NTPSynchronized"):
				return "no", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, runner, runner)
		err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict timesync timeout error, got %v", err)
		}
		// A pre-canceled context must surface context.Canceled, not a timeout.
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		if err := orch.TestHookWaitForTimeSync(cancelCtx, []string{"titan-db"}); !errors.Is(err, context.Canceled) {
			t.Fatalf("expected canceled timesync wait, got %v", err)
		}
	})

	t.Run("preflight-external-datastore-noop-branches", func(t *testing.T) {
		// No control planes configured: preflight is a no-op.
		cfg := lifecycleConfig(t)
		cfg.ControlPlanes = nil
		orchNoCP, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orchNoCP.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected nil when no control planes configured, got %v", err)
		}
		// Control plane exists but is not SSH-managed: also a no-op.
		cfgUnmanaged := lifecycleConfig(t)
		cfgUnmanaged.ControlPlanes = []string{"titan-24"}
		cfgUnmanaged.SSHManagedNodes = []string{"titan-db"}
		orchUnmanaged, _ := newHookOrchestrator(t, cfgUnmanaged, nil, nil)
		if err := orchUnmanaged.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected nil when control plane is unmanaged, got %v", err)
		}
		// Failure to inspect the k3s unit must not fail the preflight.
		cfgInspectErr := lifecycleConfig(t)
		runInspectErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmdLine, "systemctl cat k3s") {
				return "", errors.New("permission denied")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchInspectErr, _ := newHookOrchestrator(t, cfgInspectErr, runInspectErr, runInspectErr)
		if err := orchInspectErr.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected nil when k3s unit inspect fails, got %v", err)
		}
		// An unparseable datastore endpoint is skipped rather than fatal.
		cfgBadEndpoint := lifecycleConfig(t)
		runBadEndpoint := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmdLine, "systemctl cat k3s") {
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=::bad-endpoint::", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchBadEndpoint, _ := newHookOrchestrator(t, cfgBadEndpoint, runBadEndpoint, runBadEndpoint)
		if err := orchBadEndpoint.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected nil when datastore endpoint cannot be parsed, got %v", err)
		}
	})

	t.Run("preflight-external-datastore-reachable-and-recovery", func(t *testing.T) {
		// Reachable branch: point the endpoint at a live local listener.
		reachableLn, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("listen for reachable datastore branch: %v", err)
		}
		reachablePort := reachableLn.Addr().(*net.TCPAddr).Port
		defer reachableLn.Close()
		cfgReachable := lifecycleConfig(t)
		runReachable := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmdLine, "systemctl cat k3s") {
				return fmt.Sprintf(
					"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:%d/k3s",
					reachablePort,
				), nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchReachable, _ := newHookOrchestrator(t, cfgReachable, runReachable, runReachable)
		if err := orchReachable.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected reachable datastore preflight success, got %v", err)
		}
		// Recovery branch: reserve a free port, release it, and only start
		// listening again after the restart command has been issued.
		recoverySeed, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("seed recovery port: %v", err)
		}
		recoveryPort := recoverySeed.Addr().(*net.TCPAddr).Port
		_ = recoverySeed.Close()
		cfgRecovery := lifecycleConfig(t)
		cfgRecovery.SSHManagedNodes = append(cfgRecovery.SSHManagedNodes, "db-postgres")
		cfgRecovery.SSHNodeHosts["db-postgres"] = "127.0.0.1"
		restarted := false
		runRecovery := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(cmdLine, "systemctl cat k3s"):
				return fmt.Sprintf(
					"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:%d/k3s",
					recoveryPort,
				), nil
			case name == "ssh" && strings.Contains(cmdLine, "systemctl restart postgresql"):
				restarted = true
				// Simulate the datastore coming back shortly after the
				// restart and staying up long enough for a re-check to dial.
				go func() {
					time.Sleep(500 * time.Millisecond)
					l, listenErr := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", recoveryPort))
					if listenErr != nil {
						return
					}
					time.Sleep(4 * time.Second)
					_ = l.Close()
				}()
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchRecovery, _ := newHookOrchestrator(t, cfgRecovery, runRecovery, runRecovery)
		if err := orchRecovery.TestHookPreflightExternalDatastore(context.Background()); err != nil {
			t.Fatalf("expected datastore recovery branch success, got %v", err)
		}
		if !restarted {
			t.Fatalf("expected datastore restart branch to run")
		}
	})

	t.Run("preflight-external-datastore-canceled", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Port 1 is never listening; the pre-canceled context must stop the
		// preflight with context.Canceled rather than a dial failure.
		runner := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmdLine := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmdLine, "systemctl cat k3s") {
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, runner, runner)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		if err := orch.TestHookPreflightExternalDatastore(cancelCtx); !errors.Is(err, context.Canceled) {
			t.Fatalf("expected canceled datastore preflight, got %v", err)
		}
	})

	t.Run("helpers-parse-endpoint-node-lookup-and-inventory-validate", func(t *testing.T) {
		// Endpoint parsing: quoted flag, line-continuation layout, absent flag.
		if got := cluster.TestHookParseDatastoreEndpoint(`ExecStart=/usr/local/bin/k3s server --datastore-endpoint="postgres://db.internal:5432/k3s"`); got == "" {
			t.Fatalf("expected regex endpoint parse to return value")
		}
		if got := cluster.TestHookParseDatastoreEndpoint("ExecStart=/usr/local/bin/k3s server \\\n --datastore-endpoint=postgres://db.internal:5432/k3s \\\n"); got == "" {
			t.Fatalf("expected line-scan endpoint parse to return value")
		}
		if got := cluster.TestHookParseDatastoreEndpoint("ExecStart=/usr/local/bin/k3s server"); got != "" {
			t.Fatalf("expected empty parse result for missing endpoint, got %q", got)
		}
		// Node lookup: direct name, reverse host resolution, empty input.
		cfg := lifecycleConfig(t)
		cfg.SSHNodeHosts["db-postgres"] = "10.42.0.10"
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if got := orch.TestHookNodeNameForHost("db-postgres"); got != "db-postgres" {
			t.Fatalf("expected direct node lookup, got %q", got)
		}
		if got := orch.TestHookNodeNameForHost("10.42.0.10"); got != "db-postgres" {
			t.Fatalf("expected reverse host lookup to resolve db-postgres, got %q", got)
		}
		if got := orch.TestHookNodeNameForHost(""); got != "" {
			t.Fatalf("expected empty host lookup to be empty, got %q", got)
		}
		// Inventory validation: bad port, bad host, and bad user all reported
		// in a single aggregated error.
		cfgBad := lifecycleConfig(t)
		cfgBad.SSHPort = 70000
		cfgBad.Workers = []string{"titan-23", "titan-24"}
		cfgBad.SSHManagedNodes = []string{"titan-db", "titan-23"}
		cfgBad.SSHNodeHosts["titan-24"] = "bad/host"
		cfgBad.SSHNodeUsers = map[string]string{}
		cfgBad.SSHNodeUsers["titan-24"] = "bad@user"
		orchBad, _ := newHookOrchestrator(t, cfgBad, nil, nil)
		err := orchBad.TestHookValidateNodeInventory()
		if err == nil {
			t.Fatalf("expected invalid inventory error")
		}
		if !strings.Contains(err.Error(), "ssh_port") || !strings.Contains(err.Error(), "invalid ssh host") || !strings.Contains(err.Error(), "invalid ssh user") {
			t.Fatalf("unexpected inventory validation error: %v", err)
		}
	})
}