ananke/internal/cluster/orchestrator_unit_additional_test.go

999 lines
44 KiB
Go

package cluster
import (
"context"
"errors"
"io"
"log"
"net"
"path/filepath"
"strings"
"sync"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// commandStub pairs a command matcher with the canned result that the stubbed
// dispatcher returns when the matcher accepts a command.
type commandStub struct {
// match reports whether this stub applies to the command name and arguments.
match func(name string, args []string) bool
// out is the output returned for a matching command.
out string
// err is the error returned alongside out for a matching command.
err error
}
// buildOrchestratorWithStubs builds an Orchestrator whose commands are answered by stubs.
// Signature: buildOrchestratorWithStubs(t *testing.T, cfg config.Config, stubs []commandStub) *Orchestrator.
// Why: helper centralizes deterministic command dispatch for fast, isolated unit tests.
//
// Empty State.Dir, State.ReportsDir and State.RunHistoryPath values are filled
// in from a per-test temporary directory. The regular and the sensitive run
// overrides share one dispatcher: it returns the out/err of the first stub
// whose match accepts the command, and "" with a nil error when none does.
func buildOrchestratorWithStubs(t *testing.T, cfg config.Config, stubs []commandStub) *Orchestrator {
	t.Helper()

	// cfg is a local copy, so defaults can be written through a field pointer.
	st := &cfg.State
	if st.Dir == "" {
		st.Dir = t.TempDir()
	}
	if st.ReportsDir == "" {
		st.ReportsDir = filepath.Join(st.Dir, "reports")
	}
	if st.RunHistoryPath == "" {
		st.RunHistoryPath = filepath.Join(st.Dir, "runs.json")
	}

	// Stubs are consulted in declaration order; the first match wins.
	lookup := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		for i := range stubs {
			if stubs[i].match(name, args) {
				return stubs[i].out, stubs[i].err
			}
		}
		return "", nil
	}

	built := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(cfg.State.RunHistoryPath),
		log:    log.New(io.Discard, "", 0),
	}
	built.runOverride = lookup
	built.runSensitiveOverride = lookup
	return built
}
// matchContains returns a matcher for a command name plus required argument substrings.
// Signature: matchContains(cmd string, parts ...string) func(string, []string) bool.
// Why: concise substring matching keeps command stubs readable across many tests.
//
// The returned matcher accepts a command only when its name equals cmd and
// every element of parts occurs as a substring of the arguments joined with
// single spaces. With no parts, any command named cmd matches.
func matchContains(cmd string, parts ...string) func(string, []string) bool {
	return func(name string, args []string) bool {
		if name != cmd {
			return false
		}
		line := strings.Join(args, " ")
		for i := 0; i < len(parts); i++ {
			if strings.Contains(line, parts[i]) {
				continue
			}
			return false
		}
		return true
	}
}
// TestStartupEarlyFailureLeavesFluxSuspensionUnchanged checks that a startup which
// fails on required node labels never issues a command containing `"suspend":false`.
// Signature: TestStartupEarlyFailureLeavesFluxSuspensionUnchanged(t *testing.T).
// Why: recovery must not release Flux when bootstrap fails before storage and
// critical workloads are ready, or Flux can re-create the same dependency loop.
func TestStartupEarlyFailureLeavesFluxSuspensionUnchanged(t *testing.T) {
	stateDir := t.TempDir()

	startup := config.Startup{
		APIWaitSeconds:              1,
		APIPollSeconds:              1,
		RequireNodeInventoryReach:   false,
		RequireTimeSync:             false,
		RequireNodeSSHAuth:          false,
		ReconcileAccessOnBoot:       false,
		AutoEtcdRestoreOnAPIFailure: false,
		RequiredNodeLabels: map[string]map[string]string{
			"titan-missing": {
				"node-role.kubernetes.io/worker": "true",
			},
		},
	}
	cfg := config.Config{
		SSHPort: 2277,
		Startup: startup,
		State: config.State{
			Dir:            stateDir,
			ReportsDir:     filepath.Join(stateDir, "reports"),
			RunHistoryPath: filepath.Join(stateDir, "runs.json"),
			LockPath:       filepath.Join(stateDir, "ananke.lock"),
			IntentPath:     filepath.Join(stateDir, "intent.json"),
		},
	}

	var (
		mu   sync.Mutex
		seen []string
	)
	// The recorder sits first in the stub list and never matches, so it logs
	// every dispatched command while letting later stubs answer it.
	recorder := func(name string, args []string) bool {
		line := name + " " + strings.Join(args, " ")
		mu.Lock()
		seen = append(seen, line)
		mu.Unlock()
		return false
	}

	orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
		{match: recorder},
		{match: matchContains("kubectl", "version", "--request-timeout=5s"), out: "ok"},
		{match: matchContains("kubectl", "-n", "vault", "get", "pod", "vault-0"), out: "Pending"},
		{
			match: matchContains("kubectl", "label", "node", "titan-missing"),
			err:   errors.New(`nodes "titan-missing" not found`),
		},
	})

	err := orch.Startup(context.Background(), StartupOptions{Reason: "test early failure"})
	if err == nil {
		t.Fatalf("expected startup to fail before flux resume")
	}
	if msg := err.Error(); !strings.Contains(msg, "ensure required node labels on titan-missing") {
		t.Fatalf("expected required-label failure, got: %v", err)
	}

	mu.Lock()
	defer mu.Unlock()
	for i := range seen {
		if strings.Contains(seen[i], `"suspend":false`) {
			t.Fatalf("early failed startup unexpectedly resumed flux via call: %s", seen[i])
		}
	}
}
// TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T).
// Why: Pending Longhorn-backed pods on Longhorn-unready nodes should be
// rescheduled without mutating Longhorn volume, replica, or disk objects.
func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T) {
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[{"metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server-0","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"StatefulSet","name":"victoria-metrics-single-server"}]},"spec":{"nodeName":"titan-0b"},"status":{"phase":"Pending"}}]}`
events := `{"items":[{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"victoria-metrics-single-server-0"},"type":"Warning","reason":"FailedAttachVolume","message":"AttachVolume.Attach failed for volume \"pvc-1\" : rpc error from [http://longhorn-backend:9500/v1/volumes/pvc-1?action=attach]: unable to attach volume pvc-1 to titan-0b: node titan-0b is not ready","lastTimestamp":"` + lastSeen + `"}]}`
deleted := false
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-0b\tFalse\n"},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-0b"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "monitoring", "delete", "pod", "victoria-metrics-single-server-0", "--wait=false")(name, args) {
return false
}
deleted = true
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if !deleted {
t.Fatalf("expected longhorn attach-blocked pending pod to be recycled")
}
}
// TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T).
// Why: encrypted Longhorn PVC recovery should repair missing host cryptsetup and
// then recycle the blocked pod without touching Longhorn data-plane objects.
func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T) {
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
installed := false
deleted := false
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
{
match: func(name string, args []string) bool {
if name != "ssh" || !strings.Contains(strings.Join(args, " "), "apt-get install -y --no-install-recommends cryptsetup-bin") {
return false
}
installed = true
return true
},
out: "__ANANKE_CRYPTSETUP_INSTALLED__",
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
return false
}
deleted = true
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if !installed {
t.Fatalf("expected missing host cryptsetup to be installed")
}
if !deleted {
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
}
}
// TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T).
// Why: when host package repair is blocked by sudo policy, Ananke should avoid
// the bad node and retry the controller-owned pod elsewhere.
func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T) {
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
cordoned := false
deleted := false
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
{
match: matchContains("ssh", "apt-get install -y --no-install-recommends cryptsetup-bin"),
err: errors.New("sudo: a password is required"),
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "cordon", "titan-19")(name, args) {
return false
}
cordoned = true
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
return false
}
deleted = true
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if !cordoned {
t.Fatalf("expected cryptsetup-missing node to be cordoned")
}
if !deleted {
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
}
}
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
// Why: post-outage controller pods can remain Unknown or Failed after their
// node recovers; deletion clears stale status while force deletion stays away
// from PVC-backed storage.
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
pods := `{"items":[` +
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` +
`{"metadata":{"namespace":"logging","name":"oauth2-proxy-terminating","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy-logs"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"secret"}]},"status":{"phase":"Running"}},` +
`{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` +
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
`{"metadata":{"namespace":"default","name":"bare-pod","creationTimestamp":"` + old + `"},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}}]}`
deleted := []string{}
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-12\tTrue\ntitan-22\tTrue\n"},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: `{"items":[]}`},
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-12"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-old", "--wait=false")(name, args) {
return false
}
deleted = append(deleted, "longhorn-vault-sync-old")
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-failed", "--wait=false", "--grace-period=0", "--force")(name, args) {
return false
}
deleted = append(deleted, "longhorn-vault-sync-failed")
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "logging", "delete", "pod", "oauth2-proxy-terminating", "--wait=false", "--grace-period=0", "--force")(name, args) {
return false
}
deleted = append(deleted, "oauth2-proxy-terminating")
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) {
return false
}
if strings.Contains(strings.Join(args, " "), "--force") {
t.Fatalf("pvc-backed stale pod must not be force deleted")
}
deleted = append(deleted, "pvc-backed-failed")
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,oauth2-proxy-terminating,pvc-backed-failed" {
t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted)
}
}
// TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T).
// Why: a Ready node with a wedged container runtime can trap replacement pods
// indefinitely; startup should cordon that scheduler target without draining it
// or touching Longhorn data-plane objects.
func TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T) {
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[` +
`{"metadata":{"namespace":"logging","name":"oauth2-proxy-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"oauth2-proxy","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
`{"metadata":{"namespace":"monitoring","name":"suite-probe-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"suite-probe"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"probe","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
`{"metadata":{"namespace":"sso","name":"secret-ensure-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"secret-ensure"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"init","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
`{"metadata":{"namespace":"finance","name":"single-node-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"single"}]},"spec":{"nodeName":"titan-19","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CreateContainerError"}}}]}}]}`
events := `{"items":[` +
`{"metadata":{"namespace":"logging","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"logging","name":"oauth2-proxy-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{oauth2-proxy}: Error: failed to reserve container name oauth2-proxy_logging","lastTimestamp":"` + lastSeen + `"},` +
`{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"suite-probe-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{probe}: Error: context deadline exceeded","lastTimestamp":"` + lastSeen + `"},` +
`{"metadata":{"namespace":"sso","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"sso","name":"secret-ensure-bad"},"type":"Warning","reason":"Failed","message":"spec.initContainers{init}: Error: failed to reserve container name init_sso","lastTimestamp":"` + lastSeen + `"},` +
`{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"single-node-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{app}: Error: failed to reserve container name app_finance","lastTimestamp":"` + lastSeen + `"}]}`
cordoned := []string{}
deleted := []string{}
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: ""},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "cordon")(name, args) {
return false
}
cordoned = append(cordoned, args[len(args)-1])
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "delete", "pod", "--wait=false")(name, args) {
return false
}
joined := strings.Join(args, " ")
if strings.Contains(joined, "--force") {
t.Fatalf("container-runtime wedge recycle must not force-delete fresh pods")
}
if len(args) >= 5 {
deleted = append(deleted, args[4])
}
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if strings.Join(cordoned, ",") != "titan-18" {
t.Fatalf("expected only titan-18 to be cordoned, got %#v", cordoned)
}
if strings.Join(deleted, ",") != "oauth2-proxy-bad,suite-probe-bad,secret-ensure-bad,single-node-bad" {
t.Fatalf("expected runtime-wedged pods to be recycled, got %#v", deleted)
}
}
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
// k3s-agent start, or uncordon operations against intentionally absent hosts.
func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
cfg := config.Config{
Workers: []string{" titan-08 ", "titan-09", "titan-10", "titan-11"},
Startup: config.Startup{
IgnoreUnavailableNodes: []string{"titan-09", "titan-10"},
},
}
orch := buildOrchestratorWithStubs(t, cfg, nil)
got, err := orch.effectiveWorkers(context.Background())
if err != nil {
t.Fatalf("effectiveWorkers failed: %v", err)
}
want := []string{"titan-08", "titan-11"}
if strings.Join(got, ",") != strings.Join(want, ",") {
t.Fatalf("effectiveWorkers mismatch got=%v want=%v", got, want)
}
}
// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers runs one orchestration or CLI step.
// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
// mount deadlock.
func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
cordoned := []string{}
orch := buildOrchestratorWithStubs(t, config.Config{
SSHManagedNodes: []string{"titan-04", "titan-19"},
}, []commandStub{
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-19\ntitan-23\n"},
{
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
out: "__ANANKE_CRYPTSETUP_PRESENT__",
},
{
match: matchContains("ssh", "titan-19", "apt-get install -y --no-install-recommends cryptsetup-bin"),
err: errors.New("sudo: a password is required"),
},
{
match: func(name string, args []string) bool {
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
return false
}
if len(args) > 1 {
cordoned = append(cordoned, args[len(args)-1])
}
return true
},
},
})
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04", "titan-19", "titan-20"})
if err != nil {
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
}
want := []string{"titan-04", "titan-20"}
if strings.Join(got, ",") != strings.Join(want, ",") {
t.Fatalf("guarded workers mismatch got=%v want=%v", got, want)
}
if strings.Join(cordoned, ",") != "titan-19,titan-23" {
t.Fatalf("expected unsafe longhorn hosts to be cordoned, got %v", cordoned)
}
}
// TestLonghornCryptsetupExemptNodesAreNotQuarantined runs one orchestration or CLI step.
// Signature: TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T).
// Why: Veles/Oceanus uses titan-23 as a Longhorn host for unencrypted local
// volumes; startup should uncordon that policy-exempt node without requiring
// host SSH or weakening encrypted-volume safety on other workers.
func TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T) {
cordoned := []string{}
uncordoned := []string{}
sshTitan23 := false
orch := buildOrchestratorWithStubs(t, config.Config{
SSHManagedNodes: []string{"titan-04"},
Startup: config.Startup{
LonghornCryptsetupExemptNodes: []string{"titan-23"},
},
}, []commandStub{
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-23\n"},
{
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
out: "__ANANKE_CRYPTSETUP_PRESENT__",
},
{
match: func(name string, args []string) bool {
if name == "ssh" && strings.Contains(strings.Join(args, " "), "titan-23") {
sshTitan23 = true
return true
}
return false
},
},
{
match: func(name string, args []string) bool {
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
return false
}
if len(args) > 1 {
cordoned = append(cordoned, args[len(args)-1])
}
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "uncordon")(name, args) {
return false
}
if len(args) > 1 {
uncordoned = append(uncordoned, args[len(args)-1])
}
return true
},
},
})
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04"})
if err != nil {
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
}
if strings.Join(got, ",") != "titan-04" {
t.Fatalf("guarded workers mismatch got=%v", got)
}
if err := orch.uncordonLonghornCryptsetupExemptNodes(context.Background()); err != nil {
t.Fatalf("uncordonLonghornCryptsetupExemptNodes failed: %v", err)
}
if sshTitan23 {
t.Fatalf("did not expect cryptsetup SSH check for exempt titan-23")
}
if len(cordoned) != 0 {
t.Fatalf("did not expect exempt node to be cordoned, got %v", cordoned)
}
if strings.Join(uncordoned, ",") != "titan-23" {
t.Fatalf("expected exempt titan-23 to be uncordoned, got %v", uncordoned)
}
}
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
// Why: bootstrap caches or minimal test clusters can lack live labels; the
// static startup inventory should still protect configured storage workers.
func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{
RequiredNodeLabels: map[string]map[string]string{
"titan-04": {"longhorn-host": "true"},
"titan-20": {"node-role.kubernetes.io/worker": "true"},
},
},
}, []commandStub{
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""},
})
got, err := orch.longhornHostNodes(context.Background())
if err != nil {
t.Fatalf("longhornHostNodes failed: %v", err)
}
if _, ok := got["titan-04"]; !ok || len(got) != 1 {
t.Fatalf("expected configured longhorn host fallback, got %v", got)
}
}
// TestNewConstructsOrchestrator checks that New returns a non-nil Orchestrator
// holding the runner and store it was given.
// Signature: TestNewConstructsOrchestrator(t *testing.T).
// Why: covers constructor path in orchestrator core module.
func TestNewConstructsOrchestrator(t *testing.T) {
	historyPath := filepath.Join(t.TempDir(), "runs.json")
	cfg := config.Config{State: config.State{RunHistoryPath: historyPath}}
	runner := &execx.Runner{}
	store := state.New(cfg.State.RunHistoryPath)
	logger := log.New(io.Discard, "", 0)

	orch := New(cfg, runner, store, logger)
	// Cases are evaluated in order, so the nil check guards the field reads.
	switch {
	case orch == nil, orch.runner != runner, orch.store != store:
		t.Fatalf("constructor returned unexpected orchestrator: %#v", orch)
	}
}
// TestParseSnapshotPathFromEtcdSnapshotList runs one orchestration or CLI step.
// Signature: TestParseSnapshotPathFromEtcdSnapshotList(t *testing.T).
// Why: covers snapshot-path parser branches including header skip and no-match.
func TestParseSnapshotPathFromEtcdSnapshotList(t *testing.T) {
out := strings.Join([]string{
"Name Size Created Location",
`pre-shutdown 4.2M now "file:///var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"`,
}, "\n")
got := parseSnapshotPathFromEtcdSnapshotList(out)
if got != "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" {
t.Fatalf("unexpected snapshot path: %q", got)
}
if parseSnapshotPathFromEtcdSnapshotList("no snapshots") != "" {
t.Fatalf("expected no snapshot path")
}
}
// TestFluxSourceHelpers runs one orchestration or CLI step.
// Signature: TestFluxSourceHelpers(t *testing.T).
// Why: covers flux source readiness/guard/branch patch helper flows.
func TestFluxSourceHelpers(t *testing.T) {
cfg := config.Config{
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{match: matchContains("kubectl", "jsonpath={.status.conditions"), out: "True"},
{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"},
{match: matchContains("kubectl", "jsonpath={.spec.ref.branch}"), out: "main"},
{match: matchContains("kubectl", "patch", "gitrepository"), out: ""},
})
ready, err := orch.fluxSourceReady(context.Background())
if err != nil || !ready {
t.Fatalf("expected flux source ready, got ready=%v err=%v", ready, err)
}
if err := orch.guardFluxSourceDrift(context.Background(), "main", false); err != nil {
t.Fatalf("guardFluxSourceDrift failed: %v", err)
}
if err := orch.ensureFluxBranch(context.Background(), "main", false); err != nil {
t.Fatalf("ensureFluxBranch no-op failed: %v", err)
}
if got := normalizeGitURL(" SSH://Git@Host/Repo.git/ "); got != "ssh://git@host/repo" {
t.Fatalf("unexpected normalized url: %q", got)
}
}
// TestCoordinationHelpers checks intentAge, intentFresh, shellQuote and that
// coordinationPeers yields two peers for the configured hosts.
// Signature: TestCoordinationHelpers(t *testing.T).
// Why: covers intent-age helpers, shell quoting, and peer selection logic.
func TestCoordinationHelpers(t *testing.T) {
	tenSecondsOld := state.Intent{UpdatedAt: time.Now().Add(-10 * time.Second)}
	if age := intentAge(tenSecondsOld); age <= 0 {
		t.Fatalf("expected positive age")
	}

	if fresh := intentFresh(state.Intent{}, time.Second); !fresh {
		t.Fatalf("zero timestamp should be fresh")
	}

	if quoted := shellQuote("a'b"); quoted != `'a'"'"'b'` {
		t.Fatalf("unexpected shell quote output")
	}

	// titan-24 is listed twice and titan-db is also the forward-shutdown host.
	coordination := config.Coordination{
		PeerHosts:           []string{"titan-24", "titan-24", "titan-db"},
		ForwardShutdownHost: "titan-db",
	}
	orch := buildOrchestratorWithStubs(t, config.Config{Coordination: coordination}, nil)

	peers := orch.coordinationPeers()
	if len(peers) != 2 {
		t.Fatalf("expected deduped peers, got %v", peers)
	}
}
// TestVerifyEtcdSnapshotAndRunSudoK3S runs one orchestration or CLI step.
// Signature: TestVerifyEtcdSnapshotAndRunSudoK3S(t *testing.T).
// Why: covers k3s command fallback and snapshot verification happy path.
func TestVerifyEtcdSnapshotAndRunSudoK3S(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("ssh", "stat -c %s"), out: "2097152"},
{match: matchContains("ssh", "k3s etcd-snapshot ls"), out: "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"},
{match: matchContains("ssh", "sha256sum"), out: strings.Repeat("a", 64)},
})
if err := orch.verifyEtcdSnapshot(context.Background(), "titan-0a", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"); err != nil {
t.Fatalf("verifyEtcdSnapshot failed: %v", err)
}
}
// TestScalingHelpers runs one orchestration or CLI step.
// Signature: TestScalingHelpers(t *testing.T).
// Why: covers workload discovery, snapshot IO, and scale command orchestration.
func TestScalingHelpers(t *testing.T) {
cfg := config.Config{
ExcludedNamespaces: []string{"kube-system"},
State: config.State{Dir: t.TempDir()},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{
match: matchContains("kubectl", "get deployment", "jsonpath"),
out: strings.Join([]string{
"default\tgrafana\t1",
"kube-system\tcoredns\t2",
"",
}, "\n"),
},
{
match: matchContains("kubectl", "get statefulset", "jsonpath"),
out: "vault\tvault\t1\n",
},
{match: matchContains("kubectl", "scale", "deployment", "grafana"), out: ""},
{match: matchContains("kubectl", "scale", "statefulset", "vault"), out: ""},
})
entries, err := orch.listScalableWorkloads(context.Background())
if err != nil {
t.Fatalf("listScalableWorkloads failed: %v", err)
}
if len(entries) != 2 {
t.Fatalf("expected 2 scalable entries, got %d (%v)", len(entries), entries)
}
if err := orch.writeScaledWorkloadSnapshot(entries); err != nil {
t.Fatalf("writeScaledWorkloadSnapshot failed: %v", err)
}
snapshot, err := orch.readScaledWorkloadSnapshot()
if err != nil || snapshot == nil || len(snapshot.Entries) != 2 {
t.Fatalf("readScaledWorkloadSnapshot failed snapshot=%v err=%v", snapshot, err)
}
if err := orch.scaleWorkloads(context.Background(), entries, -1, 2); err != nil {
t.Fatalf("scaleWorkloads failed: %v", err)
}
}
// TestStorageReadyAndWorkloadHelpers expects storageReady to succeed for one ready node and a Bound PVC.
// Signature: TestStorageReadyAndWorkloadHelpers(t *testing.T).
// Why: covers storage readiness checks and workload helper utilities.
func TestStorageReadyAndWorkloadHelpers(t *testing.T) {
	// Require a single ready storage node and one critical PVC.
	startup := config.Startup{
		StorageMinReadyNodes: 1,
		StorageCriticalPVCs:  []string{"vault/data-vault-0"},
	}
	// kubectl reports one longhorn node line and a Bound claim.
	kubectlStubs := []commandStub{
		{match: matchContains("kubectl", "nodes.longhorn.io"), out: "titan-23:True:True\n"},
		{match: matchContains("kubectl", "get pvc data-vault-0"), out: "Bound"},
	}
	orch := buildOrchestratorWithStubs(t, config.Config{Startup: startup}, kubectlStubs)

	ready, reason, err := orch.storageReady(context.Background())
	if !(err == nil && ready) {
		t.Fatalf("expected storageReady true, got ok=%v reason=%q err=%v", ready, reason, err)
	}
}
// TestIngressAndServiceHelpers runs one orchestration or CLI step.
// Signature: TestIngressAndServiceHelpers(t *testing.T).
// Why: covers ingress host discovery helpers and URL parsing helpers.
func TestIngressAndServiceHelpers(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
IngressChecklistIgnoreHosts: []string{"ignore.bstein.dev"},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{match: matchContains("kubectl", "get ingress", "-A", "-o", "json"), out: `{"items":[{"metadata":{"namespace":"gitea"},"spec":{"rules":[{"host":"scm.bstein.dev"}]}},{"metadata":{"namespace":"x"},"spec":{"rules":[{"host":"ignore.bstein.dev"}]}}]}`},
})
hosts, err := orch.discoverIngressHosts(context.Background())
if err != nil || len(hosts) != 1 || hosts[0] != "scm.bstein.dev" {
t.Fatalf("discoverIngressHosts unexpected hosts=%v err=%v", hosts, err)
}
if got := hostFromURL("https://metrics.bstein.dev/api/health"); got != "metrics.bstein.dev" {
t.Fatalf("unexpected hostFromURL value: %q", got)
}
if !isLikelyHostname("metrics.bstein.dev") || isLikelyHostname("bad path/value") {
t.Fatalf("isLikelyHostname classification mismatch")
}
}
// TestWorkloadConvergenceHelpers checks desiredReady, podControllerOwned, and stuckContainerReason.
// Signature: TestWorkloadConvergenceHelpers(t *testing.T).
// Why: covers controller readiness helpers and stuck-pod heuristics.
func TestWorkloadConvergenceHelpers(t *testing.T) {
	// A deployment asking for two replicas with one ready.
	var wantReplicas int32 = 2
	workload := workloadResource{Kind: "deployment"}
	workload.Spec.Replicas = &wantReplicas
	workload.Status.ReadyReplicas = 1

	desired, ready, ok := desiredReady(workload)
	if !(ok && desired == 2 && ready == 1) {
		t.Fatalf("desiredReady mismatch desired=%d ready=%d ok=%v", desired, ready, ok)
	}

	// A pod with a ReplicaSet owner reference counts as controller-owned.
	var pod podResource
	pod.Metadata.OwnerReferences = []ownerReference{{Kind: "ReplicaSet"}}
	if owned := podControllerOwned(pod); !owned {
		t.Fatalf("expected podControllerOwned=true")
	}

	// A waiting container whose reason is in the supplied set is reported as stuck.
	waiting := &podContainerWaitingState{Reason: "CrashLoopBackOff"}
	pod.Status.ContainerStatuses = []podContainerStatus{
		{State: podContainerState{Waiting: waiting}},
	}
	stuckReasons := map[string]struct{}{"CrashLoopBackOff": {}}
	if got := stuckContainerReason(pod, stuckReasons); got != "CrashLoopBackOff" {
		t.Fatalf("unexpected stuck reason: %q", got)
	}
}
// TestDrainAndK3SHelpers runs one orchestration or CLI step.
// Signature: TestDrainAndK3SHelpers(t *testing.T).
// Why: covers node drain diagnostics and k3s snapshot selection flow.
func TestDrainAndK3SHelpers(t *testing.T) {
cfg := config.Config{
SSHManagedNodes: []string{"titan-0a"},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{match: matchContains("kubectl", "get pods", "--field-selector", "spec.nodeName=titan-22"), out: "vault vault-0 Running StatefulSet\n"},
{match: matchContains("ssh", "k3s etcd-snapshot ls"), out: "pre-shutdown /var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"},
})
diag := orch.drainNodeDiagnostics(context.Background(), "titan-22")
if !strings.Contains(diag, "vault/vault-0") {
t.Fatalf("unexpected diagnostics output: %q", diag)
}
snapshot, err := orch.latestEtcdSnapshotPath(context.Background(), "titan-0a")
if err != nil || snapshot == "" {
t.Fatalf("latestEtcdSnapshotPath failed snapshot=%q err=%v", snapshot, err)
}
}
// TestTimesyncAndInventoryHelpers checks inventory assembly, endpoint parsing, time-sync parsing, and TCP probing.
// Signature: TestTimesyncAndInventoryHelpers(t *testing.T).
// Why: covers time sync helpers, datastore endpoint parsing, and inventory assembly.
func TestTimesyncAndInventoryHelpers(t *testing.T) {
	// Node names are spread over several config fields so the inventory has to merge them.
	coordination := config.Coordination{
		PeerHosts:           []string{"titan-24"},
		ForwardShutdownHost: "titan-db",
	}
	orch := buildOrchestratorWithStubs(t, config.Config{
		ControlPlanes:   []string{"titan-0a"},
		Workers:         []string{"titan-22"},
		SSHManagedNodes: []string{"titan-0a", "titan-22"},
		SSHNodeHosts:    map[string]string{"titan-db": "10.0.0.10"},
		Coordination:    coordination,
	}, nil)

	inventory := orch.inventoryNodesForValidation()
	if len(inventory) < 3 {
		t.Fatalf("expected combined inventory nodes, got %v", inventory)
	}

	// A unit-file style ExecStart line carrying --datastore-endpoint must yield a non-empty value.
	endpoint := parseDatastoreEndpoint("ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://x")
	if endpoint == "" {
		t.Fatalf("expected datastore endpoint parse")
	}

	// Upper-case YES is accepted as synced; lower-case no is not.
	if !(isTimeSynced("YES") && !isTimeSynced("no")) {
		t.Fatalf("unexpected isTimeSynced behavior")
	}

	// Open a loopback listener on an OS-assigned port and probe it.
	listener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen failed: %v", err)
	}
	defer listener.Close()

	addr := listener.Addr().String()
	if reachable := orch.tcpReachable(addr, 500*time.Millisecond); !reachable {
		t.Fatalf("expected tcpReachable=true for open listener")
	}
}
// TestShutdownModeValidation checks normalizeShutdownMode on one valid and one invalid input.
// Signature: TestShutdownModeValidation(t *testing.T).
// Why: covers acceptance of the cluster-only mode and the error returned for an unrecognized mode.
func TestShutdownModeValidation(t *testing.T) {
	// The supported mode must come back unchanged and without error.
	mode, err := normalizeShutdownMode("cluster-only")
	if !(err == nil && mode == "cluster-only") {
		t.Fatalf("expected cluster-only mode, got mode=%q err=%v", mode, err)
	}

	// An unknown mode must be rejected.
	_, err = normalizeShutdownMode("bogus")
	if err == nil {
		t.Fatalf("expected invalid mode error")
	}
}
// TestWaitForAPIDryRunShortCircuit runs one orchestration or CLI step.
// Signature: TestWaitForAPIDryRunShortCircuit(t *testing.T).
// Why: covers dry-run short-circuit branch for api readiness wait.
func TestWaitForAPIDryRunShortCircuit(t *testing.T) {
orch := &Orchestrator{runner: &execx.Runner{DryRun: true}}
if err := orch.waitForAPI(context.Background(), 1, time.Millisecond); err != nil {
t.Fatalf("expected dry-run waitForAPI to pass: %v", err)
}
}
// TestGuardFluxSourceDriftMismatch expects guardFluxSourceDrift to fail on a wrong URL and on a wrong branch.
// Signature: TestGuardFluxSourceDriftMismatch(t *testing.T).
// Why: covers url-drift and branch-drift error branches.
func TestGuardFluxSourceDriftMismatch(t *testing.T) {
	const expectedSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
	ctx := context.Background()
	cfg := config.Config{ExpectedFluxSource: expectedSource}

	// Case 1: the reported source URL points at a different repository.
	urlDrift := buildOrchestratorWithStubs(t, cfg, []commandStub{
		{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/wrong.git"},
	})
	if urlDrift.guardFluxSourceDrift(ctx, "main", false) == nil {
		t.Fatalf("expected guardFluxSourceDrift mismatch error")
	}

	// Case 2: the URL matches but the reported branch is not the requested one.
	branchDrift := buildOrchestratorWithStubs(t, cfg, []commandStub{
		{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: expectedSource},
		{match: matchContains("kubectl", "jsonpath={.spec.ref.branch}"), out: "atlasbot"},
	})
	if branchDrift.guardFluxSourceDrift(ctx, "main", false) == nil {
		t.Fatalf("expected branch drift error")
	}
}
// TestRunSudoK3SFailsWhenAllCandidatesFail runs one orchestration or CLI step.
// Signature: TestRunSudoK3SFailsWhenAllCandidatesFail(t *testing.T).
// Why: covers fallback failure return in runSudoK3S.
func TestRunSudoK3SFailsWhenAllCandidatesFail(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("ssh", "k3s"), err: errors.New("no binary")},
})
if _, err := orch.runSudoK3S(context.Background(), "titan-0a", "server"); err == nil {
t.Fatalf("expected runSudoK3S failure when all candidates fail")
}
}
// TestCriticalEndpointHelpers runs one orchestration or CLI step.
// Signature: TestCriticalEndpointHelpers(t *testing.T).
// Why: covers critical endpoint parsing and readiness checks that gate startup completion.
func TestCriticalEndpointHelpers(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
CriticalServiceEndpoints: []string{"monitoring/victoria-metrics-single-server"},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{match: matchContains("kubectl", "get endpoints victoria-metrics-single-server"), out: "10.42.0.10\n10.42.0.11\n"},
})
ok, detail, ns, svc, err := orch.criticalServiceEndpointsReady(context.Background())
if err != nil || !ok {
t.Fatalf("expected criticalServiceEndpointsReady success, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
}
if detail != "services=1" {
t.Fatalf("unexpected readiness detail: %q", detail)
}
gotNS, gotSvc, err := parseCriticalServiceEndpoint("monitoring/victoria-metrics-single-server")
if err != nil || gotNS != "monitoring" || gotSvc != "victoria-metrics-single-server" {
t.Fatalf("unexpected parse result ns=%q svc=%q err=%v", gotNS, gotSvc, err)
}
if _, _, err := parseCriticalServiceEndpoint("invalid"); err == nil {
t.Fatalf("expected parseCriticalServiceEndpoint error")
}
}
// TestCriticalEndpointAutoHealWorkflow expects the endpoint wait to succeed once a later poll returns an address.
// Signature: TestCriticalEndpointAutoHealWorkflow(t *testing.T).
// Why: covers endpoint-zero recovery where startup heals workload replicas before succeeding.
func TestCriticalEndpointAutoHealWorkflow(t *testing.T) {
	// Each state path lives under its own temporary directory.
	stateDir := t.TempDir()
	reportsDir := filepath.Join(t.TempDir(), "reports")
	historyPath := filepath.Join(t.TempDir(), "runs.json")

	cfg := config.Config{
		Startup: config.Startup{
			CriticalServiceEndpointWaitSec: 2,
			CriticalServiceEndpointPollSec: 1,
			CriticalServiceEndpoints:       []string{"monitoring/victoria-metrics-single-server"},
		},
		State: config.State{
			Dir:            stateDir,
			ReportsDir:     reportsDir,
			RunHistoryPath: historyPath,
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(cfg.State.RunHistoryPath),
		log:    log.New(io.Discard, "", 0),
	}

	// Counts how many times the endpoints query has been issued.
	endpointChecks := 0
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		joined := name + " " + strings.Join(args, " ")
		switch {
		case strings.Contains(joined, "get endpoints victoria-metrics-single-server"):
			// First query reports no addresses; every later query reports one.
			endpointChecks++
			if endpointChecks == 1 {
				return "", nil
			}
			return "10.42.0.10\n", nil
		case strings.Contains(joined, "scale deployment victoria-metrics-single-server"):
			// Scaling as a deployment fails with a NotFound error.
			return "", errors.New(`Error from server (NotFound): deployments.apps "victoria-metrics-single-server" not found`)
		case strings.Contains(joined, "scale statefulset victoria-metrics-single-server"):
			// Scaling as a statefulset succeeds.
			return "", nil
		case strings.Contains(joined, "rollout status statefulset/victoria-metrics-single-server"):
			return "statefulset rolled out", nil
		default:
			return "", nil
		}
	}
	orch.runOverride = dispatch
	orch.runSensitiveOverride = dispatch

	if err := orch.waitForCriticalServiceEndpoints(context.Background()); err != nil {
		t.Fatalf("waitForCriticalServiceEndpoints failed: %v", err)
	}
	// The empty first reply means success requires at least one more query.
	if endpointChecks < 2 {
		t.Fatalf("expected repeated endpoint checks, got %d", endpointChecks)
	}
}