924 lines
39 KiB
Go
924 lines
39 KiB
Go
package cluster
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"io"
|
|
"log"
|
|
"net"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
|
)
|
|
|
|
// commandStub pairs a command matcher with the canned result returned for it.
// Signature: commandStub{match, out, err}.
// Why: tests declare command outcomes as data instead of running real binaries.
type commandStub struct {
	// match reports whether this stub handles the given command name and arguments.
	match func(string, []string) bool
	// out is the output handed back when match accepts the command.
	out string
	// err is the error handed back when match accepts the command.
	err error
}
|
|
|
|
// buildOrchestratorWithStubs runs one orchestration or CLI step.
|
|
// Signature: buildOrchestratorWithStubs(t *testing.T, cfg config.Config, stubs []commandStub) *Orchestrator.
|
|
// Why: helper centralizes deterministic command dispatch for fast, isolated unit tests.
|
|
func buildOrchestratorWithStubs(t *testing.T, cfg config.Config, stubs []commandStub) *Orchestrator {
|
|
t.Helper()
|
|
if cfg.State.Dir == "" {
|
|
cfg.State.Dir = t.TempDir()
|
|
}
|
|
if cfg.State.ReportsDir == "" {
|
|
cfg.State.ReportsDir = filepath.Join(cfg.State.Dir, "reports")
|
|
}
|
|
if cfg.State.RunHistoryPath == "" {
|
|
cfg.State.RunHistoryPath = filepath.Join(cfg.State.Dir, "runs.json")
|
|
}
|
|
orch := &Orchestrator{
|
|
cfg: cfg,
|
|
runner: &execx.Runner{},
|
|
store: state.New(cfg.State.RunHistoryPath),
|
|
log: log.New(io.Discard, "", 0),
|
|
}
|
|
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
for _, stub := range stubs {
|
|
if stub.match(name, args) {
|
|
return stub.out, stub.err
|
|
}
|
|
}
|
|
return "", nil
|
|
}
|
|
orch.runOverride = dispatch
|
|
orch.runSensitiveOverride = dispatch
|
|
return orch
|
|
}
|
|
|
|
// matchContains returns a matcher for one command name and a set of argument fragments.
// Signature: matchContains(cmd string, parts ...string) func(string, []string) bool.
// Why: concise substring matching keeps command stubs readable across many tests.
//
// The returned function accepts a command only when its name equals cmd and
// every element of parts occurs as a substring of the arguments joined with
// single spaces, so a fragment such as "get pods" may span adjacent arguments.
func matchContains(cmd string, parts ...string) func(string, []string) bool {
	// containsEvery reports whether each required fragment occurs in line.
	containsEvery := func(line string) bool {
		for i := 0; i < len(parts); i++ {
			if !strings.Contains(line, parts[i]) {
				return false
			}
		}
		return true
	}
	return func(name string, args []string) bool {
		return name == cmd && containsEvery(strings.Join(args, " "))
	}
}
|
|
|
|
// TestStartupEarlyFailureLeavesFluxSuspensionUnchanged runs one orchestration or CLI step.
|
|
// Signature: TestStartupEarlyFailureLeavesFluxSuspensionUnchanged(t *testing.T).
|
|
// Why: recovery must not release Flux when bootstrap fails before storage and
|
|
// critical workloads are ready, or Flux can re-create the same dependency loop.
|
|
func TestStartupEarlyFailureLeavesFluxSuspensionUnchanged(t *testing.T) {
|
|
tmpDir := t.TempDir()
|
|
cfg := config.Config{
|
|
SSHPort: 2277,
|
|
Startup: config.Startup{
|
|
APIWaitSeconds: 1,
|
|
APIPollSeconds: 1,
|
|
RequireNodeInventoryReach: false,
|
|
RequireTimeSync: false,
|
|
RequireNodeSSHAuth: false,
|
|
ReconcileAccessOnBoot: false,
|
|
AutoEtcdRestoreOnAPIFailure: false,
|
|
RequiredNodeLabels: map[string]map[string]string{
|
|
"titan-missing": {
|
|
"node-role.kubernetes.io/worker": "true",
|
|
},
|
|
},
|
|
},
|
|
State: config.State{
|
|
Dir: tmpDir,
|
|
ReportsDir: filepath.Join(tmpDir, "reports"),
|
|
RunHistoryPath: filepath.Join(tmpDir, "runs.json"),
|
|
LockPath: filepath.Join(tmpDir, "ananke.lock"),
|
|
IntentPath: filepath.Join(tmpDir, "intent.json"),
|
|
},
|
|
}
|
|
|
|
var mu sync.Mutex
|
|
calls := []string{}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
mu.Lock()
|
|
calls = append(calls, name+" "+strings.Join(args, " "))
|
|
mu.Unlock()
|
|
return false
|
|
},
|
|
},
|
|
{match: matchContains("kubectl", "version", "--request-timeout=5s"), out: "ok"},
|
|
{match: matchContains("kubectl", "-n", "vault", "get", "pod", "vault-0"), out: "Pending"},
|
|
{
|
|
match: matchContains("kubectl", "label", "node", "titan-missing"),
|
|
err: errors.New(`nodes "titan-missing" not found`),
|
|
},
|
|
})
|
|
|
|
err := orch.Startup(context.Background(), StartupOptions{Reason: "test early failure"})
|
|
if err == nil {
|
|
t.Fatalf("expected startup to fail before flux resume")
|
|
}
|
|
if !strings.Contains(err.Error(), "ensure required node labels on titan-missing") {
|
|
t.Fatalf("expected required-label failure, got: %v", err)
|
|
}
|
|
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
for _, call := range calls {
|
|
if strings.Contains(call, `"suspend":false`) {
|
|
t.Fatalf("early failed startup unexpectedly resumed flux via call: %s", call)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods runs one orchestration or CLI step.
|
|
// Signature: TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T).
|
|
// Why: Pending Longhorn-backed pods on Longhorn-unready nodes should be
|
|
// rescheduled without mutating Longhorn volume, replica, or disk objects.
|
|
func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T) {
|
|
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
lastSeen := time.Now().UTC().Format(time.RFC3339)
|
|
pods := `{"items":[{"metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server-0","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"StatefulSet","name":"victoria-metrics-single-server"}]},"spec":{"nodeName":"titan-0b"},"status":{"phase":"Pending"}}]}`
|
|
events := `{"items":[{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"victoria-metrics-single-server-0"},"type":"Warning","reason":"FailedAttachVolume","message":"AttachVolume.Attach failed for volume \"pvc-1\" : rpc error from [http://longhorn-backend:9500/v1/volumes/pvc-1?action=attach]: unable to attach volume pvc-1 to titan-0b: node titan-0b is not ready","lastTimestamp":"` + lastSeen + `"}]}`
|
|
|
|
deleted := false
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
|
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-0b\tFalse\n"},
|
|
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
|
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-0b"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "-n", "monitoring", "delete", "pod", "victoria-metrics-single-server-0", "--wait=false")(name, args) {
|
|
return false
|
|
}
|
|
deleted = true
|
|
return true
|
|
},
|
|
},
|
|
})
|
|
|
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
|
}
|
|
if !deleted {
|
|
t.Fatalf("expected longhorn attach-blocked pending pod to be recycled")
|
|
}
|
|
}
|
|
|
|
// TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup runs one orchestration or CLI step.
|
|
// Signature: TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T).
|
|
// Why: encrypted Longhorn PVC recovery should repair missing host cryptsetup and
|
|
// then recycle the blocked pod without touching Longhorn data-plane objects.
|
|
func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T) {
|
|
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
lastSeen := time.Now().UTC().Format(time.RFC3339)
|
|
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
|
|
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
|
|
|
|
installed := false
|
|
deleted := false
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
|
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
|
|
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
|
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if name != "ssh" || !strings.Contains(strings.Join(args, " "), "apt-get install -y --no-install-recommends cryptsetup-bin") {
|
|
return false
|
|
}
|
|
installed = true
|
|
return true
|
|
},
|
|
out: "__ANANKE_CRYPTSETUP_INSTALLED__",
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
|
|
return false
|
|
}
|
|
deleted = true
|
|
return true
|
|
},
|
|
},
|
|
})
|
|
|
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
|
}
|
|
if !installed {
|
|
t.Fatalf("expected missing host cryptsetup to be installed")
|
|
}
|
|
if !deleted {
|
|
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
|
|
}
|
|
}
|
|
|
|
// TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails runs one orchestration or CLI step.
|
|
// Signature: TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T).
|
|
// Why: when host package repair is blocked by sudo policy, Ananke should avoid
|
|
// the bad node and retry the controller-owned pod elsewhere.
|
|
func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T) {
|
|
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
lastSeen := time.Now().UTC().Format(time.RFC3339)
|
|
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
|
|
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
|
|
|
|
cordoned := false
|
|
deleted := false
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
|
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
|
|
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
|
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
|
{
|
|
match: matchContains("ssh", "apt-get install -y --no-install-recommends cryptsetup-bin"),
|
|
err: errors.New("sudo: a password is required"),
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "cordon", "titan-19")(name, args) {
|
|
return false
|
|
}
|
|
cordoned = true
|
|
return true
|
|
},
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
|
|
return false
|
|
}
|
|
deleted = true
|
|
return true
|
|
},
|
|
},
|
|
})
|
|
|
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
|
}
|
|
if !cordoned {
|
|
t.Fatalf("expected cryptsetup-missing node to be cordoned")
|
|
}
|
|
if !deleted {
|
|
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
|
|
}
|
|
}
|
|
|
|
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
|
|
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
|
|
// Why: post-outage controller pods can remain Unknown or Failed after their
|
|
// node recovers; normal deletion clears stale status without force-deleting or
|
|
// touching storage.
|
|
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
|
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
|
|
pods := `{"items":[` +
|
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` +
|
|
`{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` +
|
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
|
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
|
|
`{"metadata":{"namespace":"default","name":"bare-pod","creationTimestamp":"` + old + `"},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}}]}`
|
|
|
|
deleted := []string{}
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
|
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-12\tTrue\ntitan-22\tTrue\n"},
|
|
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: `{"items":[]}`},
|
|
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-12"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-old", "--wait=false")(name, args) {
|
|
return false
|
|
}
|
|
deleted = append(deleted, "longhorn-vault-sync-old")
|
|
return true
|
|
},
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-failed", "--wait=false", "--grace-period=0", "--force")(name, args) {
|
|
return false
|
|
}
|
|
deleted = append(deleted, "longhorn-vault-sync-failed")
|
|
return true
|
|
},
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) {
|
|
return false
|
|
}
|
|
if strings.Contains(strings.Join(args, " "), "--force") {
|
|
t.Fatalf("pvc-backed stale pod must not be force deleted")
|
|
}
|
|
deleted = append(deleted, "pvc-backed-failed")
|
|
return true
|
|
},
|
|
},
|
|
})
|
|
|
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
|
}
|
|
if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,pvc-backed-failed" {
|
|
t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted)
|
|
}
|
|
}
|
|
|
|
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
|
|
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
|
|
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
|
|
// k3s-agent start, or uncordon operations against intentionally absent hosts.
|
|
func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
|
|
cfg := config.Config{
|
|
Workers: []string{" titan-08 ", "titan-09", "titan-10", "titan-11"},
|
|
Startup: config.Startup{
|
|
IgnoreUnavailableNodes: []string{"titan-09", "titan-10"},
|
|
},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
|
got, err := orch.effectiveWorkers(context.Background())
|
|
if err != nil {
|
|
t.Fatalf("effectiveWorkers failed: %v", err)
|
|
}
|
|
want := []string{"titan-08", "titan-11"}
|
|
if strings.Join(got, ",") != strings.Join(want, ",") {
|
|
t.Fatalf("effectiveWorkers mismatch got=%v want=%v", got, want)
|
|
}
|
|
}
|
|
|
|
// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers runs one orchestration or CLI step.
|
|
// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
|
|
// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
|
|
// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
|
|
// mount deadlock.
|
|
func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
|
|
cordoned := []string{}
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
SSHManagedNodes: []string{"titan-04", "titan-19"},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-19\ntitan-23\n"},
|
|
{
|
|
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
|
|
out: "__ANANKE_CRYPTSETUP_PRESENT__",
|
|
},
|
|
{
|
|
match: matchContains("ssh", "titan-19", "apt-get install -y --no-install-recommends cryptsetup-bin"),
|
|
err: errors.New("sudo: a password is required"),
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
|
|
return false
|
|
}
|
|
if len(args) > 1 {
|
|
cordoned = append(cordoned, args[len(args)-1])
|
|
}
|
|
return true
|
|
},
|
|
},
|
|
})
|
|
|
|
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04", "titan-19", "titan-20"})
|
|
if err != nil {
|
|
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
|
|
}
|
|
want := []string{"titan-04", "titan-20"}
|
|
if strings.Join(got, ",") != strings.Join(want, ",") {
|
|
t.Fatalf("guarded workers mismatch got=%v want=%v", got, want)
|
|
}
|
|
if strings.Join(cordoned, ",") != "titan-19,titan-23" {
|
|
t.Fatalf("expected unsafe longhorn hosts to be cordoned, got %v", cordoned)
|
|
}
|
|
}
|
|
|
|
// TestLonghornCryptsetupExemptNodesAreNotQuarantined runs one orchestration or CLI step.
|
|
// Signature: TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T).
|
|
// Why: Veles/Oceanus uses titan-23 as a Longhorn host for unencrypted local
|
|
// volumes; startup should uncordon that policy-exempt node without requiring
|
|
// host SSH or weakening encrypted-volume safety on other workers.
|
|
func TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T) {
|
|
cordoned := []string{}
|
|
uncordoned := []string{}
|
|
sshTitan23 := false
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
SSHManagedNodes: []string{"titan-04"},
|
|
Startup: config.Startup{
|
|
LonghornCryptsetupExemptNodes: []string{"titan-23"},
|
|
},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-23\n"},
|
|
{
|
|
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
|
|
out: "__ANANKE_CRYPTSETUP_PRESENT__",
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if name == "ssh" && strings.Contains(strings.Join(args, " "), "titan-23") {
|
|
sshTitan23 = true
|
|
return true
|
|
}
|
|
return false
|
|
},
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
|
|
return false
|
|
}
|
|
if len(args) > 1 {
|
|
cordoned = append(cordoned, args[len(args)-1])
|
|
}
|
|
return true
|
|
},
|
|
},
|
|
{
|
|
match: func(name string, args []string) bool {
|
|
if !matchContains("kubectl", "uncordon")(name, args) {
|
|
return false
|
|
}
|
|
if len(args) > 1 {
|
|
uncordoned = append(uncordoned, args[len(args)-1])
|
|
}
|
|
return true
|
|
},
|
|
},
|
|
})
|
|
|
|
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04"})
|
|
if err != nil {
|
|
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
|
|
}
|
|
if strings.Join(got, ",") != "titan-04" {
|
|
t.Fatalf("guarded workers mismatch got=%v", got)
|
|
}
|
|
if err := orch.uncordonLonghornCryptsetupExemptNodes(context.Background()); err != nil {
|
|
t.Fatalf("uncordonLonghornCryptsetupExemptNodes failed: %v", err)
|
|
}
|
|
if sshTitan23 {
|
|
t.Fatalf("did not expect cryptsetup SSH check for exempt titan-23")
|
|
}
|
|
if len(cordoned) != 0 {
|
|
t.Fatalf("did not expect exempt node to be cordoned, got %v", cordoned)
|
|
}
|
|
if strings.Join(uncordoned, ",") != "titan-23" {
|
|
t.Fatalf("expected exempt titan-23 to be uncordoned, got %v", uncordoned)
|
|
}
|
|
}
|
|
|
|
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
|
|
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
|
|
// Why: bootstrap caches or minimal test clusters can lack live labels; the
|
|
// static startup inventory should still protect configured storage workers.
|
|
func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) {
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
Startup: config.Startup{
|
|
RequiredNodeLabels: map[string]map[string]string{
|
|
"titan-04": {"longhorn-host": "true"},
|
|
"titan-20": {"node-role.kubernetes.io/worker": "true"},
|
|
},
|
|
},
|
|
}, []commandStub{
|
|
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""},
|
|
})
|
|
|
|
got, err := orch.longhornHostNodes(context.Background())
|
|
if err != nil {
|
|
t.Fatalf("longhornHostNodes failed: %v", err)
|
|
}
|
|
if _, ok := got["titan-04"]; !ok || len(got) != 1 {
|
|
t.Fatalf("expected configured longhorn host fallback, got %v", got)
|
|
}
|
|
}
|
|
|
|
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
|
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
|
// Why: covers constructor path in orchestrator core module.
|
|
func TestNewConstructsOrchestrator(t *testing.T) {
|
|
cfg := config.Config{State: config.State{RunHistoryPath: filepath.Join(t.TempDir(), "runs.json")}}
|
|
r := &execx.Runner{}
|
|
s := state.New(cfg.State.RunHistoryPath)
|
|
orch := New(cfg, r, s, log.New(io.Discard, "", 0))
|
|
if orch == nil || orch.runner != r || orch.store != s {
|
|
t.Fatalf("constructor returned unexpected orchestrator: %#v", orch)
|
|
}
|
|
}
|
|
|
|
// TestParseSnapshotPathFromEtcdSnapshotList runs one orchestration or CLI step.
|
|
// Signature: TestParseSnapshotPathFromEtcdSnapshotList(t *testing.T).
|
|
// Why: covers snapshot-path parser branches including header skip and no-match.
|
|
func TestParseSnapshotPathFromEtcdSnapshotList(t *testing.T) {
|
|
out := strings.Join([]string{
|
|
"Name Size Created Location",
|
|
`pre-shutdown 4.2M now "file:///var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"`,
|
|
}, "\n")
|
|
got := parseSnapshotPathFromEtcdSnapshotList(out)
|
|
if got != "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" {
|
|
t.Fatalf("unexpected snapshot path: %q", got)
|
|
}
|
|
if parseSnapshotPathFromEtcdSnapshotList("no snapshots") != "" {
|
|
t.Fatalf("expected no snapshot path")
|
|
}
|
|
}
|
|
|
|
// TestFluxSourceHelpers runs one orchestration or CLI step.
|
|
// Signature: TestFluxSourceHelpers(t *testing.T).
|
|
// Why: covers flux source readiness/guard/branch patch helper flows.
|
|
func TestFluxSourceHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "jsonpath={.status.conditions"), out: "True"},
|
|
{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"},
|
|
{match: matchContains("kubectl", "jsonpath={.spec.ref.branch}"), out: "main"},
|
|
{match: matchContains("kubectl", "patch", "gitrepository"), out: ""},
|
|
})
|
|
|
|
ready, err := orch.fluxSourceReady(context.Background())
|
|
if err != nil || !ready {
|
|
t.Fatalf("expected flux source ready, got ready=%v err=%v", ready, err)
|
|
}
|
|
if err := orch.guardFluxSourceDrift(context.Background(), "main", false); err != nil {
|
|
t.Fatalf("guardFluxSourceDrift failed: %v", err)
|
|
}
|
|
if err := orch.ensureFluxBranch(context.Background(), "main", false); err != nil {
|
|
t.Fatalf("ensureFluxBranch no-op failed: %v", err)
|
|
}
|
|
if got := normalizeGitURL(" SSH://Git@Host/Repo.git/ "); got != "ssh://git@host/repo" {
|
|
t.Fatalf("unexpected normalized url: %q", got)
|
|
}
|
|
}
|
|
|
|
// TestCoordinationHelpers runs one orchestration or CLI step.
|
|
// Signature: TestCoordinationHelpers(t *testing.T).
|
|
// Why: covers intent-age helpers, shell quoting, and peer selection logic.
|
|
func TestCoordinationHelpers(t *testing.T) {
|
|
in := state.Intent{UpdatedAt: time.Now().Add(-10 * time.Second)}
|
|
if intentAge(in) <= 0 {
|
|
t.Fatalf("expected positive age")
|
|
}
|
|
if !intentFresh(state.Intent{}, time.Second) {
|
|
t.Fatalf("zero timestamp should be fresh")
|
|
}
|
|
if shellQuote("a'b") != `'a'"'"'b'` {
|
|
t.Fatalf("unexpected shell quote output")
|
|
}
|
|
|
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
Coordination: config.Coordination{
|
|
PeerHosts: []string{"titan-24", "titan-24", "titan-db"},
|
|
ForwardShutdownHost: "titan-db",
|
|
},
|
|
}, nil)
|
|
peers := orch.coordinationPeers()
|
|
if len(peers) != 2 {
|
|
t.Fatalf("expected deduped peers, got %v", peers)
|
|
}
|
|
}
|
|
|
|
// TestVerifyEtcdSnapshotAndRunSudoK3S runs one orchestration or CLI step.
|
|
// Signature: TestVerifyEtcdSnapshotAndRunSudoK3S(t *testing.T).
|
|
// Why: covers k3s command fallback and snapshot verification happy path.
|
|
func TestVerifyEtcdSnapshotAndRunSudoK3S(t *testing.T) {
|
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
{match: matchContains("ssh", "stat -c %s"), out: "2097152"},
|
|
{match: matchContains("ssh", "k3s etcd-snapshot ls"), out: "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"},
|
|
{match: matchContains("ssh", "sha256sum"), out: strings.Repeat("a", 64)},
|
|
})
|
|
if err := orch.verifyEtcdSnapshot(context.Background(), "titan-0a", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"); err != nil {
|
|
t.Fatalf("verifyEtcdSnapshot failed: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestScalingHelpers runs one orchestration or CLI step.
|
|
// Signature: TestScalingHelpers(t *testing.T).
|
|
// Why: covers workload discovery, snapshot IO, and scale command orchestration.
|
|
func TestScalingHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
ExcludedNamespaces: []string{"kube-system"},
|
|
State: config.State{Dir: t.TempDir()},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{
|
|
match: matchContains("kubectl", "get deployment", "jsonpath"),
|
|
out: strings.Join([]string{
|
|
"default\tgrafana\t1",
|
|
"kube-system\tcoredns\t2",
|
|
"",
|
|
}, "\n"),
|
|
},
|
|
{
|
|
match: matchContains("kubectl", "get statefulset", "jsonpath"),
|
|
out: "vault\tvault\t1\n",
|
|
},
|
|
{match: matchContains("kubectl", "scale", "deployment", "grafana"), out: ""},
|
|
{match: matchContains("kubectl", "scale", "statefulset", "vault"), out: ""},
|
|
})
|
|
|
|
entries, err := orch.listScalableWorkloads(context.Background())
|
|
if err != nil {
|
|
t.Fatalf("listScalableWorkloads failed: %v", err)
|
|
}
|
|
if len(entries) != 2 {
|
|
t.Fatalf("expected 2 scalable entries, got %d (%v)", len(entries), entries)
|
|
}
|
|
if err := orch.writeScaledWorkloadSnapshot(entries); err != nil {
|
|
t.Fatalf("writeScaledWorkloadSnapshot failed: %v", err)
|
|
}
|
|
snapshot, err := orch.readScaledWorkloadSnapshot()
|
|
if err != nil || snapshot == nil || len(snapshot.Entries) != 2 {
|
|
t.Fatalf("readScaledWorkloadSnapshot failed snapshot=%v err=%v", snapshot, err)
|
|
}
|
|
if err := orch.scaleWorkloads(context.Background(), entries, -1, 2); err != nil {
|
|
t.Fatalf("scaleWorkloads failed: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestStorageReadyAndWorkloadHelpers runs one orchestration or CLI step.
|
|
// Signature: TestStorageReadyAndWorkloadHelpers(t *testing.T).
|
|
// Why: covers storage readiness checks and workload helper utilities.
|
|
func TestStorageReadyAndWorkloadHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
Startup: config.Startup{
|
|
StorageMinReadyNodes: 1,
|
|
StorageCriticalPVCs: []string{"vault/data-vault-0"},
|
|
},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "nodes.longhorn.io"), out: "titan-23:True:True\n"},
|
|
{match: matchContains("kubectl", "get pvc data-vault-0"), out: "Bound"},
|
|
})
|
|
ok, reason, err := orch.storageReady(context.Background())
|
|
if err != nil || !ok {
|
|
t.Fatalf("expected storageReady true, got ok=%v reason=%q err=%v", ok, reason, err)
|
|
}
|
|
}
|
|
|
|
// TestIngressAndServiceHelpers runs one orchestration or CLI step.
|
|
// Signature: TestIngressAndServiceHelpers(t *testing.T).
|
|
// Why: covers ingress host discovery helpers and URL parsing helpers.
|
|
func TestIngressAndServiceHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
Startup: config.Startup{
|
|
IngressChecklistIgnoreHosts: []string{"ignore.bstein.dev"},
|
|
},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "get ingress", "-A", "-o", "json"), out: `{"items":[{"metadata":{"namespace":"gitea"},"spec":{"rules":[{"host":"scm.bstein.dev"}]}},{"metadata":{"namespace":"x"},"spec":{"rules":[{"host":"ignore.bstein.dev"}]}}]}`},
|
|
})
|
|
hosts, err := orch.discoverIngressHosts(context.Background())
|
|
if err != nil || len(hosts) != 1 || hosts[0] != "scm.bstein.dev" {
|
|
t.Fatalf("discoverIngressHosts unexpected hosts=%v err=%v", hosts, err)
|
|
}
|
|
if got := hostFromURL("https://metrics.bstein.dev/api/health"); got != "metrics.bstein.dev" {
|
|
t.Fatalf("unexpected hostFromURL value: %q", got)
|
|
}
|
|
if !isLikelyHostname("metrics.bstein.dev") || isLikelyHostname("bad path/value") {
|
|
t.Fatalf("isLikelyHostname classification mismatch")
|
|
}
|
|
}
|
|
|
|
// TestWorkloadConvergenceHelpers runs one orchestration or CLI step.
|
|
// Signature: TestWorkloadConvergenceHelpers(t *testing.T).
|
|
// Why: covers controller readiness helpers and stuck-pod heuristics.
|
|
func TestWorkloadConvergenceHelpers(t *testing.T) {
|
|
replicas := int32(2)
|
|
item := workloadResource{Kind: "deployment"}
|
|
item.Spec.Replicas = &replicas
|
|
item.Status.ReadyReplicas = 1
|
|
desired, ready, ok := desiredReady(item)
|
|
if !ok || desired != 2 || ready != 1 {
|
|
t.Fatalf("desiredReady mismatch desired=%d ready=%d ok=%v", desired, ready, ok)
|
|
}
|
|
var pod podResource
|
|
pod.Metadata.OwnerReferences = []ownerReference{{Kind: "ReplicaSet"}}
|
|
if !podControllerOwned(pod) {
|
|
t.Fatalf("expected podControllerOwned=true")
|
|
}
|
|
pod.Status.ContainerStatuses = []podContainerStatus{{State: podContainerState{Waiting: &podContainerWaitingState{Reason: "CrashLoopBackOff"}}}}
|
|
reason := stuckContainerReason(pod, map[string]struct{}{"CrashLoopBackOff": struct{}{}})
|
|
if reason != "CrashLoopBackOff" {
|
|
t.Fatalf("unexpected stuck reason: %q", reason)
|
|
}
|
|
}
|
|
|
|
// TestDrainAndK3SHelpers runs one orchestration or CLI step.
|
|
// Signature: TestDrainAndK3SHelpers(t *testing.T).
|
|
// Why: covers node drain diagnostics and k3s snapshot selection flow.
|
|
func TestDrainAndK3SHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
SSHManagedNodes: []string{"titan-0a"},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "get pods", "--field-selector", "spec.nodeName=titan-22"), out: "vault vault-0 Running StatefulSet\n"},
|
|
{match: matchContains("ssh", "k3s etcd-snapshot ls"), out: "pre-shutdown /var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"},
|
|
})
|
|
diag := orch.drainNodeDiagnostics(context.Background(), "titan-22")
|
|
if !strings.Contains(diag, "vault/vault-0") {
|
|
t.Fatalf("unexpected diagnostics output: %q", diag)
|
|
}
|
|
snapshot, err := orch.latestEtcdSnapshotPath(context.Background(), "titan-0a")
|
|
if err != nil || snapshot == "" {
|
|
t.Fatalf("latestEtcdSnapshotPath failed snapshot=%q err=%v", snapshot, err)
|
|
}
|
|
}
|
|
|
|
// TestTimesyncAndInventoryHelpers runs one orchestration or CLI step.
|
|
// Signature: TestTimesyncAndInventoryHelpers(t *testing.T).
|
|
// Why: covers time sync helpers, datastore endpoint parsing, and inventory assembly.
|
|
func TestTimesyncAndInventoryHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
ControlPlanes: []string{"titan-0a"},
|
|
Workers: []string{"titan-22"},
|
|
SSHManagedNodes: []string{"titan-0a", "titan-22"},
|
|
SSHNodeHosts: map[string]string{
|
|
"titan-db": "10.0.0.10",
|
|
},
|
|
Coordination: config.Coordination{
|
|
PeerHosts: []string{"titan-24"},
|
|
ForwardShutdownHost: "titan-db",
|
|
},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
|
nodes := orch.inventoryNodesForValidation()
|
|
if len(nodes) < 3 {
|
|
t.Fatalf("expected combined inventory nodes, got %v", nodes)
|
|
}
|
|
if parseDatastoreEndpoint("ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://x") == "" {
|
|
t.Fatalf("expected datastore endpoint parse")
|
|
}
|
|
if !isTimeSynced("YES") || isTimeSynced("no") {
|
|
t.Fatalf("unexpected isTimeSynced behavior")
|
|
}
|
|
|
|
ln, err := net.Listen("tcp", "127.0.0.1:0")
|
|
if err != nil {
|
|
t.Fatalf("listen failed: %v", err)
|
|
}
|
|
defer ln.Close()
|
|
if !orch.tcpReachable(ln.Addr().String(), 500*time.Millisecond) {
|
|
t.Fatalf("expected tcpReachable=true for open listener")
|
|
}
|
|
}
|
|
|
|
// TestShutdownModeValidation runs one orchestration or CLI step.
|
|
// Signature: TestShutdownModeValidation(t *testing.T).
|
|
// Why: covers removed poweroff mode and invalid-mode errors.
|
|
func TestShutdownModeValidation(t *testing.T) {
|
|
if mode, err := normalizeShutdownMode("cluster-only"); err != nil || mode != "cluster-only" {
|
|
t.Fatalf("expected cluster-only mode, got mode=%q err=%v", mode, err)
|
|
}
|
|
if _, err := normalizeShutdownMode("bogus"); err == nil {
|
|
t.Fatalf("expected invalid mode error")
|
|
}
|
|
}
|
|
|
|
// TestWaitForAPIDryRunShortCircuit runs one orchestration or CLI step.
|
|
// Signature: TestWaitForAPIDryRunShortCircuit(t *testing.T).
|
|
// Why: covers dry-run short-circuit branch for api readiness wait.
|
|
func TestWaitForAPIDryRunShortCircuit(t *testing.T) {
|
|
orch := &Orchestrator{runner: &execx.Runner{DryRun: true}}
|
|
if err := orch.waitForAPI(context.Background(), 1, time.Millisecond); err != nil {
|
|
t.Fatalf("expected dry-run waitForAPI to pass: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestGuardFluxSourceDriftMismatch runs one orchestration or CLI step.
|
|
// Signature: TestGuardFluxSourceDriftMismatch(t *testing.T).
|
|
// Why: covers url-drift and branch-drift error branches.
|
|
func TestGuardFluxSourceDriftMismatch(t *testing.T) {
|
|
cfg := config.Config{ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/wrong.git"},
|
|
})
|
|
if err := orch.guardFluxSourceDrift(context.Background(), "main", false); err == nil {
|
|
t.Fatalf("expected guardFluxSourceDrift mismatch error")
|
|
}
|
|
|
|
orch = buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"},
|
|
{match: matchContains("kubectl", "jsonpath={.spec.ref.branch}"), out: "atlasbot"},
|
|
})
|
|
if err := orch.guardFluxSourceDrift(context.Background(), "main", false); err == nil {
|
|
t.Fatalf("expected branch drift error")
|
|
}
|
|
}
|
|
|
|
// TestRunSudoK3SFailsWhenAllCandidatesFail runs one orchestration or CLI step.
|
|
// Signature: TestRunSudoK3SFailsWhenAllCandidatesFail(t *testing.T).
|
|
// Why: covers fallback failure return in runSudoK3S.
|
|
func TestRunSudoK3SFailsWhenAllCandidatesFail(t *testing.T) {
|
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
{match: matchContains("ssh", "k3s"), err: errors.New("no binary")},
|
|
})
|
|
if _, err := orch.runSudoK3S(context.Background(), "titan-0a", "server"); err == nil {
|
|
t.Fatalf("expected runSudoK3S failure when all candidates fail")
|
|
}
|
|
}
|
|
|
|
// TestCriticalEndpointHelpers runs one orchestration or CLI step.
|
|
// Signature: TestCriticalEndpointHelpers(t *testing.T).
|
|
// Why: covers critical endpoint parsing and readiness checks that gate startup completion.
|
|
func TestCriticalEndpointHelpers(t *testing.T) {
|
|
cfg := config.Config{
|
|
Startup: config.Startup{
|
|
CriticalServiceEndpoints: []string{"monitoring/victoria-metrics-single-server"},
|
|
},
|
|
}
|
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
{match: matchContains("kubectl", "get endpoints victoria-metrics-single-server"), out: "10.42.0.10\n10.42.0.11\n"},
|
|
})
|
|
ok, detail, ns, svc, err := orch.criticalServiceEndpointsReady(context.Background())
|
|
if err != nil || !ok {
|
|
t.Fatalf("expected criticalServiceEndpointsReady success, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
|
|
}
|
|
if detail != "services=1" {
|
|
t.Fatalf("unexpected readiness detail: %q", detail)
|
|
}
|
|
gotNS, gotSvc, err := parseCriticalServiceEndpoint("monitoring/victoria-metrics-single-server")
|
|
if err != nil || gotNS != "monitoring" || gotSvc != "victoria-metrics-single-server" {
|
|
t.Fatalf("unexpected parse result ns=%q svc=%q err=%v", gotNS, gotSvc, err)
|
|
}
|
|
if _, _, err := parseCriticalServiceEndpoint("invalid"); err == nil {
|
|
t.Fatalf("expected parseCriticalServiceEndpoint error")
|
|
}
|
|
}
|
|
|
|
// TestCriticalEndpointAutoHealWorkflow runs one orchestration or CLI step.
|
|
// Signature: TestCriticalEndpointAutoHealWorkflow(t *testing.T).
|
|
// Why: covers endpoint-zero recovery where startup heals workload replicas before succeeding.
|
|
func TestCriticalEndpointAutoHealWorkflow(t *testing.T) {
|
|
cfg := config.Config{
|
|
Startup: config.Startup{
|
|
CriticalServiceEndpointWaitSec: 2,
|
|
CriticalServiceEndpointPollSec: 1,
|
|
CriticalServiceEndpoints: []string{"monitoring/victoria-metrics-single-server"},
|
|
},
|
|
State: config.State{
|
|
Dir: t.TempDir(),
|
|
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
|
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
|
},
|
|
}
|
|
orch := &Orchestrator{
|
|
cfg: cfg,
|
|
runner: &execx.Runner{},
|
|
store: state.New(cfg.State.RunHistoryPath),
|
|
log: log.New(io.Discard, "", 0),
|
|
}
|
|
|
|
endpointChecks := 0
|
|
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
joined := name + " " + strings.Join(args, " ")
|
|
if strings.Contains(joined, "get endpoints victoria-metrics-single-server") {
|
|
endpointChecks++
|
|
if endpointChecks == 1 {
|
|
return "", nil
|
|
}
|
|
return "10.42.0.10\n", nil
|
|
}
|
|
if strings.Contains(joined, "scale deployment victoria-metrics-single-server") {
|
|
return "", errors.New(`Error from server (NotFound): deployments.apps "victoria-metrics-single-server" not found`)
|
|
}
|
|
if strings.Contains(joined, "scale statefulset victoria-metrics-single-server") {
|
|
return "", nil
|
|
}
|
|
if strings.Contains(joined, "rollout status statefulset/victoria-metrics-single-server") {
|
|
return "statefulset rolled out", nil
|
|
}
|
|
return "", nil
|
|
}
|
|
orch.runOverride = dispatch
|
|
orch.runSensitiveOverride = dispatch
|
|
|
|
if err := orch.waitForCriticalServiceEndpoints(context.Background()); err != nil {
|
|
t.Fatalf("waitForCriticalServiceEndpoints failed: %v", err)
|
|
}
|
|
if endpointChecks < 2 {
|
|
t.Fatalf("expected repeated endpoint checks, got %d", endpointChecks)
|
|
}
|
|
}
|