298 lines
14 KiB
Go
298 lines
14 KiB
Go
|
|
package orchestrator
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"io"
|
||
|
|
"log"
|
||
|
|
"net/http"
|
||
|
|
"net/http/httptest"
|
||
|
|
"strings"
|
||
|
|
"testing"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||
|
|
)
|
||
|
|
|
||
|
|
// wrapperCoverageDispatcher runs one orchestration or CLI step.
|
||
|
|
// Signature: wrapperCoverageDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error).
|
||
|
|
// Why: centralizes deterministic command output so hook-wrapper tests can execute all exported test hooks without live cluster access.
|
||
|
|
func wrapperCoverageDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error) {
|
||
|
|
base := lifecycleDispatcher(recorder)
|
||
|
|
return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
switch {
|
||
|
|
case name == "curl":
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "200", nil
|
||
|
|
case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "Name Size Created Location\npre-shutdown 4.2M now \"file:///var/lib/rancher/k3s/server/db/snapshots/pre-shutdown\"\n", nil
|
||
|
|
case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot save"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "snapshot saved", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return `{"items":[{"metadata":{"namespace":"monitoring"},"spec":{"rules":[{"host":"metrics.bstein.dev"}]}}]}`, nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get endpoints victoria-metrics-single-server"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "10.42.0.10\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"titan-23"}}}},"status":{"readyReplicas":1}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}},{"kind":"DaemonSet","metadata":{"namespace":"monitoring","name":"node-exporter"},"spec":{"template":{"spec":{}}},"status":{"desiredNumberScheduled":1,"numberReady":1}}]}`, nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0,"template":{"spec":{}}},"status":{"readyReplicas":0}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return `{"items":[{"metadata":{"namespace":"vault","name":"vault-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"vault"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"vault-agent-init","state":{"running":{"startedAt":"2020-01-01T00:00:00Z"}}}]}}]}`, nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "monitoring\tgrafana\t1\nflux-system\tsource-controller\t1\ngitea\tgitea\t1\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "monitoring\tvictoria-metrics-single-server\t1\nvault\tvault\t1\npostgres\tpostgres\t1\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get nodes -o custom-columns="):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "titan-23 <none> <none>\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath="):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "services\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "monitoring/grafana\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get statefulset vault -o jsonpath={.status.readyReplicas}"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "1", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "1", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "rollout status"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "rolled out", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "scale "):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "delete pod"):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "patch "):
|
||
|
|
recorder.record(name, args)
|
||
|
|
return "", nil
|
||
|
|
default:
|
||
|
|
return base(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestHookLifecycleWrappersCoverage runs one orchestration or CLI step.
|
||
|
|
// Signature: TestHookLifecycleWrappersCoverage(t *testing.T).
|
||
|
|
// Why: executes lifecycle hook wrappers so test-hook files and their orchestration backends stay covered from the top-level testing module.
|
||
|
|
func TestHookLifecycleWrappersCoverage(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.PostStartProbes = []string{"https://metrics.bstein.dev/healthz"}
|
||
|
|
cfg.Startup.PostStartProbeWaitSeconds = 1
|
||
|
|
cfg.Startup.PostStartProbePollSeconds = 1
|
||
|
|
cfg.Startup.RequireIngressChecklist = false
|
||
|
|
cfg.Startup.RequireServiceChecklist = false
|
||
|
|
cfg.Startup.RequireCriticalServiceEndpoints = false
|
||
|
|
cfg.Startup.RequireFluxHealth = false
|
||
|
|
cfg.Startup.RequireWorkloadConvergence = false
|
||
|
|
cfg.Startup.ServiceChecklistStabilitySec = 1
|
||
|
|
cfg.Startup.IngressChecklistWaitSeconds = 1
|
||
|
|
cfg.Startup.IngressChecklistPollSeconds = 1
|
||
|
|
cfg.Startup.ServiceChecklistWaitSeconds = 1
|
||
|
|
cfg.Startup.ServiceChecklistPollSeconds = 1
|
||
|
|
cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/victoria-metrics-single-server"}
|
||
|
|
cfg.Startup.CriticalServiceEndpointWaitSec = 1
|
||
|
|
cfg.Startup.CriticalServiceEndpointPollSec = 1
|
||
|
|
|
||
|
|
svc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||
|
|
w.WriteHeader(http.StatusOK)
|
||
|
|
_, _ = w.Write([]byte(`{"database":"ok"}`))
|
||
|
|
}))
|
||
|
|
defer svc.Close()
|
||
|
|
|
||
|
|
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
||
|
|
{
|
||
|
|
Name: "grafana",
|
||
|
|
URL: svc.URL,
|
||
|
|
AcceptedStatuses: []int{200},
|
||
|
|
BodyContains: `"database":"ok"`,
|
||
|
|
TimeoutSeconds: 2,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
recorder := &commandRecorder{}
|
||
|
|
dispatch := wrapperCoverageDispatcher(recorder)
|
||
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
||
|
|
orch.SetCommandOverrides(dispatch, dispatch)
|
||
|
|
|
||
|
|
ctx := context.Background()
|
||
|
|
if err := orch.TestHookWaitForPostStartProbes(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for post-start probes: %v", err)
|
||
|
|
}
|
||
|
|
if _, _ = orch.TestHookPostStartProbesReady(ctx); true {
|
||
|
|
}
|
||
|
|
if _, err := orch.TestHookHTTPProbe(ctx, "https://metrics.bstein.dev/healthz"); err != nil {
|
||
|
|
t.Fatalf("http probe: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookResumeFluxAndReconcile(ctx); err != nil {
|
||
|
|
t.Fatalf("resume flux and reconcile: %v", err)
|
||
|
|
}
|
||
|
|
if _, err := orch.TestHookSSHWithTimeout(ctx, "titan-db", "echo ok", time.Second); err != nil {
|
||
|
|
t.Fatalf("ssh with timeout: %v", err)
|
||
|
|
}
|
||
|
|
if _, err := orch.TestHookRunSensitive(ctx, time.Second, "kubectl", "version", "--request-timeout=5s"); err != nil {
|
||
|
|
t.Fatalf("runSensitive wrapper: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookWaitForStartupConvergence(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for startup convergence: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookWaitForServiceChecklist(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for service checklist: %v", err)
|
||
|
|
}
|
||
|
|
if ok, detail := orch.TestHookServiceChecklistReady(ctx); !ok || detail == "" {
|
||
|
|
t.Fatalf("service checklist ready result unexpected: ok=%v detail=%q", ok, detail)
|
||
|
|
}
|
||
|
|
if ok, detail := orch.TestHookServiceCheckReady(ctx, cfg.Startup.ServiceChecklist[0]); !ok || detail == "" {
|
||
|
|
t.Fatalf("service check ready result unexpected: ok=%v detail=%q", ok, detail)
|
||
|
|
}
|
||
|
|
if status, body, err := orch.TestHookHTTPChecklistProbe(ctx, cfg.Startup.ServiceChecklist[0]); err != nil || status != 200 || body == "" {
|
||
|
|
t.Fatalf("http checklist probe unexpected result status=%d body=%q err=%v", status, body, err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookWaitForStabilityWindow(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for stability window: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookStartupStabilityHealthy(ctx); err != nil {
|
||
|
|
t.Fatalf("startup stability healthy: %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
cancelCtx, cancel := context.WithCancel(ctx)
|
||
|
|
cancel()
|
||
|
|
_ = orch.TestHookWaitForIngressChecklist(cancelCtx)
|
||
|
|
if _, _ = orch.TestHookIngressChecklistReady(ctx); true {
|
||
|
|
}
|
||
|
|
if hosts, err := orch.TestHookDiscoverIngressHosts(ctx); err != nil || len(hosts) == 0 {
|
||
|
|
t.Fatalf("discover ingress hosts failed: hosts=%v err=%v", hosts, err)
|
||
|
|
}
|
||
|
|
if namespaces, err := orch.TestHookDiscoverIngressNamespacesForHost(ctx, "metrics.bstein.dev"); err != nil || len(namespaces) == 0 {
|
||
|
|
t.Fatalf("discover ingress namespaces failed: ns=%v err=%v", namespaces, err)
|
||
|
|
}
|
||
|
|
last := time.Time{}
|
||
|
|
orch.TestHookMaybeAutoHealIngressHostBackends(ctx, &last, "metrics.bstein.dev: status=503")
|
||
|
|
if got := orch.TestHookChecklistFailureHost("metrics.bstein.dev: status=503"); got != "metrics.bstein.dev" {
|
||
|
|
t.Fatalf("unexpected checklistFailureHost parse: %q", got)
|
||
|
|
}
|
||
|
|
if got := cluster.TestHookHostFromURL("https://metrics.bstein.dev/api/health"); got != "metrics.bstein.dev" {
|
||
|
|
t.Fatalf("unexpected host parse: %q", got)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestHookOpsAndWorkloadWrappersCoverage runs one orchestration or CLI step.
|
||
|
|
// Signature: TestHookOpsAndWorkloadWrappersCoverage(t *testing.T).
|
||
|
|
// Why: executes ops/workload wrappers so all exposed hook files remain covered and regression-safe.
|
||
|
|
func TestHookOpsAndWorkloadWrappersCoverage(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
||
|
|
cfg.Startup.WorkloadConvergencePollSeconds = 1
|
||
|
|
cfg.Startup.CriticalServiceEndpointWaitSec = 1
|
||
|
|
cfg.Startup.CriticalServiceEndpointPollSec = 1
|
||
|
|
cfg.Startup.AutoRecycleStuckPods = true
|
||
|
|
cfg.Startup.StuckPodGraceSeconds = 1
|
||
|
|
svc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||
|
|
w.WriteHeader(http.StatusOK)
|
||
|
|
_, _ = w.Write([]byte(`{"status":"ok"}`))
|
||
|
|
}))
|
||
|
|
defer svc.Close()
|
||
|
|
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
||
|
|
{Name: "grafana", URL: svc.URL, AcceptedStatuses: []int{200}, BodyContains: `"status":"ok"`, TimeoutSeconds: 2},
|
||
|
|
}
|
||
|
|
cfg.Startup.ServiceChecklistWaitSeconds = 1
|
||
|
|
cfg.Startup.ServiceChecklistPollSeconds = 1
|
||
|
|
|
||
|
|
recorder := &commandRecorder{}
|
||
|
|
dispatch := wrapperCoverageDispatcher(recorder)
|
||
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
||
|
|
orch.SetCommandOverrides(dispatch, dispatch)
|
||
|
|
|
||
|
|
ctx := context.Background()
|
||
|
|
if err := orch.TestHookDrainWorkers(ctx, []string{"titan-23"}); err != nil {
|
||
|
|
t.Fatalf("drain workers: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookUncordonWorkers(ctx, []string{"titan-23"}); err != nil {
|
||
|
|
t.Fatalf("uncordon workers: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookTakeEtcdSnapshot(ctx, "titan-db"); err != nil {
|
||
|
|
t.Fatalf("take etcd snapshot: %v", err)
|
||
|
|
}
|
||
|
|
if snapshot, err := orch.TestHookLatestEtcdSnapshotPath(ctx, "titan-db"); err != nil || snapshot == "" {
|
||
|
|
t.Fatalf("latest snapshot path failed: snapshot=%q err=%v", snapshot, err)
|
||
|
|
}
|
||
|
|
if workers, err := orch.TestHookEffectiveWorkers(ctx); err != nil || len(workers) == 0 {
|
||
|
|
t.Fatalf("effective workers failed: workers=%v err=%v", workers, err)
|
||
|
|
}
|
||
|
|
if workers, err := orch.TestHookDiscoverWorkers(ctx); err != nil || len(workers) == 0 {
|
||
|
|
t.Fatalf("discover workers failed: workers=%v err=%v", workers, err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookPatchFluxSuspendAll(ctx, true); err != nil {
|
||
|
|
t.Fatalf("patch flux suspend: %v", err)
|
||
|
|
}
|
||
|
|
if entries, err := orch.TestHookListScalableWorkloads(ctx); err != nil {
|
||
|
|
t.Fatalf("list scalable workloads: %v", err)
|
||
|
|
} else {
|
||
|
|
if err := orch.TestHookScaleWorkloads(ctx, entries, 0, 1); err != nil {
|
||
|
|
t.Fatalf("scale workloads forced: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookWriteScaledWorkloadSnapshot(entries); err != nil {
|
||
|
|
t.Fatalf("write scaled workload snapshot: %v", err)
|
||
|
|
}
|
||
|
|
if _, err := orch.TestHookReadScaledWorkloadSnapshot(); err != nil {
|
||
|
|
t.Fatalf("read scaled workload snapshot: %v", err)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if err := orch.TestHookScaleDownApps(ctx); err != nil {
|
||
|
|
t.Fatalf("scale down apps: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookRestoreScaledApps(ctx); err != nil {
|
||
|
|
t.Fatalf("restore scaled apps: %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
if err := orch.TestHookWaitForCriticalServiceEndpoints(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for critical service endpoints: %v", err)
|
||
|
|
}
|
||
|
|
if ok, _, _, _, err := orch.TestHookCriticalServiceEndpointsReady(ctx); err != nil || !ok {
|
||
|
|
t.Fatalf("critical endpoints ready failed: ok=%v err=%v", ok, err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookWaitForWorkloadConvergence(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for workload convergence: %v", err)
|
||
|
|
}
|
||
|
|
if ok, detail, err := orch.TestHookWorkloadConvergenceReady(ctx); err != nil || !ok || detail == "" {
|
||
|
|
t.Fatalf("workload convergence ready failed: ok=%v detail=%q err=%v", ok, detail, err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookRecycleStuckControllerPods(ctx); err != nil {
|
||
|
|
t.Fatalf("recycle stuck controller pods: %v", err)
|
||
|
|
}
|
||
|
|
lastRecycle := time.Time{}
|
||
|
|
orch.TestHookMaybeAutoRecycleStuckPods(ctx, &lastRecycle)
|
||
|
|
lastHeal := time.Time{}
|
||
|
|
orch.TestHookMaybeAutoHealCriticalWorkloadReplicas(ctx, &lastHeal)
|
||
|
|
if _, err := orch.TestHookHealCriticalWorkloadReplicas(ctx); err != nil {
|
||
|
|
t.Fatalf("heal critical workload replicas: %v", err)
|
||
|
|
}
|
||
|
|
if _, err := orch.TestHookStartupFailurePods(ctx); err != nil {
|
||
|
|
t.Fatalf("startup failure pods: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookEnsureCriticalStartupWorkloads(ctx); err != nil {
|
||
|
|
t.Fatalf("ensure critical startup workloads: %v", err)
|
||
|
|
}
|
||
|
|
if _, err := orch.TestHookMissingCriticalStartupWorkloads(ctx); err != nil {
|
||
|
|
t.Fatalf("missing critical startup workloads: %v", err)
|
||
|
|
}
|
||
|
|
if err := orch.TestHookWaitForServiceChecklistAlias(ctx); err != nil {
|
||
|
|
t.Fatalf("wait for service checklist alias: %v", err)
|
||
|
|
}
|
||
|
|
}
|