// File: ananke/testing/orchestrator/hooks_gap_matrix_part4_test.go (450 lines, 19 KiB, Go)
package orchestrator
import (
"context"
"errors"
"io"
"log"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts.
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
	t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Coordination.PeerHosts = []string{"titan-24"}
		conf.SSHManagedNodes = append(conf.SSHManagedNodes, "titan-24")
		conf.SSHNodeHosts["titan-24"] = "titan-24"
		fallback := lifecycleDispatcher(&commandRecorder{})
		stamp := time.Now().UTC().Format(time.RFC3339)
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name != "ssh" || !strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml") {
				return fallback(ctx, timeout, name, args...)
			}
			// Peer reports a shutdown_complete intent stamped "now".
			return "__ANANKE_BOOTSTRAP_IDLE__\nintent=shutdown_complete reason=\"recent\" source=peer updated_at=" + stamp + "\n", nil
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		err := orch.TestHookGuardPeerStartupIntents(context.Background())
		if err == nil || !strings.Contains(err.Error(), "completed shutdown too recently") {
			t.Fatalf("expected shutdown-complete cooldown block, got %v", err)
		}
	})
	t.Run("peer-startup-stale-auto-clears-when-bootstrap-idle", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Coordination.PeerHosts = []string{"titan-24"}
		conf.SSHManagedNodes = append(conf.SSHManagedNodes, "titan-24")
		conf.SSHNodeHosts["titan-24"] = "titan-24"
		conf.Coordination.StartupGuardMaxAgeSec = 30
		staleStamp := time.Now().UTC().Add(-3 * time.Hour).Format(time.RFC3339)
		fallback := lifecycleDispatcher(&commandRecorder{})
		clears := 0
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "ssh" {
				// Order matters: the "--set normal" clear command also matches the
				// broader intent-read substring, so check it first.
				if strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml --set normal") {
					clears++
					return "ok", nil
				}
				if strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml") {
					return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + staleStamp + "\n", nil
				}
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil {
			t.Fatalf("expected stale peer startup intent to auto-clear, got %v", err)
		}
		if clears == 0 {
			t.Fatalf("expected remote stale-intent clear call")
		}
	})
	t.Run("read-peer-parse-error-api-timeout-and-snapshot-size-parse", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Coordination.PeerHosts = []string{"titan-24"}
		conf.SSHManagedNodes = append(conf.SSHManagedNodes, "titan-24")
		conf.SSHNodeHosts["titan-24"] = "titan-24"
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(joined, "ananke intent --config /etc/ananke/ananke.yaml") {
				// Deliberately malformed intent payload to hit the parse-error path.
				return "__ANANKE_BOOTSTRAP_IDLE__\nnot-an-intent-payload\n", nil
			}
			if name == "kubectl" && strings.Contains(joined, "version --request-timeout=5s") {
				return "", errors.New("api unreachable")
			}
			if name == "ssh" && strings.Contains(joined, "stat -c %s") {
				return "not-a-size", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		if _, err := orch.TestHookReadRemotePeerStatus(context.Background(), "titan-24"); err == nil {
			t.Fatalf("expected parse failure for remote peer intent output")
		}
		if err := orch.TestHookWaitForAPI(context.Background(), 1, 0); err == nil {
			t.Fatalf("expected waitForAPI timeout error")
		}
		err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown")
		if err == nil || !strings.Contains(err.Error(), "parse size") {
			t.Fatalf("expected snapshot size parse error, got %v", err)
		}
	})
	t.Run("inventory-reachability-times-out-on-unexpected-output", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Startup.RequireNodeInventoryReach = true
		conf.Startup.NodeInventoryReachWaitSeconds = 1
		conf.Startup.NodeInventoryReachPollSeconds = 1
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name != "ssh" || !strings.Contains(joined, "__ANANKE_NODE_REACHABLE__") {
				return fallback(ctx, timeout, name, args...)
			}
			// Never emit the reachability sentinel so the poll times out.
			return "unexpected", nil
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "unexpected output") {
			t.Fatalf("expected unexpected-output timeout branch, got %v", err)
		}
	})
}
// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests.
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
	t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},
		}
		conf.Startup.IngressChecklistIgnoreHosts = []string{"ignore.bstein.dev"}
		fallback := lifecycleDispatcher(&commandRecorder{})
		scaled := 0
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "kubectl" {
				if strings.Contains(joined, "get ingress -A -o json") {
					return `{"items":[{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"rules":[{"host":"metrics.bstein.dev"},{"host":"ignore.bstein.dev"}]}}]}`, nil
				}
				if strings.Contains(joined, "get deploy,statefulset -A -o json") {
					return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0},"status":{"readyReplicas":0}}]}`, nil
				}
				if strings.Contains(joined, " scale deployment grafana --replicas=1") {
					scaled++
					return "", nil
				}
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		last := time.Time{}
		// Two back-to-back heal attempts; the cooldown gate should admit only one scale.
		orch.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &last, "metrics: unexpected status code=502")
		orch.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &last, "metrics: still bad")
		if scaled != 1 {
			t.Fatalf("expected one scale call due cooldown gate, got %d", scaled)
		}
		if got := orch.TestHookChecklistFailureHost("metrics: request failed"); got != "metrics.bstein.dev" {
			t.Fatalf("expected mapped host metrics.bstein.dev, got %q", got)
		}
		if got := orch.TestHookChecklistFailureHost("not-a-host detail"); got != "" {
			t.Fatalf("expected empty host for unknown failure prefix, got %q", got)
		}
		if !cluster.TestHookChecklistContains("hello \n world", "HELLO WORLD") {
			t.Fatalf("expected compact checklist matcher branch")
		}
	})
	t.Run("service-check-body-notcontains-and-poststart-timeout", func(t *testing.T) {
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("metrics ready marker"))
		}))
		defer srv.Close()
		conf := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, conf, nil, nil)
		ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
			Name:             "forbidden-marker",
			URL:              srv.URL,
			AcceptedStatuses: []int{200},
			BodyNotContains:  "marker",
			TimeoutSeconds:   2,
		})
		if ok || !strings.Contains(detail, "forbidden marker") {
			t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
		}
		conf = lifecycleConfig(t)
		conf.Startup.PostStartProbeWaitSeconds = 1
		conf.Startup.PostStartProbePollSeconds = 1
		conf.Startup.PostStartProbes = []string{"https://metrics.bstein.dev/health"}
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			probe := name == "curl" || (name == "kubectl" && strings.Contains(joined, "curl"))
			if !probe {
				return fallback(ctx, timeout, name, args...)
			}
			return "500", nil
		}
		orch, _ = newHookOrchestrator(t, conf, override, override)
		err := orch.TestHookWaitForPostStartProbes(context.Background())
		if err == nil || !strings.Contains(err.Error(), "post-start probes did not pass") {
			t.Fatalf("expected post-start timeout branch, got %v", err)
		}
	})
	t.Run("hostname-heuristic-negative-cases", func(t *testing.T) {
		for _, sample := range []string{"", "not-a-host", "metrics bstein dev", "metrics.bstein.dev/path"} {
			if cluster.TestHookIsLikelyHostname(sample) {
				t.Fatalf("expected %q to be treated as non-hostname", sample)
			}
		}
	})
}
// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills.
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
	t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Point ReportsDir at a regular file so any mkdir beneath it must fail.
		reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
		if err := os.WriteFile(reportsFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("create reports file: %v", err)
		}
		cfg.State.ReportsDir = reportsFile
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.TestHookWriteRunRecordArtifact(state.RunRecord{
			ID:        "shutdown-record",
			Action:    "shutdown",
			Reason:    "drill",
			StartedAt: time.Now().UTC(),
			EndedAt:   time.Now().UTC(),
		})
		if err == nil {
			t.Fatalf("expected report archive dir error")
		}
		if err := orch.TestHookWriteStartupReportFile(filepath.Join(reportsFile, "startup.json"), "running"); err == nil {
			t.Fatalf("expected startup report path mkdir error")
		}
		// Progress/report helpers are best-effort; exercise them for coverage only.
		cfg2 := lifecycleConfig(t)
		cfg2.State.Dir = filepath.Join(t.TempDir(), "state")
		cfg2.State.ReportsDir = reportsFile
		orch2, _ := newHookOrchestrator(t, cfg2, nil, nil)
		orch2.TestHookPersistStartupProgress("running")
		orch2.TestHookBeginStartupReport("drill")
		orch2.TestHookFinalizeStartupReport(errors.New("boom"))
	})
	t.Run("scaled-workload-snapshot-write-and-read-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("create state file: %v", err)
		}
		// Nesting the state dir under a regular file makes directory creation fail.
		cfg.State.Dir = filepath.Join(stateFile, "nested")
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orch.SetCommandOverrides(dispatch, dispatch)
		entries, err := orch.TestHookListScalableWorkloads(context.Background())
		if err != nil {
			t.Fatalf("list scalable workloads: %v", err)
		}
		// Guard before slicing: entries[:1] would panic with an opaque
		// index-out-of-range if the dispatcher returned no workloads, masking
		// the real failure mode.
		if len(entries) == 0 {
			t.Fatalf("expected at least one scalable workload entry")
		}
		err = orch.TestHookWriteScaledWorkloadSnapshot(entries[:1])
		if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot state-dir failure, got %v", err)
		}
		cfg2 := lifecycleConfig(t)
		orch2, _ := newHookOrchestrator(t, cfg2, nil, nil)
		snapshotPath := filepath.Join(cfg2.State.Dir, "scaled-workloads.json")
		if err := os.WriteFile(snapshotPath, []byte("{bad"), 0o644); err != nil {
			t.Fatalf("write corrupt snapshot: %v", err)
		}
		if _, err := orch2.TestHookReadScaledWorkloadSnapshot(); err == nil {
			t.Fatalf("expected corrupt snapshot decode error")
		}
	})
	t.Run("storage-and-drain-failure-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.StorageReadyWaitSeconds = 1
		cfg.Startup.StorageReadyPollSeconds = 1
		cfg.Startup.StorageMinReadyNodes = 3
		cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
		cfg.Shutdown.DrainParallelism = 1
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
				// One ready node against a three-node requirement keeps storage not-ready.
				return "titan-23:True:True\ntitan-24:False:False\n", nil
			case name == "kubectl" && strings.Contains(command, " drain titan-23 "):
				return "", errors.New("drain blocked")
			case name == "kubectl" && strings.Contains(command, "--field-selector spec.nodeName=titan-23"):
				return "monitoring grafana-0 Running ReplicaSet\n", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if _, _, err := orch.TestHookStorageReady(context.Background()); err != nil {
			t.Fatalf("expected storageReady non-error not-ready branch, got %v", err)
		}
		if err := orch.TestHookWaitForStorageReady(context.Background()); err == nil {
			t.Fatalf("expected storage readiness timeout")
		}
		err := orch.TestHookDrainWorkers(context.Background(), []string{"titan-23"})
		if err == nil || !strings.Contains(err.Error(), "details:") {
			t.Fatalf("expected drain diagnostics branch, got %v", err)
		}
	})
}
// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices.
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
	t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.Startup.TimeSyncMode = "quorum"
		conf.Startup.TimeSyncQuorum = 1
		conf.Startup.TimeSyncWaitSeconds = 1
		conf.Startup.TimeSyncPollSeconds = 1
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			syncQuery := strings.Contains(joined, "timedatectl show -p NTPSynchronized")
			// Local host and titan-db report synced; titan-23 never syncs.
			if name == "sh" && syncQuery {
				return "yes", nil
			}
			if name == "ssh" && syncQuery && strings.Contains(joined, "titan-db") {
				return "yes", nil
			}
			if name == "ssh" && syncQuery && strings.Contains(joined, "titan-23") {
				return "no", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, conf, override, override)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum-mode success, got %v", err)
		}
		confStrict := lifecycleConfig(t)
		confStrict.Startup.TimeSyncMode = "strict"
		confStrict.Startup.TimeSyncWaitSeconds = 1
		confStrict.Startup.TimeSyncPollSeconds = 1
		strictOverride := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if (name == "sh" || name == "ssh") && strings.Contains(joined, "timedatectl show -p NTPSynchronized") {
				return "no", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		orchStrict, _ := newHookOrchestrator(t, confStrict, strictOverride, strictOverride)
		err := orchStrict.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict-mode timesync failure, got %v", err)
		}
	})
	t.Run("validate-inventory-and-access-guard-branches", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.SSHPort = 70000
		conf.SSHManagedNodes = []string{"titan-db"}
		conf.Workers = []string{"titan-23"}
		conf.SSHNodeUsers = map[string]string{}
		conf.SSHNodeHosts["titan-db"] = "bad/host"
		conf.SSHNodeUsers["titan-db"] = "bad user"
		orch, _ := newHookOrchestrator(t, conf, nil, nil)
		if err := orch.TestHookValidateNodeInventory(); err == nil {
			t.Fatalf("expected inventory validation failure")
		}
		fallback := lifecycleDispatcher(&commandRecorder{})
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(joined, "/usr/bin/systemctl --version") && strings.Contains(joined, "titan-23") {
				return "", errors.New("permission denied")
			}
			if name == "kubectl" && strings.Contains(joined, "jsonpath={.spec.ref.branch}") {
				return "feature/sso", nil
			}
			return fallback(ctx, timeout, name, args...)
		}
		conf2 := lifecycleConfig(t)
		orch2, _ := newHookOrchestrator(t, conf2, override, override)
		if err := orch2.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil {
			t.Fatalf("expected reconcileNodeAccess aggregated error")
		}
		if err := orch2.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil {
			t.Fatalf("expected ensureFluxBranch mismatch guard")
		}
	})
	t.Run("lifecycle-restore-and-mode-guard-branches", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.ControlPlanes = []string{"titan-db"}
		conf.SSHManagedNodes = []string{"titan-db"}
		orch, _ := newHookOrchestrator(t, conf, nil, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "not-a-cp"}); err == nil {
			t.Fatalf("expected etcd restore control-plane membership guard")
		}
		confDry := lifecycleConfig(t)
		confDry.ControlPlanes = []string{"titan-db"}
		confDry.SSHManagedNodes = []string{"titan-db"}
		orchDry := newDryRunHookOrchestrator(t, confDry, nil)
		if err := orchDry.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err != nil {
			t.Fatalf("expected dry-run etcd restore success, got %v", err)
		}
		confMode := lifecycleConfig(t)
		orchMode, _ := newHookOrchestrator(t, confMode, nil, nil)
		if err := orchMode.Shutdown(context.Background(), cluster.ShutdownOptions{Mode: "poweroff"}); err == nil {
			t.Fatalf("expected removed shutdown mode guard")
		}
	})
}