// ananke/testing/orchestrator/hooks_gap_matrix_part8_test.go

package orchestrator

import (
	"context"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"log"
	"net"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
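
// Every subtest below uses the same test-double pattern: a fake command
// runner with the signature func(ctx, timeout, name, args ...string)
// (string, error) is handed to newHookOrchestrator, intercepts only the
// ssh/kubectl/sh invocations the target branch needs, and defers everything
// else to lifecycleDispatcher(&commandRecorder{}). lifecycleConfig,
// newHookOrchestrator, lifecycleDispatcher, and commandRecorder are assumed
// to be shared helpers defined elsewhere in this package's test files.
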
// TestHookGapMatrixPart8CoverageClosure exercises orchestration and CLI steps across focused subtests.
// Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in access, vault, lifecycle,
// ingress/service stability, and timesync/inventory orchestration paths.
func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
	t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeSSHAuth = true
		cfg.Startup.NodeSSHAuthWaitSeconds = 1
		cfg.Startup.NodeSSHAuthPollSeconds = 1
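		// The stub runner below simulates the failure modes under test: the
		// ssh auth probe is rejected with "Permission denied", sudo demands a
		// password, the flux GitRepository reports a stale "legacy" branch,
		// and patching it is denied.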
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__"):
				return "", errors.New("Permission denied (publickey)")
			case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version"):
				return "", errors.New("sudo: a password is required")
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}"):
				return cfg.ExpectedFluxSource, nil
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
				return "legacy", nil
			case name == "kubectl" && strings.Contains(command, "patch gitrepository flux-system"):
				return "", errors.New("patch denied")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "auth denied") {
			t.Fatalf("expected ssh auth denied branch, got %v", err)
		}
		if err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "access validation had") {
			t.Fatalf("expected reconcile access error aggregation, got %v", err)
		}
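		// ensureFluxBranch is exercised twice: without force it should refuse
		// to start while the source branch diverges, and with force it should
		// surface the stubbed patch failure instead of continuing silently.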
		if err := orch.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil || !strings.Contains(err.Error(), "startup blocked") {
			t.Fatalf("expected ensureFluxBranch block branch, got %v", err)
		}
		if err := orch.TestHookEnsureFluxBranch(context.Background(), "main", true); err == nil || !strings.Contains(err.Error(), "set flux source branch") {
			t.Fatalf("expected ensureFluxBranch patch failure branch, got %v", err)
		}
		cfgCache := lifecycleConfig(t)
		cfgCache.IACRepoPath = t.TempDir()
		cfgCache.LocalBootstrapPaths = []string{"missing-path"}
		orchCache, _ := newHookOrchestrator(t, cfgCache, nil, nil)
		if err := orchCache.TestHookRefreshBootstrapCache(context.Background()); err == nil || !strings.Contains(err.Error(), "no bootstrap cache manifests rendered") {
			t.Fatalf("expected refresh cache zero-rendered branch, got %v", err)
		}
	})
t.Run("coordination-and-snapshot-verification-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Coordination.PeerHosts = []string{"titan-24"}
cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
cfg.SSHNodeHosts["titan-24"] = "titan-24"
now := time.Now().UTC().Format(time.RFC3339)
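		// The peer intent reply uses the __ANANKE_BOOTSTRAP_ACTIVE__ sentinel
		// with intent=startup_in_progress so the peer guard must block, while
		// stat -c %s returns the non-numeric "abc" to force the snapshot
		// size-parse error branch.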
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=startup_in_progress reason=\"rolling\" source=peer updated_at=" + now + "\n", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "abc", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err == nil || !strings.Contains(err.Error(), "startup_in_progress") {
			t.Fatalf("expected peer startup-in-progress block branch, got %v", err)
		}
		if err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"); err == nil || !strings.Contains(err.Error(), "parse size") {
			t.Fatalf("expected verify snapshot parse-size branch, got %v", err)
		}
		// A numeric but deliberately tiny size drives the too-small branch instead.
		runSmall := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "128", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchSmall, _ := newHookOrchestrator(t, cfg, runSmall, runSmall)
		if err := orchSmall.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"); err == nil || !strings.Contains(err.Error(), "too small") {
			t.Fatalf("expected verify snapshot too-small branch, got %v", err)
		}
	})
t.Run("critical-endpoints-and-vault-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.CriticalServiceEndpoints = []string{"bad-entry"}
orchInvalid, _ := newHookOrchestrator(t, cfg, nil, nil)
if _, _, _, _, err := orchInvalid.TestHookCriticalServiceEndpointsReady(context.Background()); err == nil {
t.Fatalf("expected invalid critical endpoint entry branch")
}
cfg = lifecycleConfig(t)
keyB64 := base64.StdEncoding.EncodeToString([]byte("vault-unseal-key"))
sealedChecks := 0
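		// Simulated unseal sequence: the first two `vault status` probes
		// report sealed and the third reports unsealed; the unseal key is
		// served as the base64 payload of the vault-init secret, and
		// readyReplicas flips from 0 to 1 once the first status probe lands.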
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
				return "Running", nil
			case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
				sealedChecks++
				if sealedChecks <= 2 {
					return `{"sealed":true}`, nil
				}
				return `{"sealed":false}`, nil
			case name == "kubectl" && strings.Contains(command, "get secret vault-init"):
				return keyB64, nil
			case name == "kubectl" && strings.Contains(command, "vault operator unseal"):
				return "unsealed", nil
			case name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}"):
				if sealedChecks == 0 {
					return "0", nil
				}
				return "1", nil
			case name == "kubectl" && strings.Contains(command, "get pods -o custom-columns"):
				return "vault-0 Unknown StatefulSet vault\n", nil
			case name == "kubectl" && strings.Contains(command, "delete pod vault-0"):
				return "", errors.New("delete failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookEnsureVaultUnsealed(context.Background()); err != nil {
			t.Fatalf("expected ensureVaultUnsealed success branch, got %v", err)
		}
		if err := orch.TestHookWaitVaultReady(context.Background(), "vault", "statefulset", "vault"); err != nil {
			t.Fatalf("expected waitVaultReady retry/success branch, got %v", err)
		}
		if err := orch.TestHookCleanupStaleCriticalWorkloadPods(context.Background(), "vault", "statefulset", "vault"); err == nil || !strings.Contains(err.Error(), "delete stale pod") {
			t.Fatalf("expected stale-pod delete failure branch, got %v", err)
		}
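		// A readyReplicas value that is not an integer should surface as a
		// parse error from workloadReady rather than being treated as zero.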
		runBadReady := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}") {
				return "not-a-number", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchBadReady, _ := newHookOrchestrator(t, lifecycleConfig(t), runBadReady, runBadReady)
		if _, err := orchBadReady.TestHookWorkloadReady(context.Background(), "monitoring", "deployment", "grafana"); err == nil {
			t.Fatalf("expected workloadReady parse branch")
		}
		cfgEmptyKey := lifecycleConfig(t)
		cfgEmptyKey.Startup.VaultUnsealKeyFile = ""
		orchEmptyKey, _ := newHookOrchestrator(t, cfgEmptyKey, nil, nil)
		if err := orchEmptyKey.TestHookWriteVaultUnsealKeyFile("abc"); err == nil || !strings.Contains(err.Error(), "path is empty") {
			t.Fatalf("expected writeVaultUnsealKeyFile empty-path branch, got %v", err)
		}
	})
t.Run("lifecycle-etcd-restore-and-shutdown-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.ControlPlanes = []string{}
orchNoCP := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
if err := orchNoCP.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{}); err == nil || !strings.Contains(err.Error(), "no control planes") {
t.Fatalf("expected etcd restore no-control-planes branch, got %v", err)
}
cfgManaged := lifecycleConfig(t)
cfgManaged.SSHManagedNodes = []string{"titan-db"}
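		// `systemctl cat k3s` advertising an external postgres datastore
		// means there is no embedded etcd to restore, so EtcdRestore should
		// return ErrEtcdRestoreNotApplicable for this control plane.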
		runExternal := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://db:5432/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchExternal, _ := newHookOrchestrator(t, cfgManaged, runExternal, runExternal)
		err := orchExternal.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"})
		if err == nil || !errors.Is(err, cluster.ErrEtcdRestoreNotApplicable) {
			t.Fatalf("expected etcd restore not-applicable branch, got %v", err)
		}
		cfgShutdown := lifecycleConfig(t)
		// An ssh_port of 0 is invalid, so Shutdown must fail its inventory preflight.
		cfgShutdown.SSHPort = 0
		orchShutdown := cluster.New(cfgShutdown, &execx.Runner{DryRun: false}, state.New(cfgShutdown.State.RunHistoryPath), log.New(io.Discard, "", 0))
		if err := orchShutdown.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "drill", Mode: "cluster-only"}); err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
			t.Fatalf("expected shutdown inventory-preflight branch, got %v", err)
		}
	})
t.Run("ingress-service-stability-and-timesync-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireIngressChecklist = true
cfg.Startup.RequireServiceChecklist = true
cfg.Startup.RequireCriticalServiceEndpoints = true
cfg.Startup.RequireFluxHealth = true
cfg.Startup.RequireWorkloadConvergence = true
cfg.Startup.ServiceChecklistStabilitySec = 1
cfg.Startup.ServiceChecklistPollSeconds = 1
cfg.Startup.TimeSyncWaitSeconds = 1
cfg.Startup.TimeSyncPollSeconds = 1
cfg.Startup.TimeSyncMode = "quorum"
cfg.Startup.TimeSyncQuorum = 1
cfg.Startup.ServiceChecklist = nil
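		// The stubs below model a converged cluster: no ingresses, a Ready
		// Flux kustomization, one healthy Deployment, and a populated
		// endpoint. Only titan-db reports NTPSynchronized=yes, which is
		// exactly enough to satisfy TimeSyncQuorum = 1 across two nodes.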
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
				if strings.Contains(command, "titan-db") {
					return "yes", nil
				}
				return "no", nil
			case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
				return `{"items":[]}`, nil
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get endpoints victoria-metrics-single-server"):
				return "10.42.0.10\n", nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForStartupConvergence(context.Background()); err != nil {
			t.Fatalf("expected startup convergence pass branch, got %v", err)
		}
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum timesync pass branch, got %v", err)
		}
		runDecodeIngress := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get ingress -A -o json") {
				return "{bad-json", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchDecodeIngress, _ := newHookOrchestrator(t, lifecycleConfig(t), runDecodeIngress, runDecodeIngress)
		if _, err := orchDecodeIngress.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil {
			t.Fatalf("expected ingress namespace decode branch")
		}
		cfgInventory := lifecycleConfig(t)
		// 70000 is outside the valid TCP port range, tripping inventory validation.
		cfgInventory.SSHPort = 70000
		orchInventory, _ := newHookOrchestrator(t, cfgInventory, nil, nil)
		if err := orchInventory.TestHookValidateNodeInventory(); err == nil || !strings.Contains(err.Error(), "ssh_port") {
			t.Fatalf("expected inventory invalid-port branch, got %v", err)
		}
	})
t.Run("report-and-scaling-edge-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
rec := state.RunRecord{ID: "shutdown-1", Action: "shutdown", StartedAt: time.Now().UTC()}
if err := orch.TestHookWriteRunRecordArtifact(rec); err != nil {
t.Fatalf("expected writeRunRecordArtifact shutdown branch, got %v", err)
}
if _, err := os.Stat(orch.TestHookLastShutdownReportPath()); err != nil {
t.Fatalf("expected last shutdown report file: %v", err)
}
cfgScaleErr := lifecycleConfig(t)
stateFile := filepath.Join(t.TempDir(), "state-file")
if err := os.WriteFile(stateFile, []byte("x"), 0o600); err != nil {
t.Fatalf("write state file: %v", err)
}
cfgScaleErr.State.Dir = stateFile
orchScaleErr := cluster.New(
cfgScaleErr,
&execx.Runner{DryRun: false},
state.New(cfgScaleErr.State.RunHistoryPath),
log.New(io.Discard, "", 0),
)
if err := orchScaleErr.TestHookWriteScaledWorkloadSnapshot(nil); err == nil || !strings.Contains(err.Error(), "ensure state dir") {
t.Fatalf("expected writeScaledWorkloadSnapshot mkdir branch, got %v", err)
}
cfgRestore := lifecycleConfig(t)
stateDir := cfgRestore.State.Dir
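		// With kubectl scale stubbed to succeed, restoring from a well-formed
		// snapshot should scale grafana back up and then remove the snapshot
		// file.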
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, " scale ") {
				return "", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchRestore, _ := newHookOrchestrator(t, cfgRestore, run, run)
		snapshotPath := filepath.Join(stateDir, "scaled-workloads.json")
		snapshot := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
		if err := os.WriteFile(snapshotPath, []byte(snapshot), 0o644); err != nil {
			t.Fatalf("write snapshot: %v", err)
		}
		if err := orchRestore.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected restoreScaledApps success branch, got %v", err)
		}
		if _, err := os.Stat(snapshotPath); !os.IsNotExist(err) {
			t.Fatalf("expected snapshot removal after restore, stat err=%v", err)
		}
	})
}
// TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch drives Startup through its auto-restore retry.
// Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
// Why: covers Startup's API-failure -> auto-restore retry path that is otherwise
// hard to exercise in deterministic top-level tests.
func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
	cfg.Startup.EtcdRestoreControlPlane = "titan-db"
	cfg.Startup.RequireNodeInventoryReach = false
	cfg.Startup.RequireNodeSSHAuth = false
	cfg.Startup.RequireStorageReady = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
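	// A real localhost TCP listener stands in for the external datastore so
	// that any datastore reachability preflight in the restore path has a
	// live address to dial during the test.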
	datastoreListener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen datastore preflight stub: %v", err)
	}
	defer datastoreListener.Close()
	apiCalls := 0
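	// The first kubectl version probe fails to trigger the auto-restore
	// path; every later probe succeeds so Startup can converge, and apiCalls
	// records that the API was re-checked after the restore attempt.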
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
			apiCalls++
			if apiCalls == 1 {
				return "", errors.New("api down")
			}
			return "v1.31.0", nil
		case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
			return fmt.Sprintf(
				"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:%d/k3s",
				datastoreListener.Addr().(*net.TCPAddr).Port,
			), nil
		case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
			return "True", nil
		case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}"):
			return cfg.ExpectedFluxSource, nil
		case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
			return "main", nil
		case name == "kubectl" && strings.Contains(command, "annotate kustomizations.kustomize.toolkit.fluxcd.io"):
			return "", nil
		case name == "kubectl" && strings.Contains(command, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io"):
			return "", nil
		default:
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)
	err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
	if err != nil {
		t.Fatalf("expected startup auto-restore path success, got %v", err)
	}
	if apiCalls < 2 {
		t.Fatalf("expected startup to retry API after auto-restore path, calls=%d", apiCalls)
	}
	cfgBadMode := lifecycleConfig(t)
	orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
	err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
	if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
		t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
	}
	cfgReport := lifecycleConfig(t)
	orchReport, _ := newHookOrchestrator(t, cfgReport, nil, nil)
	if err := orchReport.TestHookWriteStartupReportFile(filepath.Join("", string(os.PathSeparator)), "running"); err == nil {
		t.Fatalf("expected startup report write failure branch")
	}
	if ok := orchReport.TestHookFinalizeStartupReportSnapshot(fmt.Errorf("boom")); !ok {
		t.Fatalf("expected finalize startup report snapshot to succeed")
	}
}
}