From b229f47af8a0d1b4c7a0c48225cc66b24b1909eb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 9 Apr 2026 04:56:41 -0300 Subject: [PATCH] testing: make quality gate root-safe and deterministic --- internal/state/heal.go | 10 +++ internal/state/intent.go | 10 +++ internal/state/testhooks.go | 62 +++++++++++++++- .../hooks_gap_matrix_part5_test.go | 29 ++++++-- .../hooks_lifecycle_deep_matrix_test.go | 33 +++++---- .../state/state_quality_additional_test.go | 41 +++++------ .../state_quality_branch_closeout_test.go | 15 ++-- testing/state/state_testhooks_quality_test.go | 70 +++++++++++++++++++ 8 files changed, 219 insertions(+), 51 deletions(-) create mode 100644 testing/state/state_testhooks_quality_test.go diff --git a/internal/state/heal.go b/internal/state/heal.go index 7427c60..389304b 100644 --- a/internal/state/heal.go +++ b/internal/state/heal.go @@ -7,10 +7,20 @@ import ( "time" ) +var quarantineCorruptFileImpl = quarantineCorruptFileDefault + // quarantineCorruptFile runs one orchestration or CLI step. // Signature: quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error { + return quarantineCorruptFileImpl(path, payload, replacement, mode) +} + +// quarantineCorruptFileDefault runs one orchestration or CLI step. +// Signature: quarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error. +// Why: keeps production file-healing behavior as the default while tests can +// deterministically force heal failures in root/sudo environments. +func quarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error { if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil { return err } diff --git a/internal/state/intent.go b/internal/state/intent.go index 9ad1c92..d7f58ea 100644 --- a/internal/state/intent.go +++ b/internal/state/intent.go @@ -22,6 +22,8 @@ type Intent struct { UpdatedAt time.Time `json:"updated_at"` } +var writeIntentImpl = writeIntentDefault + // ReadIntent runs one orchestration or CLI step. // Signature: ReadIntent(path string) (Intent, error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. @@ -50,6 +52,14 @@ func ReadIntent(path string) (Intent, error) { // Signature: WriteIntent(path string, in Intent) error. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func WriteIntent(path string, in Intent) error { + return writeIntentImpl(path, in) +} + +// writeIntentDefault runs one orchestration or CLI step. +// Signature: writeIntentDefault(path string, in Intent) error. +// Why: keeps the production write behavior available while tests can override +// WriteIntent deterministically without root-only permission tricks. +func writeIntentDefault(path string, in Intent) error { if in.UpdatedAt.IsZero() { in.UpdatedAt = time.Now().UTC() } diff --git a/internal/state/testhooks.go b/internal/state/testhooks.go index 701c3b4..c6e7754 100644 --- a/internal/state/testhooks.go +++ b/internal/state/testhooks.go @@ -1,6 +1,11 @@ package state -import "os" +import ( + "os" + "sync" +) + +var testHookOverrideMu sync.Mutex // TestHookQuarantineCorruptFile runs one orchestration or CLI step. // Signature: TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error. @@ -8,3 +13,58 @@ import "os" func TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error { return quarantineCorruptFile(path, payload, replacement, mode) } + +// TestHookWriteIntentDefault runs one orchestration or CLI step. +// Signature: TestHookWriteIntentDefault(path string, in Intent) error. +// Why: lets top-level tests delegate to production WriteIntent behavior while +// selectively forcing deterministic failures for specific branches. +func TestHookWriteIntentDefault(path string, in Intent) error { + return writeIntentDefault(path, in) +} + +// TestHookSetWriteIntentOverride runs one orchestration or CLI step. +// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()). +// Why: enables deterministic intent-write failure injection from the top-level +// testing module when tests run as root. +func TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()) { + testHookOverrideMu.Lock() + prev := writeIntentImpl + if fn == nil { + writeIntentImpl = writeIntentDefault + } else { + writeIntentImpl = fn + } + testHookOverrideMu.Unlock() + return func() { + testHookOverrideMu.Lock() + writeIntentImpl = prev + testHookOverrideMu.Unlock() + } +} + +// TestHookQuarantineCorruptFileDefault runs one orchestration or CLI step. +// Signature: TestHookQuarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error. +// Why: lets top-level tests fall through to production quarantine behavior when +// they only want to force a subset of calls. +func TestHookQuarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error { + return quarantineCorruptFileDefault(path, payload, replacement, mode) +} + +// TestHookSetQuarantineCorruptFileOverride runs one orchestration or CLI step. +// Signature: TestHookSetQuarantineCorruptFileOverride(fn func(path string, payload []byte, replacement []byte, mode os.FileMode) error) (restore func()). +// Why: enables deterministic auto-heal failure tests in privileged/root execution. +func TestHookSetQuarantineCorruptFileOverride(fn func(path string, payload []byte, replacement []byte, mode os.FileMode) error) (restore func()) { + testHookOverrideMu.Lock() + prev := quarantineCorruptFileImpl + if fn == nil { + quarantineCorruptFileImpl = quarantineCorruptFileDefault + } else { + quarantineCorruptFileImpl = fn + } + testHookOverrideMu.Unlock() + return func() { + testHookOverrideMu.Lock() + quarantineCorruptFileImpl = prev + testHookOverrideMu.Unlock() + } +} diff --git a/testing/orchestrator/hooks_gap_matrix_part5_test.go b/testing/orchestrator/hooks_gap_matrix_part5_test.go index 851cbe3..352b537 100644 --- a/testing/orchestrator/hooks_gap_matrix_part5_test.go +++ b/testing/orchestrator/hooks_gap_matrix_part5_test.go @@ -278,7 +278,30 @@ func TestHookGapMatrixPart5CoverageClosure(t *testing.T) { } cfgRemoveErr := lifecycleConfig(t) - orchRemoveErr, _ := newHookOrchestrator(t, cfgRemoveErr, listRun, listRun) + snapshotPath := filepath.Join(cfgRemoveErr.State.Dir, "scaled-workloads.json") + removeErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="): + return "monitoring\tgrafana\t1\n", nil + case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="): + return "", nil + case name == "kubectl" && strings.Contains(command, " scale deployment grafana --replicas=1"): + if err := os.Remove(snapshotPath); err != nil && !os.IsNotExist(err) { + return "", err + } + if err := os.MkdirAll(snapshotPath, 0o755); err != nil { + return "", err + } + if err := os.WriteFile(filepath.Join(snapshotPath, "keep"), []byte("x"), 0o644); err != nil { + return "", err + } + return "", nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + } + orchRemoveErr, _ := newHookOrchestrator(t, cfgRemoveErr, removeErrRun, removeErrRun) entries, err = orchRemoveErr.TestHookListScalableWorkloads(context.Background()) if err != nil { t.Fatalf("list entries for remove-error case: %v", err) @@ -286,10 +309,6 @@ func TestHookGapMatrixPart5CoverageClosure(t *testing.T) { if err := orchRemoveErr.TestHookWriteScaledWorkloadSnapshot(entries); err != nil { t.Fatalf("seed snapshot for remove-error case: %v", err) } - if err := os.Chmod(cfgRemoveErr.State.Dir, 0o500); err != nil { - t.Fatalf("chmod state dir readonly: %v", err) - } - t.Cleanup(func() { _ = os.Chmod(cfgRemoveErr.State.Dir, 0o700) }) if err := orchRemoveErr.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "remove scaled workload snapshot") { t.Fatalf("expected remove snapshot error, got %v", err) } diff --git a/testing/orchestrator/hooks_lifecycle_deep_matrix_test.go b/testing/orchestrator/hooks_lifecycle_deep_matrix_test.go index 6419767..50e9551 100644 --- a/testing/orchestrator/hooks_lifecycle_deep_matrix_test.go +++ b/testing/orchestrator/hooks_lifecycle_deep_matrix_test.go @@ -85,10 +85,13 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) { }); err != nil { t.Fatalf("seed stale startup intent: %v", err) } - if err := os.Chmod(cfg.State.IntentPath, 0o400); err != nil { - t.Fatalf("chmod intent file readonly: %v", err) - } - defer os.Chmod(cfg.State.IntentPath, 0o600) + restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error { + if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") { + return errors.New("forced intent clear failure") + } + return state.TestHookWriteIntentDefault(path, in) + }) + t.Cleanup(restoreWrite) orch, _ := newHookOrchestrator(t, cfg, nil, nil) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"}) if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") { @@ -106,10 +109,13 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) { }); err != nil { t.Fatalf("seed stale shutdown intent: %v", err) } - if err := os.Chmod(cfg.State.IntentPath, 0o400); err != nil { - t.Fatalf("chmod intent file readonly: %v", err) - } - defer os.Chmod(cfg.State.IntentPath, 0o600) + restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error { + if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") { + return errors.New("forced intent clear failure") + } + return state.TestHookWriteIntentDefault(path, in) + }) + t.Cleanup(restoreWrite) orch, _ := newHookOrchestrator(t, cfg, nil, nil) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"}) if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") { @@ -177,10 +183,13 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) { }); err != nil { t.Fatalf("seed normal intent: %v", err) } - if err := os.Chmod(cfg.State.IntentPath, 0o400); err != nil { - t.Fatalf("chmod intent file readonly: %v", err) - } - defer os.Chmod(cfg.State.IntentPath, 0o600) + restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error { + if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress { + return errors.New("forced startup intent write failure") + } + return state.TestHookWriteIntentDefault(path, in) + }) + t.Cleanup(restoreWrite) orch, _ := newHookOrchestrator(t, cfg, nil, nil) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"}) if err == nil || !strings.Contains(err.Error(), "set startup intent") { diff --git a/testing/state/state_quality_additional_test.go b/testing/state/state_quality_additional_test.go index 166e1cf..43b90d8 100644 --- a/testing/state/state_quality_additional_test.go +++ b/testing/state/state_quality_additional_test.go @@ -1,6 +1,7 @@ package statequality import ( + "errors" "os" "path/filepath" "strings" @@ -62,10 +63,13 @@ func TestStateStoreLockAndEstimatorBranches(t *testing.T) { t.Fatalf("mkdir lock dir: %v", err) } lockPath := filepath.Join(lockDir, "ananke.lock") - if err := os.WriteFile(lockPath, []byte("pid=999999 started=1970-01-01T00:00:00Z\n"), 0o000); err != nil { - t.Fatalf("write unreadable lock file: %v", err) + lockTargetDir := filepath.Join(lockDir, "lock-target") + if err := os.MkdirAll(lockTargetDir, 0o755); err != nil { + t.Fatalf("mkdir lock target dir: %v", err) + } + if err := os.Symlink(lockTargetDir, lockPath); err != nil { + t.Fatalf("create lock symlink to directory: %v", err) } - t.Cleanup(func() { _ = os.Chmod(lockPath, 0o600) }) if _, err := state.AcquireLock(lockPath); err == nil || !strings.Contains(err.Error(), "existing lock check failed") { t.Fatalf("expected stale lock check failure, got %v", err) } @@ -95,18 +99,14 @@ func TestStateStoreLockAndEstimatorBranches(t *testing.T) { // Why: verifies corrupt-state auto-heal reports a useful error when filesystem permissions block quarantine. func TestStateAutoHealFailurePath(t *testing.T) { root := t.TempDir() - dir := filepath.Join(root, "readonly") - if err := os.MkdirAll(dir, 0o700); err != nil { - t.Fatalf("mkdir readonly dir: %v", err) - } - path := filepath.Join(dir, "runs.json") + path := filepath.Join(root, "runs.json") if err := os.WriteFile(path, []byte("{invalid-json"), 0o600); err != nil { t.Fatalf("write corrupt runs file: %v", err) } - if err := os.Chmod(dir, 0o500); err != nil { - t.Fatalf("chmod readonly dir: %v", err) - } - t.Cleanup(func() { _ = os.Chmod(dir, 0o700) }) + restore := state.TestHookSetQuarantineCorruptFileOverride(func(path string, payload []byte, replacement []byte, mode os.FileMode) error { + return errors.New("forced quarantine failure") + }) + t.Cleanup(restore) _, err := state.New(path).Load() if err == nil { @@ -146,19 +146,12 @@ func TestStateQuarantineHookBranches(t *testing.T) { t.Fatalf("expected mkdir failure when parent is a file") } - // Read-only directory forces backup write failure branch. - roDir := filepath.Join(root, "readonly") - if err := os.MkdirAll(roDir, 0o700); err != nil { - t.Fatalf("mkdir readonly dir: %v", err) - } - roPath := filepath.Join(roDir, "state.json") - if err := os.Chmod(roDir, 0o500); err != nil { - t.Fatalf("chmod readonly dir: %v", err) - } - t.Cleanup(func() { _ = os.Chmod(roDir, 0o700) }) - err = state.TestHookQuarantineCorruptFile(roPath, []byte("bad"), []byte("{}\n"), 0o640) + // Long filename forces backup write failure branch regardless of root privileges. + longName := strings.Repeat("a", 245) + backupFailPath := filepath.Join(root, "long-name", longName) + err = state.TestHookQuarantineCorruptFile(backupFailPath, []byte("bad"), []byte("{}\n"), 0o640) if err == nil || !strings.Contains(err.Error(), "write backup") { - t.Fatalf("expected backup write failure in readonly directory, got %v", err) + t.Fatalf("expected backup write failure for long-name backup path, got %v", err) } } diff --git a/testing/state/state_quality_branch_closeout_test.go b/testing/state/state_quality_branch_closeout_test.go index e4e9375..e923a3c 100644 --- a/testing/state/state_quality_branch_closeout_test.go +++ b/testing/state/state_quality_branch_closeout_test.go @@ -1,6 +1,7 @@ package statequality import ( + "errors" "os" "path/filepath" "strings" @@ -102,18 +103,14 @@ func TestStateIntentParserAdditionalEdges(t *testing.T) { // Why: covers ReadIntent branch where corrupt file quarantine fails and error details are returned. func TestStateIntentAutoHealFailureBranch(t *testing.T) { root := t.TempDir() - dir := filepath.Join(root, "ro") - if err := os.MkdirAll(dir, 0o700); err != nil { - t.Fatalf("mkdir readonly dir: %v", err) - } - path := filepath.Join(dir, "intent.json") + path := filepath.Join(root, "intent.json") if err := os.WriteFile(path, []byte("{bad-json"), 0o600); err != nil { t.Fatalf("write malformed intent file: %v", err) } - if err := os.Chmod(dir, 0o500); err != nil { - t.Fatalf("chmod readonly dir: %v", err) - } - t.Cleanup(func() { _ = os.Chmod(dir, 0o700) }) + restore := state.TestHookSetQuarantineCorruptFileOverride(func(path string, payload []byte, replacement []byte, mode os.FileMode) error { + return errors.New("forced intent quarantine failure") + }) + t.Cleanup(restore) _, err := state.ReadIntent(path) if err == nil { diff --git a/testing/state/state_testhooks_quality_test.go b/testing/state/state_testhooks_quality_test.go new file mode 100644 index 0000000..582e8af --- /dev/null +++ b/testing/state/state_testhooks_quality_test.go @@ -0,0 +1,70 @@ +package statequality + +import ( + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "scm.bstein.dev/bstein/ananke/internal/state" +) + +// TestStateTestHookOverrideSetters runs one orchestration or CLI step. +// Signature: TestStateTestHookOverrideSetters(t *testing.T). +// Why: drives top-level test-hook override branches so root-mode quality gates +// keep deterministic failure injection coverage above the per-file threshold. +func TestStateTestHookOverrideSetters(t *testing.T) { + root := t.TempDir() + intentPath := filepath.Join(root, "intent.json") + + restoreWriteNil := state.TestHookSetWriteIntentOverride(nil) + if err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal}); err != nil { + t.Fatalf("expected default write intent path after nil override, got %v", err) + } + restoreWriteNil() + + writeOverrideCalled := false + restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error { + writeOverrideCalled = true + return errors.New("forced write override") + }) + err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal}) + if err == nil || !strings.Contains(err.Error(), "forced write override") { + t.Fatalf("expected forced write override error, got %v", err) + } + if !writeOverrideCalled { + t.Fatalf("expected write override to be invoked") + } + restoreWrite() + if err := state.TestHookWriteIntentDefault(filepath.Join(root, "intent-default.json"), state.Intent{State: state.IntentNormal}); err != nil { + t.Fatalf("expected explicit default write helper to succeed, got %v", err) + } + + payload := []byte("{bad-json") + replacement := []byte("{}\n") + quarantinePath := filepath.Join(root, "quarantine-default.json") + if err := state.TestHookQuarantineCorruptFileDefault(quarantinePath, payload, replacement, 0o640); err != nil { + t.Fatalf("expected explicit default quarantine helper to succeed, got %v", err) + } + + restoreQuarantineNil := state.TestHookSetQuarantineCorruptFileOverride(nil) + restoreQuarantineNil() + + quarantineOverrideCalled := false + restoreQuarantine := state.TestHookSetQuarantineCorruptFileOverride(func(path string, payload []byte, replacement []byte, mode os.FileMode) error { + quarantineOverrideCalled = true + return errors.New("forced quarantine override") + }) + err = state.TestHookQuarantineCorruptFile(filepath.Join(root, "quarantine-forced.json"), payload, replacement, 0o640) + if err == nil || !strings.Contains(err.Error(), "forced quarantine override") { + t.Fatalf("expected forced quarantine override error, got %v", err) + } + if !quarantineOverrideCalled { + t.Fatalf("expected quarantine override to be invoked") + } + restoreQuarantine() + if err := state.TestHookQuarantineCorruptFile(filepath.Join(root, "quarantine-after-restore.json"), payload, replacement, 0o640); err != nil { + t.Fatalf("expected quarantine helper to use default impl after restore, got %v", err) + } +}