testing: make quality gate root-safe and deterministic

This commit is contained in:
Brad Stein 2026-04-09 04:56:41 -03:00
parent a493670dbd
commit b229f47af8
8 changed files with 219 additions and 51 deletions

View File

@ -7,10 +7,20 @@ import (
"time" "time"
) )
var quarantineCorruptFileImpl = quarantineCorruptFileDefault
// quarantineCorruptFile runs one orchestration or CLI step. // quarantineCorruptFile runs one orchestration or CLI step.
// Signature: quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error. // Signature: quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error { func quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error {
return quarantineCorruptFileImpl(path, payload, replacement, mode)
}
// quarantineCorruptFileDefault runs one orchestration or CLI step.
// Signature: quarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error.
// Why: keeps production file-healing behavior as the default while tests can
// deterministically force heal failures in root/sudo environments.
func quarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil { if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
return err return err
} }

View File

@ -22,6 +22,8 @@ type Intent struct {
UpdatedAt time.Time `json:"updated_at"` UpdatedAt time.Time `json:"updated_at"`
} }
var writeIntentImpl = writeIntentDefault
// ReadIntent runs one orchestration or CLI step. // ReadIntent runs one orchestration or CLI step.
// Signature: ReadIntent(path string) (Intent, error). // Signature: ReadIntent(path string) (Intent, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -50,6 +52,14 @@ func ReadIntent(path string) (Intent, error) {
// Signature: WriteIntent(path string, in Intent) error. // Signature: WriteIntent(path string, in Intent) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func WriteIntent(path string, in Intent) error { func WriteIntent(path string, in Intent) error {
return writeIntentImpl(path, in)
}
// writeIntentDefault runs one orchestration or CLI step.
// Signature: writeIntentDefault(path string, in Intent) error.
// Why: keeps the production write behavior available while tests can override
// WriteIntent deterministically without root-only permission tricks.
func writeIntentDefault(path string, in Intent) error {
if in.UpdatedAt.IsZero() { if in.UpdatedAt.IsZero() {
in.UpdatedAt = time.Now().UTC() in.UpdatedAt = time.Now().UTC()
} }

View File

@ -1,6 +1,11 @@
package state package state
import "os" import (
"os"
"sync"
)
var testHookOverrideMu sync.Mutex
// TestHookQuarantineCorruptFile runs one orchestration or CLI step. // TestHookQuarantineCorruptFile runs one orchestration or CLI step.
// Signature: TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error. // Signature: TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error.
@ -8,3 +13,58 @@ import "os"
func TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error { func TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error {
return quarantineCorruptFile(path, payload, replacement, mode) return quarantineCorruptFile(path, payload, replacement, mode)
} }
// TestHookWriteIntentDefault runs one orchestration or CLI step.
// Signature: TestHookWriteIntentDefault(path string, in Intent) error.
// Why: lets top-level tests delegate to production WriteIntent behavior while
// selectively forcing deterministic failures for specific branches.
func TestHookWriteIntentDefault(path string, in Intent) error {
return writeIntentDefault(path, in)
}
// TestHookSetWriteIntentOverride runs one orchestration or CLI step.
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
// Why: enables deterministic intent-write failure injection from the top-level
// testing module when tests run as root.
func TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()) {
testHookOverrideMu.Lock()
prev := writeIntentImpl
if fn == nil {
writeIntentImpl = writeIntentDefault
} else {
writeIntentImpl = fn
}
testHookOverrideMu.Unlock()
return func() {
testHookOverrideMu.Lock()
writeIntentImpl = prev
testHookOverrideMu.Unlock()
}
}
// TestHookQuarantineCorruptFileDefault runs one orchestration or CLI step.
// Signature: TestHookQuarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error.
// Why: lets top-level tests fall through to production quarantine behavior when
// they only want to force a subset of calls.
func TestHookQuarantineCorruptFileDefault(path string, payload []byte, replacement []byte, mode os.FileMode) error {
return quarantineCorruptFileDefault(path, payload, replacement, mode)
}
// TestHookSetQuarantineCorruptFileOverride runs one orchestration or CLI step.
// Signature: TestHookSetQuarantineCorruptFileOverride(fn func(path string, payload []byte, replacement []byte, mode os.FileMode) error) (restore func()).
// Why: enables deterministic auto-heal failure tests in privileged/root execution.
func TestHookSetQuarantineCorruptFileOverride(fn func(path string, payload []byte, replacement []byte, mode os.FileMode) error) (restore func()) {
testHookOverrideMu.Lock()
prev := quarantineCorruptFileImpl
if fn == nil {
quarantineCorruptFileImpl = quarantineCorruptFileDefault
} else {
quarantineCorruptFileImpl = fn
}
testHookOverrideMu.Unlock()
return func() {
testHookOverrideMu.Lock()
quarantineCorruptFileImpl = prev
testHookOverrideMu.Unlock()
}
}

View File

@ -278,7 +278,30 @@ func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
} }
cfgRemoveErr := lifecycleConfig(t) cfgRemoveErr := lifecycleConfig(t)
orchRemoveErr, _ := newHookOrchestrator(t, cfgRemoveErr, listRun, listRun) snapshotPath := filepath.Join(cfgRemoveErr.State.Dir, "scaled-workloads.json")
removeErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
return "monitoring\tgrafana\t1\n", nil
case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
return "", nil
case name == "kubectl" && strings.Contains(command, " scale deployment grafana --replicas=1"):
if err := os.Remove(snapshotPath); err != nil && !os.IsNotExist(err) {
return "", err
}
if err := os.MkdirAll(snapshotPath, 0o755); err != nil {
return "", err
}
if err := os.WriteFile(filepath.Join(snapshotPath, "keep"), []byte("x"), 0o644); err != nil {
return "", err
}
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchRemoveErr, _ := newHookOrchestrator(t, cfgRemoveErr, removeErrRun, removeErrRun)
entries, err = orchRemoveErr.TestHookListScalableWorkloads(context.Background()) entries, err = orchRemoveErr.TestHookListScalableWorkloads(context.Background())
if err != nil { if err != nil {
t.Fatalf("list entries for remove-error case: %v", err) t.Fatalf("list entries for remove-error case: %v", err)
@ -286,10 +309,6 @@ func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
if err := orchRemoveErr.TestHookWriteScaledWorkloadSnapshot(entries); err != nil { if err := orchRemoveErr.TestHookWriteScaledWorkloadSnapshot(entries); err != nil {
t.Fatalf("seed snapshot for remove-error case: %v", err) t.Fatalf("seed snapshot for remove-error case: %v", err)
} }
if err := os.Chmod(cfgRemoveErr.State.Dir, 0o500); err != nil {
t.Fatalf("chmod state dir readonly: %v", err)
}
t.Cleanup(func() { _ = os.Chmod(cfgRemoveErr.State.Dir, 0o700) })
if err := orchRemoveErr.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "remove scaled workload snapshot") { if err := orchRemoveErr.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "remove scaled workload snapshot") {
t.Fatalf("expected remove snapshot error, got %v", err) t.Fatalf("expected remove snapshot error, got %v", err)
} }

View File

@ -85,10 +85,13 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
}); err != nil { }); err != nil {
t.Fatalf("seed stale startup intent: %v", err) t.Fatalf("seed stale startup intent: %v", err)
} }
if err := os.Chmod(cfg.State.IntentPath, 0o400); err != nil { restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
t.Fatalf("chmod intent file readonly: %v", err) if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") {
return errors.New("forced intent clear failure")
} }
defer os.Chmod(cfg.State.IntentPath, 0o600) return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"})
if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") { if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") {
@ -106,10 +109,13 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
}); err != nil { }); err != nil {
t.Fatalf("seed stale shutdown intent: %v", err) t.Fatalf("seed stale shutdown intent: %v", err)
} }
if err := os.Chmod(cfg.State.IntentPath, 0o400); err != nil { restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
t.Fatalf("chmod intent file readonly: %v", err) if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") {
return errors.New("forced intent clear failure")
} }
defer os.Chmod(cfg.State.IntentPath, 0o600) return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"})
if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") { if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") {
@ -177,10 +183,13 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
}); err != nil { }); err != nil {
t.Fatalf("seed normal intent: %v", err) t.Fatalf("seed normal intent: %v", err)
} }
if err := os.Chmod(cfg.State.IntentPath, 0o400); err != nil { restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
t.Fatalf("chmod intent file readonly: %v", err) if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress {
return errors.New("forced startup intent write failure")
} }
defer os.Chmod(cfg.State.IntentPath, 0o600) return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"})
if err == nil || !strings.Contains(err.Error(), "set startup intent") { if err == nil || !strings.Contains(err.Error(), "set startup intent") {

View File

@ -1,6 +1,7 @@
package statequality package statequality
import ( import (
"errors"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
@ -62,10 +63,13 @@ func TestStateStoreLockAndEstimatorBranches(t *testing.T) {
t.Fatalf("mkdir lock dir: %v", err) t.Fatalf("mkdir lock dir: %v", err)
} }
lockPath := filepath.Join(lockDir, "ananke.lock") lockPath := filepath.Join(lockDir, "ananke.lock")
if err := os.WriteFile(lockPath, []byte("pid=999999 started=1970-01-01T00:00:00Z\n"), 0o000); err != nil { lockTargetDir := filepath.Join(lockDir, "lock-target")
t.Fatalf("write unreadable lock file: %v", err) if err := os.MkdirAll(lockTargetDir, 0o755); err != nil {
t.Fatalf("mkdir lock target dir: %v", err)
}
if err := os.Symlink(lockTargetDir, lockPath); err != nil {
t.Fatalf("create lock symlink to directory: %v", err)
} }
t.Cleanup(func() { _ = os.Chmod(lockPath, 0o600) })
if _, err := state.AcquireLock(lockPath); err == nil || !strings.Contains(err.Error(), "existing lock check failed") { if _, err := state.AcquireLock(lockPath); err == nil || !strings.Contains(err.Error(), "existing lock check failed") {
t.Fatalf("expected stale lock check failure, got %v", err) t.Fatalf("expected stale lock check failure, got %v", err)
} }
@ -95,18 +99,14 @@ func TestStateStoreLockAndEstimatorBranches(t *testing.T) {
// Why: verifies corrupt-state auto-heal reports a useful error when filesystem permissions block quarantine. // Why: verifies corrupt-state auto-heal reports a useful error when filesystem permissions block quarantine.
func TestStateAutoHealFailurePath(t *testing.T) { func TestStateAutoHealFailurePath(t *testing.T) {
root := t.TempDir() root := t.TempDir()
dir := filepath.Join(root, "readonly") path := filepath.Join(root, "runs.json")
if err := os.MkdirAll(dir, 0o700); err != nil {
t.Fatalf("mkdir readonly dir: %v", err)
}
path := filepath.Join(dir, "runs.json")
if err := os.WriteFile(path, []byte("{invalid-json"), 0o600); err != nil { if err := os.WriteFile(path, []byte("{invalid-json"), 0o600); err != nil {
t.Fatalf("write corrupt runs file: %v", err) t.Fatalf("write corrupt runs file: %v", err)
} }
if err := os.Chmod(dir, 0o500); err != nil { restore := state.TestHookSetQuarantineCorruptFileOverride(func(path string, payload []byte, replacement []byte, mode os.FileMode) error {
t.Fatalf("chmod readonly dir: %v", err) return errors.New("forced quarantine failure")
} })
t.Cleanup(func() { _ = os.Chmod(dir, 0o700) }) t.Cleanup(restore)
_, err := state.New(path).Load() _, err := state.New(path).Load()
if err == nil { if err == nil {
@ -146,19 +146,12 @@ func TestStateQuarantineHookBranches(t *testing.T) {
t.Fatalf("expected mkdir failure when parent is a file") t.Fatalf("expected mkdir failure when parent is a file")
} }
// Read-only directory forces backup write failure branch. // Long filename forces backup write failure branch regardless of root privileges.
roDir := filepath.Join(root, "readonly") longName := strings.Repeat("a", 245)
if err := os.MkdirAll(roDir, 0o700); err != nil { backupFailPath := filepath.Join(root, "long-name", longName)
t.Fatalf("mkdir readonly dir: %v", err) err = state.TestHookQuarantineCorruptFile(backupFailPath, []byte("bad"), []byte("{}\n"), 0o640)
}
roPath := filepath.Join(roDir, "state.json")
if err := os.Chmod(roDir, 0o500); err != nil {
t.Fatalf("chmod readonly dir: %v", err)
}
t.Cleanup(func() { _ = os.Chmod(roDir, 0o700) })
err = state.TestHookQuarantineCorruptFile(roPath, []byte("bad"), []byte("{}\n"), 0o640)
if err == nil || !strings.Contains(err.Error(), "write backup") { if err == nil || !strings.Contains(err.Error(), "write backup") {
t.Fatalf("expected backup write failure in readonly directory, got %v", err) t.Fatalf("expected backup write failure for long-name backup path, got %v", err)
} }
} }

View File

@ -1,6 +1,7 @@
package statequality package statequality
import ( import (
"errors"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
@ -102,18 +103,14 @@ func TestStateIntentParserAdditionalEdges(t *testing.T) {
// Why: covers ReadIntent branch where corrupt file quarantine fails and error details are returned. // Why: covers ReadIntent branch where corrupt file quarantine fails and error details are returned.
func TestStateIntentAutoHealFailureBranch(t *testing.T) { func TestStateIntentAutoHealFailureBranch(t *testing.T) {
root := t.TempDir() root := t.TempDir()
dir := filepath.Join(root, "ro") path := filepath.Join(root, "intent.json")
if err := os.MkdirAll(dir, 0o700); err != nil {
t.Fatalf("mkdir readonly dir: %v", err)
}
path := filepath.Join(dir, "intent.json")
if err := os.WriteFile(path, []byte("{bad-json"), 0o600); err != nil { if err := os.WriteFile(path, []byte("{bad-json"), 0o600); err != nil {
t.Fatalf("write malformed intent file: %v", err) t.Fatalf("write malformed intent file: %v", err)
} }
if err := os.Chmod(dir, 0o500); err != nil { restore := state.TestHookSetQuarantineCorruptFileOverride(func(path string, payload []byte, replacement []byte, mode os.FileMode) error {
t.Fatalf("chmod readonly dir: %v", err) return errors.New("forced intent quarantine failure")
} })
t.Cleanup(func() { _ = os.Chmod(dir, 0o700) }) t.Cleanup(restore)
_, err := state.ReadIntent(path) _, err := state.ReadIntent(path)
if err == nil { if err == nil {

View File

@ -0,0 +1,70 @@
package statequality
import (
"errors"
"os"
"path/filepath"
"strings"
"testing"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestStateTestHookOverrideSetters runs one orchestration or CLI step.
// Signature: TestStateTestHookOverrideSetters(t *testing.T).
// Why: drives top-level test-hook override branches so root-mode quality gates
// keep deterministic failure injection coverage above the per-file threshold.
func TestStateTestHookOverrideSetters(t *testing.T) {
root := t.TempDir()
intentPath := filepath.Join(root, "intent.json")
restoreWriteNil := state.TestHookSetWriteIntentOverride(nil)
if err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal}); err != nil {
t.Fatalf("expected default write intent path after nil override, got %v", err)
}
restoreWriteNil()
writeOverrideCalled := false
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
writeOverrideCalled = true
return errors.New("forced write override")
})
err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
if err == nil || !strings.Contains(err.Error(), "forced write override") {
t.Fatalf("expected forced write override error, got %v", err)
}
if !writeOverrideCalled {
t.Fatalf("expected write override to be invoked")
}
restoreWrite()
if err := state.TestHookWriteIntentDefault(filepath.Join(root, "intent-default.json"), state.Intent{State: state.IntentNormal}); err != nil {
t.Fatalf("expected explicit default write helper to succeed, got %v", err)
}
payload := []byte("{bad-json")
replacement := []byte("{}\n")
quarantinePath := filepath.Join(root, "quarantine-default.json")
if err := state.TestHookQuarantineCorruptFileDefault(quarantinePath, payload, replacement, 0o640); err != nil {
t.Fatalf("expected explicit default quarantine helper to succeed, got %v", err)
}
restoreQuarantineNil := state.TestHookSetQuarantineCorruptFileOverride(nil)
restoreQuarantineNil()
quarantineOverrideCalled := false
restoreQuarantine := state.TestHookSetQuarantineCorruptFileOverride(func(path string, payload []byte, replacement []byte, mode os.FileMode) error {
quarantineOverrideCalled = true
return errors.New("forced quarantine override")
})
err = state.TestHookQuarantineCorruptFile(filepath.Join(root, "quarantine-forced.json"), payload, replacement, 0o640)
if err == nil || !strings.Contains(err.Error(), "forced quarantine override") {
t.Fatalf("expected forced quarantine override error, got %v", err)
}
if !quarantineOverrideCalled {
t.Fatalf("expected quarantine override to be invoked")
}
restoreQuarantine()
if err := state.TestHookQuarantineCorruptFile(filepath.Join(root, "quarantine-after-restore.json"), payload, replacement, 0o640); err != nil {
t.Fatalf("expected quarantine helper to use default impl after restore, got %v", err)
}
}