test(soteria): cover policy cycle and start loops

This commit is contained in:
codex 2026-04-20 18:47:19 -03:00
parent 7b592edb6d
commit 5c9cca4420
3 changed files with 334 additions and 0 deletions

View File

@@ -2,10 +2,14 @@ package server
import (
"context"
"encoding/json"
"strings"
"testing"
"time"
"net/http"
"net/http/httptest"
"scm.bstein.dev/bstein/soteria/internal/api"
"scm.bstein.dev/bstein/soteria/internal/config"
)
@@ -62,3 +66,52 @@ func TestRefreshB2UsageRecordsScanErrorsAfterCredentialsResolve(t *testing.T) {
t.Fatalf("expected failure metrics after scan error, got success=%f duration=%f", srv.metrics.b2ScanSuccess, srv.metrics.b2ScanDurationSeconds)
}
}
// TestHandleB2UsageRefreshesWhenForcedAndQueryBoolRecognizesTruthyValues
// checks queryBool's accepted truthy spellings, then drives a forced
// ?refresh=yes request and asserts the cached snapshot is replaced by the
// (failing) fresh scan result.
func TestHandleB2UsageRefreshesWhenForcedAndQueryBoolRecognizesTruthyValues(t *testing.T) {
	truthy := []string{"1", "true", "yes", "y", "on", "TRUE"}
	for _, candidate := range truthy {
		if !queryBool(candidate) {
			t.Fatalf("expected %q to be treated as truthy", candidate)
		}
	}
	if queryBool("nope") {
		t.Fatalf("expected invalid truthy value to be false")
	}
	// "https://" has an empty host, so the forced scan is guaranteed to fail.
	cfg := &config.Config{
		B2Enabled:         true,
		B2Endpoint:        "https://",
		B2AccessKeyID:     "atlas-key",
		B2SecretAccessKey: "atlas-secret",
		B2ScanTimeout:     time.Second,
	}
	srv := &Server{
		cfg:     cfg,
		client:  &fakeKubeClient{},
		metrics: newTelemetry(),
	}
	srv.handler = http.HandlerFunc(srv.route)
	// Seed a stale cached snapshot that the forced refresh must overwrite.
	cached := api.B2UsageResponse{
		Enabled:   true,
		Available: true,
		ScannedAt: time.Now().UTC().Add(-1 * time.Hour).Format(time.RFC3339),
		Endpoint:  "cached-endpoint",
	}
	srv.setB2Usage(cached)
	rec := httptest.NewRecorder()
	srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/v1/b2?refresh=yes", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("expected forced refresh request to succeed, got %d: %s", rec.Code, rec.Body.String())
	}
	var payload api.B2UsageResponse
	if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
		t.Fatalf("decode forced refresh payload: %v", err)
	}
	if payload.Error == "" || !strings.Contains(payload.Error, "S3 endpoint host is empty") {
		t.Fatalf("expected refreshed B2 failure payload, got %#v", payload)
	}
	if payload.Endpoint != "https://" {
		t.Fatalf("expected refreshed endpoint to replace cached snapshot, got %#v", payload)
	}
}

View File

@@ -0,0 +1,156 @@
package server
import (
"context"
"errors"
"testing"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
"scm.bstein.dev/bstein/soteria/internal/config"
"scm.bstein.dev/bstein/soteria/internal/k8s"
)
// policyCycleTestKubeClient wraps inventoryTestKubeClient so policy-cycle
// tests can inject per-PVC CreateBackupJob failures and capture the backup
// requests the policy runner issues.
type policyCycleTestKubeClient struct {
*inventoryTestKubeClient
// createBackupErrForPVC maps a PVC name to the error CreateBackupJob returns for it.
createBackupErrForPVC map[string]error
// backupRequests records every request passed to CreateBackupJob, in call order.
backupRequests []api.BackupRequest
}
// CreateBackupJob records the incoming request, returns the injected per-PVC
// error when one is configured, and otherwise delegates to the embedded fake.
func (k *policyCycleTestKubeClient) CreateBackupJob(ctx context.Context, cfg *config.Config, req api.BackupRequest) (string, string, error) {
	k.backupRequests = append(k.backupRequests, req)
	if injected, ok := k.createBackupErrForPVC[req.PVC]; ok && injected != nil {
		return "", "", injected
	}
	return k.inventoryTestKubeClient.fakeKubeClient.CreateBackupJob(ctx, cfg, req)
}
// metricCount returns the recorded value for the sample keyed by labels, or
// zero when no such sample has been recorded.
func metricCount(samples map[string]metricSample, labels map[string]string) float64 {
	if sample, ok := samples[metricKey(labels)]; ok {
		return sample.value
	}
	return 0
}
// findBackupRequestByPVC returns the first request targeting pvc and whether
// one was found.
func findBackupRequestByPVC(items []api.BackupRequest, pvc string) (api.BackupRequest, bool) {
	for i := range items {
		if items[i].PVC == pvc {
			return items[i], true
		}
	}
	var zero api.BackupRequest
	return zero, false
}
// TestRunPolicyCycleCoversInventoryErrorAndConcurrentGuard exercises two
// runPolicyCycle edge paths: a failing PVC inventory listing (counted as an
// inventory_error policy metric with the runner lock released afterwards),
// and the srv.running guard that skips a cycle already in progress.
func TestRunPolicyCycleCoversInventoryErrorAndConcurrentGuard(t *testing.T) {
client := &policyCycleTestKubeClient{
inventoryTestKubeClient: &inventoryTestKubeClient{
fakeKubeClient: &fakeKubeClient{},
// Force the inventory listing itself to fail before any policy runs.
listPVCsErr: errors.New("inventory exploded"),
},
}
srv := &Server{
cfg: &config.Config{
BackupDriver: "restic",
BackupMaxAge: 24 * time.Hour,
PolicyEvalInterval: time.Minute,
},
client: client,
longhorn: &fakeLonghornClient{},
metrics: newTelemetry(),
policies: map[string]api.BackupPolicy{
"apps__all": {ID: "apps__all", Namespace: "apps", IntervalHours: 6, Enabled: true, Dedupe: true},
},
}
// First cycle: inventory error is counted once and the lock is released.
srv.runPolicyCycle(context.Background())
if got := metricCount(srv.metrics.policyBackups, map[string]string{"result": "inventory_error"}); got != 1 {
t.Fatalf("expected inventory_error policy metric, got %f", got)
}
if srv.running {
t.Fatalf("expected policy runner lock to be released after inventory failure")
}
// Second cycle with running pre-set: the guard must skip the run entirely,
// leaving the inventory_error count unchanged.
srv.running = true
srv.runPolicyCycle(context.Background())
if got := metricCount(srv.metrics.policyBackups, map[string]string{"result": "inventory_error"}); got != 1 {
t.Fatalf("expected concurrent guard to skip second run, got %f", got)
}
}
// TestRunPolicyCycleCoversEffectivePoliciesAndResultTracking builds an
// inventory of five PVCs whose job history drives every policy outcome:
// "data" (PVC-specific policy wins, executes successfully), "fail" (execution
// returns a backend error), "busy" (running job -> in_progress), "recent"
// (fresh completed backup -> not_due), and "attempted" (recent failed attempt
// -> not_due). The disabled policy must never apply.
func TestRunPolicyCycleCoversEffectivePoliciesAndResultTracking(t *testing.T) {
now := time.Now().UTC()
// recent is within both the 24h max age and the 1h/6h policy intervals.
recent := now.Add(-30 * time.Minute)
client := &policyCycleTestKubeClient{
inventoryTestKubeClient: &inventoryTestKubeClient{
fakeKubeClient: &fakeKubeClient{
pvcs: []k8s.PVCSummary{
{Namespace: "apps", Name: "data", VolumeName: "vol-data", Phase: "Bound"},
{Namespace: "apps", Name: "busy", VolumeName: "vol-busy", Phase: "Bound"},
{Namespace: "apps", Name: "recent", VolumeName: "vol-recent", Phase: "Bound"},
{Namespace: "apps", Name: "attempted", VolumeName: "vol-attempted", Phase: "Bound"},
{Namespace: "apps", Name: "fail", VolumeName: "vol-fail", Phase: "Bound"},
},
backupJobs: map[string][]k8s.BackupJobSummary{
"apps/busy": {
{Name: "job-busy", Namespace: "apps", PVC: "busy", State: "Running", CreatedAt: recent},
},
"apps/recent": {
{Name: "job-recent", Namespace: "apps", PVC: "recent", State: "Completed", CreatedAt: recent, CompletionTime: recent, KeepLast: 1},
},
"apps/attempted": {
{Name: "job-attempted", Namespace: "apps", PVC: "attempted", State: "Failed", CreatedAt: recent, KeepLast: 1},
},
},
},
},
// Only the "fail" PVC's CreateBackupJob call errors out.
createBackupErrForPVC: map[string]error{
"fail": errors.New("create backup job exploded"),
},
}
srv := &Server{
cfg: &config.Config{
BackupDriver: "restic",
BackupMaxAge: 24 * time.Hour,
ResticRepository: "s3:https://repo/root",
},
client: client,
longhorn: &fakeLonghornClient{},
metrics: newTelemetry(),
policies: map[string]api.BackupPolicy{
"apps__all": {ID: "apps__all", Namespace: "apps", IntervalHours: 6, Enabled: true, Dedupe: true, KeepLast: 1},
"apps__data": {ID: "apps__data", Namespace: "apps", PVC: "data", IntervalHours: 1, Enabled: true, Dedupe: false, KeepLast: 2},
"apps__disabled": {ID: "apps__disabled", Namespace: "apps", PVC: "disabled", IntervalHours: 1, Enabled: false, Dedupe: false, KeepLast: 5},
},
}
srv.runPolicyCycle(context.Background())
// Only "data" and "fail" are due; the others are filtered before execution.
if len(client.backupRequests) != 2 {
t.Fatalf("expected two executed backup requests, got %#v", client.backupRequests)
}
// The PVC-specific apps__data policy (Dedupe=false, KeepLast=2) must
// override the namespace-wide apps__all settings for "data".
dataReq, ok := findBackupRequestByPVC(client.backupRequests, "data")
if !ok || dataReq.Dedupe == nil || *dataReq.Dedupe != false || dataReq.KeepLast == nil || *dataReq.KeepLast != 2 {
t.Fatalf("expected stricter PVC policy to win for data, got %#v", client.backupRequests)
}
if got := metricCount(srv.metrics.policyBackups, map[string]string{"result": "success"}); got != 1 {
t.Fatalf("expected one successful policy backup, got %f", got)
}
if got := metricCount(srv.metrics.policyBackups, map[string]string{"result": "backend_error"}); got != 1 {
t.Fatalf("expected one backend error policy backup, got %f", got)
}
if got := metricCount(srv.metrics.policyBackups, map[string]string{"result": "in_progress"}); got != 1 {
t.Fatalf("expected one in-progress policy backup, got %f", got)
}
if got := metricCount(srv.metrics.policyBackups, map[string]string{"result": "not_due"}); got != 2 {
t.Fatalf("expected two not-due policy backups, got %f", got)
}
if got := metricCount(srv.metrics.backupRequests, map[string]string{"driver": "restic", "result": "success"}); got != 1 {
t.Fatalf("expected one successful executed backup request, got %f", got)
}
if got := metricCount(srv.metrics.backupRequests, map[string]string{"driver": "restic", "result": "backend_error"}); got != 1 {
t.Fatalf("expected one failed executed backup request, got %f", got)
}
}

View File

@@ -0,0 +1,125 @@
package server
import (
"context"
"strings"
"sync"
"testing"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
"scm.bstein.dev/bstein/soteria/internal/config"
"scm.bstein.dev/bstein/soteria/internal/k8s"
)
// startTestKubeClient wraps fakeKubeClient to count the secret loads and PVC
// listings performed by Start's background loops. mu guards the counters
// because those loops run in separate goroutines.
type startTestKubeClient struct {
*fakeKubeClient
mu sync.Mutex
// loadCalls counts LoadSecretData invocations.
loadCalls int
// listPVCCalls counts ListBoundPVCs invocations.
listPVCCalls int
}
// LoadSecretData bumps the mutex-guarded load counter, then delegates to the
// embedded fake client.
func (k *startTestKubeClient) LoadSecretData(ctx context.Context, namespace, secretName, key string) ([]byte, error) {
	k.mu.Lock()
	k.loadCalls = k.loadCalls + 1
	k.mu.Unlock()

	return k.fakeKubeClient.LoadSecretData(ctx, namespace, secretName, key)
}
// ListBoundPVCs bumps the mutex-guarded list counter, then delegates to the
// embedded fake client.
func (k *startTestKubeClient) ListBoundPVCs(ctx context.Context) ([]k8s.PVCSummary, error) {
	k.mu.Lock()
	k.listPVCCalls = k.listPVCCalls + 1
	k.mu.Unlock()

	return k.fakeKubeClient.ListBoundPVCs(ctx)
}
// counts returns a consistent snapshot of (loadCalls, listPVCCalls).
func (k *startTestKubeClient) counts() (loadCalls, listPVCCalls int) {
	k.mu.Lock()
	loadCalls = k.loadCalls
	listPVCCalls = k.listPVCCalls
	k.mu.Unlock()
	return loadCalls, listPVCCalls
}
// newStartTestServer assembles a Server wired with fake dependencies and
// empty caches, ready for Start-loop tests.
func newStartTestServer(cfg *config.Config, client kubeClient) *Server {
	srv := &Server{
		cfg:        cfg,
		client:     client,
		longhorn:   &fakeLonghornClient{},
		metrics:    newTelemetry(),
		ui:         newUIRenderer(),
		policies:   make(map[string]api.BackupPolicy),
		jobUsage:   make(map[string]resticJobUsageCacheEntry),
		usageStore: make(map[string]resticPersistedUsageEntry),
	}
	return srv
}
// TestStartRunsInitialLoadAndTickerLoopWithoutB2 verifies that Start performs
// the initial policy/usage secret loads and inventory refresh, then keeps
// refreshing on its tickers, with the B2 scanner disabled.
func TestStartRunsInitialLoadAndTickerLoopWithoutB2(t *testing.T) {
	client := &startTestKubeClient{
		fakeKubeClient: &fakeKubeClient{
			pvcs: []k8s.PVCSummary{{Namespace: "apps", Name: "data", VolumeName: "vol-data", Phase: "Bound"}},
		},
	}
	srv := newStartTestServer(&config.Config{
		Namespace:              "atlas",
		PolicySecretName:       "soteria-policies",
		UsageSecretName:        "soteria-usage",
		BackupDriver:           "longhorn",
		BackupMaxAge:           24 * time.Hour,
		MetricsRefreshInterval: 10 * time.Millisecond,
		PolicyEvalInterval:     10 * time.Millisecond,
		B2Enabled:              false,
	}, client)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	srv.Start(ctx)
	// Poll for the expected activity instead of one fixed 35ms sleep: on a
	// loaded CI machine the ticker goroutines may not be scheduled in time,
	// which makes sleep-based synchronization flaky.
	deadline := time.Now().Add(2 * time.Second)
	var loadCalls, listPVCCalls int
	for time.Now().Before(deadline) {
		loadCalls, listPVCCalls = client.counts()
		if loadCalls >= 2 && listPVCCalls >= 2 {
			break
		}
		time.Sleep(5 * time.Millisecond)
	}
	cancel()
	// Give the background goroutines a moment to observe cancellation before
	// reading shared metrics state.
	time.Sleep(20 * time.Millisecond)
	loadCalls, listPVCCalls = client.counts()
	if loadCalls < 2 {
		t.Fatalf("expected initial policy/usage secret loads, got %d", loadCalls)
	}
	if listPVCCalls < 2 {
		t.Fatalf("expected inventory refresh to run initially and on ticker, got %d", listPVCCalls)
	}
	if srv.metrics.inventoryRefreshTime == 0 {
		t.Fatalf("expected inventory metrics to be recorded after start loop")
	}
}
// TestStartRunsB2TickerAndStoresRefreshFailures verifies that when B2 is
// enabled Start launches the periodic B2 scan, and that a scan failure (the
// "https://" endpoint has an empty host) is stored in the usage snapshot and
// reflected in the scan metrics.
func TestStartRunsB2TickerAndStoresRefreshFailures(t *testing.T) {
	client := &startTestKubeClient{
		fakeKubeClient: &fakeKubeClient{
			pvcs: []k8s.PVCSummary{{Namespace: "apps", Name: "data", VolumeName: "vol-data", Phase: "Bound"}},
		},
	}
	srv := newStartTestServer(&config.Config{
		Namespace:              "atlas",
		PolicySecretName:       "soteria-policies",
		UsageSecretName:        "soteria-usage",
		BackupDriver:           "longhorn",
		BackupMaxAge:           24 * time.Hour,
		MetricsRefreshInterval: 10 * time.Millisecond,
		PolicyEvalInterval:     10 * time.Millisecond,
		B2Enabled:              true,
		B2Endpoint:             "https://",
		B2AccessKeyID:          "atlas-key",
		B2SecretAccessKey:      "atlas-secret",
		B2ScanInterval:         10 * time.Millisecond,
		B2ScanTimeout:          10 * time.Millisecond,
	}, client)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	srv.Start(ctx)
	// Poll for the failure snapshot instead of a fixed sleep so the test does
	// not flake when the scan goroutine is scheduled late.
	deadline := time.Now().Add(2 * time.Second)
	var usage api.B2UsageResponse
	for time.Now().Before(deadline) {
		usage = srv.getB2Usage()
		if usage.Error != "" {
			break
		}
		time.Sleep(5 * time.Millisecond)
	}
	cancel()
	// Let the background goroutines observe cancellation before reading
	// shared metrics state.
	time.Sleep(20 * time.Millisecond)
	if !usage.Enabled || usage.Error == "" || !strings.Contains(usage.Error, "S3 endpoint host is empty") {
		t.Fatalf("expected B2 ticker refresh failure snapshot, got %#v", usage)
	}
	if srv.metrics.b2ScanTimestamp == 0 {
		t.Fatalf("expected B2 scan metrics to be recorded during start loop")
	}
}