// Source: soteria/internal/server/metrics_test.go (252 lines, 8.4 KiB, Go)

package server
import (
"testing"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
)
// TestMetricValuesEmptyAndSetMetricClonesLabels checks two things: that
// metricValues yields nothing for an empty sample map, and that a sample
// written through setMetric keeps its original labels and value even when the
// caller mutates the label map afterwards.
func TestMetricValuesEmptyAndSetMetricClonesLabels(t *testing.T) {
	empty := metricValues(map[string]metricSample{})
	if len(empty) != 0 {
		t.Fatalf("expected no metric values for empty source, got %#v", empty)
	}

	samples := make(map[string]metricSample)
	input := map[string]string{"namespace": "apps", "pvc": "data"}
	setMetric(samples, input, 42)
	// Mutate the caller-owned map after the write; the stored sample is
	// expected to be unaffected.
	input["namespace"] = "mutated"

	collected := metricValues(samples)
	if len(collected) != 1 {
		t.Fatalf("expected one metric sample, got %#v", collected)
	}

	first := collected[0]
	if first.labels["namespace"] != "apps" {
		t.Fatalf("expected cloned labels to remain unchanged, got %#v", first.labels)
	}
	if first.value != 42 {
		t.Fatalf("expected metric value 42, got %v", first.value)
	}
}
// TestTelemetryRecordInventoryFailureIncrementsCounter records two inventory
// failures on a fresh telemetry value and expects the failure counter to
// read 2 afterwards.
func TestTelemetryRecordInventoryFailureIncrementsCounter(t *testing.T) {
	tel := newTelemetry()
	for i := 0; i < 2; i++ {
		tel.RecordInventoryFailure()
	}

	if got := tel.inventoryRefreshFailure; got != 2 {
		t.Fatalf("expected inventory refresh failure count 2, got %v", got)
	}
}
// TestTelemetryRecordInventoryPopulatesAndResetsMetrics records an inventory
// holding one healthy PVC and one unhealthy PVC, verifies the per-PVC samples
// stored on the telemetry value, and then confirms that recording an empty
// inventory leaves the count and health-reason maps empty.
func TestTelemetryRecordInventoryPopulatesAndResetsMetrics(t *testing.T) {
	tel := newTelemetry()
	backupTime := time.Date(2026, 4, 20, 13, 14, 15, 0, time.UTC)

	// Healthy PVC: parseable last-backup timestamp, no explicit health reason.
	healthyPVC := api.PVCInventory{
		Namespace:            "apps",
		PVC:                  "data",
		Volume:               "pv-apps-data",
		Driver:               "restic",
		LastBackupAt:         backupTime.Format(time.RFC3339),
		LastBackupAgeHours:   2.5,
		BackupCount:          3,
		CompletedBackups:     2,
		LastBackupSizeBytes:  512,
		TotalBackupSizeBytes: 2048,
		Healthy:              true,
	}
	// Unhealthy PVC: last-backup timestamp is not a valid time string.
	stalePVC := api.PVCInventory{
		Namespace:            "apps",
		PVC:                  "cache",
		Volume:               "pv-apps-cache",
		Driver:               "restic",
		LastBackupAt:         "not-a-time",
		LastBackupAgeHours:   99,
		BackupCount:          1,
		CompletedBackups:     0,
		LastBackupSizeBytes:  0,
		TotalBackupSizeBytes: 0,
		Healthy:              false,
		HealthReason:         "stale",
	}

	tel.RecordInventory(api.InventoryResponse{
		Namespaces: []api.NamespaceInventory{
			{
				Name: "apps",
				PVCs: []api.PVCInventory{healthyPVC, stalePVC},
			},
		},
	})

	// labelsFor builds the base label set shared by every per-PVC series.
	labelsFor := func(pvc, volume string) map[string]string {
		return map[string]string{
			"namespace": "apps",
			"pvc":       pvc,
			"volume":    volume,
			"driver":    "restic",
		}
	}
	// withReason extends a freshly built label set with a "reason" label.
	withReason := func(labels map[string]string, reason string) map[string]string {
		labels["reason"] = reason
		return labels
	}

	dataKey := metricKey(labelsFor("data", "pv-apps-data"))
	cacheKey := metricKey(labelsFor("cache", "pv-apps-cache"))
	dataReasonKey := metricKey(withReason(labelsFor("data", "pv-apps-data"), "unknown"))
	cacheReasonKey := metricKey(withReason(labelsFor("cache", "pv-apps-cache"), "stale"))

	if refreshed := tel.inventoryRefreshTime; refreshed <= 0 {
		t.Fatalf("expected inventory refresh time to be recorded, got %v", refreshed)
	}

	// Counts and sizes for the healthy PVC.
	if sample := tel.pvcBackupCount[dataKey]; sample.value != 3 {
		t.Fatalf("expected backup count 3 for data pvc, got %v", sample.value)
	}
	if sample := tel.pvcBackupCompletedCount[dataKey]; sample.value != 2 {
		t.Fatalf("expected completed backup count 2, got %v", sample.value)
	}
	if sample := tel.pvcBackupLastSizeBytes[dataKey]; sample.value != 512 {
		t.Fatalf("expected last backup size 512, got %v", sample.value)
	}
	if sample := tel.pvcBackupTotalSizeBytes[dataKey]; sample.value != 2048 {
		t.Fatalf("expected total backup size 2048, got %v", sample.value)
	}

	// Health flag and health-reason markers for both PVCs.
	if sample := tel.pvcBackupHealth[dataKey]; sample.value != 1 {
		t.Fatalf("expected healthy pvc to emit 1, got %v", sample.value)
	}
	if sample := tel.pvcBackupHealth[cacheKey]; sample.value != 0 {
		t.Fatalf("expected unhealthy pvc to emit 0, got %v", sample.value)
	}
	if sample := tel.pvcBackupHealthReason[dataReasonKey]; sample.value != 1 {
		t.Fatalf("expected unknown health reason marker, got %v", sample.value)
	}
	if sample := tel.pvcBackupHealthReason[cacheReasonKey]; sample.value != 1 {
		t.Fatalf("expected explicit stale health reason marker, got %v", sample.value)
	}

	// Backup age is expected for both PVCs, including the one whose
	// timestamp string is invalid.
	if sample := tel.pvcBackupAgeHours[dataKey]; sample.value != 2.5 {
		t.Fatalf("expected backup age 2.5 hours, got %v", sample.value)
	}
	if sample := tel.pvcBackupAgeHours[cacheKey]; sample.value != 99 {
		t.Fatalf("expected invalid-time pvc to still expose age, got %v", sample.value)
	}

	// The last-success timestamp is expected only where LastBackupAt parsed.
	wantUnix := backupTime.Unix()
	if sample := tel.pvcBackupLastSuccess[dataKey]; sample.value != float64(wantUnix) {
		t.Fatalf("expected last success timestamp %d, got %v", wantUnix, sample.value)
	}
	if _, present := tel.pvcBackupLastSuccess[cacheKey]; present {
		t.Fatal("expected invalid last_backup_at to skip success timestamp")
	}

	// Recording an empty inventory must leave no per-PVC samples behind.
	tel.RecordInventory(api.InventoryResponse{})
	counts, reasons := len(tel.pvcBackupCount), len(tel.pvcBackupHealthReason)
	if !(counts == 0 && reasons == 0) {
		t.Fatalf("expected inventory metrics to reset on empty refresh, got counts=%d reasons=%d",
			counts, reasons)
	}
}
// TestTelemetryRecordB2UsageTracksBucketsAndFallbackTimestamp records a
// successful B2 usage scan with two buckets and verifies the account-level
// and per-bucket values stored on the telemetry value. It then records an
// unavailable scan with an unparseable ScannedAt and expects the scan
// timestamp to land between two wall-clock readings taken around the call,
// with the bucket maps left empty.
func TestTelemetryRecordB2UsageTracksBucketsAndFallbackTimestamp(t *testing.T) {
	tel := newTelemetry()
	scanTime := time.Date(2026, 4, 20, 16, 30, 0, 0, time.UTC)
	modifiedTime := scanTime.Add(-45 * time.Minute)

	// One bucket with a valid last-modified timestamp and one without.
	buckets := []api.B2BucketUsage{
		{
			Name:             "atlas-backups",
			ObjectCount:      44,
			TotalBytes:       1000,
			RecentObjects24h: 3,
			RecentBytes24h:   250,
			LastModifiedAt:   modifiedTime.Format(time.RFC3339),
		},
		{
			Name:             "atlas-logs",
			ObjectCount:      55,
			TotalBytes:       2000,
			RecentObjects24h: 4,
			RecentBytes24h:   640,
			LastModifiedAt:   "invalid",
		},
	}
	tel.RecordB2Usage(api.B2UsageResponse{
		Available:        true,
		ScannedAt:        scanTime.Format(time.RFC3339),
		ScanDurationMS:   2750,
		TotalObjects:     99,
		TotalBytes:       123456,
		RecentObjects24h: 7,
		RecentBytes24h:   890,
		Buckets:          buckets,
	})

	logsKey := metricKey(map[string]string{"bucket": "atlas-logs"})
	backupKey := metricKey(map[string]string{"bucket": "atlas-backups"})

	// Scan-level values.
	if got := tel.b2ScanSuccess; got != 1 {
		t.Fatalf("expected successful scan marker, got %v", got)
	}
	wantScanUnix := scanTime.Unix()
	if got := tel.b2ScanTimestamp; got != float64(wantScanUnix) {
		t.Fatalf("expected scan timestamp %d, got %v", wantScanUnix, got)
	}
	if got := tel.b2ScanDurationSeconds; got != 2.75 {
		t.Fatalf("expected scan duration 2.75s, got %v", got)
	}

	// Account-level totals.
	if tel.b2AccountObjects != 99 || tel.b2AccountBytes != 123456 {
		t.Fatalf("unexpected account usage totals: objects=%v bytes=%v", tel.b2AccountObjects, tel.b2AccountBytes)
	}
	if tel.b2AccountRecentObjects != 7 || tel.b2AccountRecentBytes != 890 {
		t.Fatalf("unexpected recent account usage: objects=%v bytes=%v", tel.b2AccountRecentObjects, tel.b2AccountRecentBytes)
	}

	// Per-bucket samples.
	if sample := tel.b2BucketObjects[backupKey]; sample.value != 44 {
		t.Fatalf("expected atlas-backups object count 44, got %v", sample.value)
	}
	if sample := tel.b2BucketBytes[logsKey]; sample.value != 2000 {
		t.Fatalf("expected atlas-logs bytes 2000, got %v", sample.value)
	}
	if sample := tel.b2BucketRecentBytes[backupKey]; sample.value != 250 {
		t.Fatalf("expected atlas-backups recent bytes 250, got %v", sample.value)
	}
	wantModifiedUnix := modifiedTime.Unix()
	if sample := tel.b2BucketLastModified[backupKey]; sample.value != float64(wantModifiedUnix) {
		t.Fatalf("expected atlas-backups last modified timestamp %d, got %v", wantModifiedUnix, sample.value)
	}
	if _, present := tel.b2BucketLastModified[logsKey]; present {
		t.Fatal("expected invalid bucket last modified timestamp to be ignored")
	}

	// Second refresh: unavailable scan with an unparseable ScannedAt. The
	// wall clock is sampled on both sides of the call to bound the fallback.
	before := time.Now().Unix()
	tel.RecordB2Usage(api.B2UsageResponse{
		Available:      false,
		ScannedAt:      "definitely-invalid",
		ScanDurationMS: 500,
	})
	after := time.Now().Unix()

	if got := tel.b2ScanSuccess; got != 0 {
		t.Fatalf("expected failed scan marker after unavailable refresh, got %v", got)
	}
	if got := tel.b2ScanDurationSeconds; got != 0.5 {
		t.Fatalf("expected scan duration 0.5s, got %v", got)
	}
	lower, upper := float64(before), float64(after)
	if ts := tel.b2ScanTimestamp; ts < lower || ts > upper {
		t.Fatalf("expected invalid scanned_at to fall back to now, got %v not in [%d,%d]",
			ts, before, after)
	}
	if !(len(tel.b2BucketObjects) == 0 && len(tel.b2BucketLastModified) == 0) {
		t.Fatal("expected bucket metrics to reset when no buckets are provided")
	}
}