package server

import (
	"fmt"
	"net/http"
	"sort"
	"strings"
	"sync"
	"time"

	"scm.bstein.dev/bstein/soteria/internal/api"
)

// metricSample is a single exposition sample: one label set and its value.
type metricSample struct {
	labels map[string]string
	value  float64
}

// telemetry is the in-memory store behind the metrics endpoint. Labelled
// series live in maps keyed by their rendered label set; unlabelled series
// are plain float64 fields. The methods in this file take mu before reading
// or writing any other field.
type telemetry struct {
	mu sync.RWMutex

	// Request and denial counters.
	backupRequests          map[string]metricSample
	restoreRequests         map[string]metricSample
	policyBackups           map[string]metricSample
	namespaceBackupRequests map[string]metricSample
	namespaceRestoreReqs    map[string]metricSample
	authzDenials            map[string]metricSample

	// Inventory refresh bookkeeping.
	inventoryRefreshFailure float64
	inventoryRefreshTime    float64

	// Per-PVC backup gauges, replaced wholesale by RecordInventory.
	pvcBackupAgeHours       map[string]metricSample
	pvcBackupHealth         map[string]metricSample
	pvcBackupHealthReason   map[string]metricSample
	pvcBackupLastSuccess    map[string]metricSample
	pvcBackupCount          map[string]metricSample
	pvcBackupCompletedCount map[string]metricSample
	pvcBackupLastSizeBytes  map[string]metricSample
	pvcBackupTotalSizeBytes map[string]metricSample

	// Per-bucket B2 gauges, replaced wholesale by RecordB2Usage.
	b2BucketObjects       map[string]metricSample
	b2BucketBytes         map[string]metricSample
	b2BucketRecentObjects map[string]metricSample
	b2BucketRecentBytes   map[string]metricSample
	b2BucketLastModified  map[string]metricSample

	// Account-wide B2 scan gauges.
	b2ScanSuccess          float64
	b2ScanTimestamp        float64
	b2ScanDurationSeconds  float64
	b2AccountObjects       float64
	b2AccountBytes         float64
	b2AccountRecentObjects float64
	b2AccountRecentBytes   float64
}

// newTelemetry returns a telemetry store whose series maps are all allocated
// and empty, so the Record* methods can write to them immediately.
func newTelemetry() *telemetry {
	emptySeries := func() map[string]metricSample {
		return make(map[string]metricSample)
	}

	t := &telemetry{}

	t.backupRequests = emptySeries()
	t.restoreRequests = emptySeries()
	t.policyBackups = emptySeries()
	t.namespaceBackupRequests = emptySeries()
	t.namespaceRestoreReqs = emptySeries()
	t.authzDenials = emptySeries()

	t.pvcBackupAgeHours = emptySeries()
	t.pvcBackupHealth = emptySeries()
	t.pvcBackupHealthReason = emptySeries()
	t.pvcBackupLastSuccess = emptySeries()
	t.pvcBackupCount = emptySeries()
	t.pvcBackupCompletedCount = emptySeries()
	t.pvcBackupLastSizeBytes = emptySeries()
	t.pvcBackupTotalSizeBytes = emptySeries()

	t.b2BucketObjects = emptySeries()
	t.b2BucketBytes = emptySeries()
	t.b2BucketRecentObjects = emptySeries()
	t.b2BucketRecentBytes = emptySeries()
	t.b2BucketLastModified = emptySeries()

	return t
}

// Handler returns an http.Handler that serves the rendered metrics as
// text/plain (exposition format version 0.0.4) on every request.
func (t *telemetry) Handler() http.Handler {
	serve := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
		payload := []byte(t.render())
		// The write result is deliberately discarded.
		_, _ = w.Write(payload)
	}
	return http.HandlerFunc(serve)
}

// RecordBackupRequest bumps the backup request counter for the given
// driver/result pair.
func (t *telemetry) RecordBackupRequest(driver, result string) {
	labels := map[string]string{"driver": driver, "result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.backupRequests, labels)
}

// RecordRestoreRequest bumps the restore request counter for the given
// driver/result pair.
func (t *telemetry) RecordRestoreRequest(driver, result string) {
	labels := map[string]string{"driver": driver, "result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.restoreRequests, labels)
}

// RecordPolicyBackup bumps the policy-cycle backup counter for the given
// result.
func (t *telemetry) RecordPolicyBackup(result string) {
	labels := map[string]string{"result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.policyBackups, labels)
}

// RecordNamespaceBackupRequest bumps the namespace-level backup counter for
// the given driver/result pair.
func (t *telemetry) RecordNamespaceBackupRequest(driver, result string) {
	labels := map[string]string{"driver": driver, "result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.namespaceBackupRequests, labels)
}

// RecordNamespaceRestoreRequest bumps the namespace-level restore counter for
// the given driver/result pair.
func (t *telemetry) RecordNamespaceRestoreRequest(driver, result string) {
	labels := map[string]string{"driver": driver, "result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.namespaceRestoreReqs, labels)
}

// RecordAuthzDenied counts authorization denials by reason.
func (t *telemetry) RecordAuthzDenied(reason string) { t.mu.Lock() defer t.mu.Unlock() incMetric(t.authzDenials, map[string]string{"reason": reason}) } // RecordInventoryFailure increments the inventory refresh failure counter. func (t *telemetry) RecordInventoryFailure() { t.mu.Lock() defer t.mu.Unlock() t.inventoryRefreshFailure++ } // RecordInventory updates PVC backup metrics from the latest inventory snapshot. func (t *telemetry) RecordInventory(inv api.InventoryResponse) { t.mu.Lock() defer t.mu.Unlock() t.pvcBackupAgeHours = map[string]metricSample{} t.pvcBackupHealth = map[string]metricSample{} t.pvcBackupHealthReason = map[string]metricSample{} t.pvcBackupLastSuccess = map[string]metricSample{} t.pvcBackupCount = map[string]metricSample{} t.pvcBackupCompletedCount = map[string]metricSample{} t.pvcBackupLastSizeBytes = map[string]metricSample{} t.pvcBackupTotalSizeBytes = map[string]metricSample{} for _, namespace := range inv.Namespaces { for _, pvc := range namespace.PVCs { labels := map[string]string{ "namespace": pvc.Namespace, "pvc": pvc.PVC, "volume": pvc.Volume, "driver": pvc.Driver, } setMetric(t.pvcBackupCount, labels, float64(pvc.BackupCount)) setMetric(t.pvcBackupCompletedCount, labels, float64(pvc.CompletedBackups)) setMetric(t.pvcBackupLastSizeBytes, labels, pvc.LastBackupSizeBytes) setMetric(t.pvcBackupTotalSizeBytes, labels, pvc.TotalBackupSizeBytes) reasonLabels := cloneLabels(labels) reason := strings.TrimSpace(pvc.HealthReason) if reason == "" { reason = "unknown" } reasonLabels["reason"] = reason setMetric(t.pvcBackupHealthReason, reasonLabels, 1) if pvc.Healthy { setMetric(t.pvcBackupHealth, labels, 1) } else { setMetric(t.pvcBackupHealth, labels, 0) } if pvc.LastBackupAt == "" { continue } setMetric(t.pvcBackupAgeHours, labels, pvc.LastBackupAgeHours) if ts, ok := parseBackupTime(pvc.LastBackupAt); ok { setMetric(t.pvcBackupLastSuccess, labels, float64(ts.Unix())) } } } t.inventoryRefreshTime = float64(time.Now().Unix()) } // 
RecordB2Usage updates the B2 telemetry gauges from the latest scan result. func (t *telemetry) RecordB2Usage(usage api.B2UsageResponse) { t.mu.Lock() defer t.mu.Unlock() t.b2BucketObjects = map[string]metricSample{} t.b2BucketBytes = map[string]metricSample{} t.b2BucketRecentObjects = map[string]metricSample{} t.b2BucketRecentBytes = map[string]metricSample{} t.b2BucketLastModified = map[string]metricSample{} t.b2ScanSuccess = 0 t.b2ScanDurationSeconds = float64(usage.ScanDurationMS) / 1000.0 t.b2AccountObjects = float64(usage.TotalObjects) t.b2AccountBytes = float64(usage.TotalBytes) t.b2AccountRecentObjects = float64(usage.RecentObjects24h) t.b2AccountRecentBytes = float64(usage.RecentBytes24h) if usage.Available { t.b2ScanSuccess = 1 } if usage.ScannedAt != "" { if ts, err := time.Parse(time.RFC3339, usage.ScannedAt); err == nil { t.b2ScanTimestamp = float64(ts.Unix()) } else { t.b2ScanTimestamp = float64(time.Now().Unix()) } } else { t.b2ScanTimestamp = float64(time.Now().Unix()) } for _, bucket := range usage.Buckets { labels := map[string]string{"bucket": bucket.Name} setMetric(t.b2BucketObjects, labels, float64(bucket.ObjectCount)) setMetric(t.b2BucketBytes, labels, float64(bucket.TotalBytes)) setMetric(t.b2BucketRecentObjects, labels, float64(bucket.RecentObjects24h)) setMetric(t.b2BucketRecentBytes, labels, float64(bucket.RecentBytes24h)) if bucket.LastModifiedAt == "" { continue } if ts, err := time.Parse(time.RFC3339, bucket.LastModifiedAt); err == nil { setMetric(t.b2BucketLastModified, labels, float64(ts.Unix())) } } } func (t *telemetry) render() string { t.mu.RLock() defer t.mu.RUnlock() var b strings.Builder writeMetricFamily(&b, "soteria_backup_requests_total", "counter", "Backup requests handled by Soteria.", metricValues(t.backupRequests)) writeMetricFamily(&b, "soteria_restore_requests_total", "counter", "Restore requests handled by Soteria.", metricValues(t.restoreRequests)) writeMetricFamily(&b, "soteria_policy_backups_total", "counter", 
"Policy scheduler backup execution outcomes.", metricValues(t.policyBackups)) writeMetricFamily(&b, "soteria_namespace_backup_requests_total", "counter", "Namespace-level backup request outcomes.", metricValues(t.namespaceBackupRequests)) writeMetricFamily(&b, "soteria_namespace_restore_requests_total", "counter", "Namespace-level restore request outcomes.", metricValues(t.namespaceRestoreReqs)) writeMetricFamily(&b, "soteria_authz_denials_total", "counter", "Authorization denials emitted by Soteria.", metricValues(t.authzDenials)) writeMetricFamily(&b, "soteria_inventory_refresh_failures_total", "counter", "Inventory refresh failures while computing PVC backup telemetry.", []metricSample{{value: t.inventoryRefreshFailure}}) writeMetricFamily(&b, "soteria_inventory_refresh_timestamp_seconds", "gauge", "Unix timestamp of the last successful inventory refresh.", []metricSample{{value: t.inventoryRefreshTime}}) writeMetricFamily(&b, "pvc_backup_age_hours", "gauge", "Age in hours of the latest successful PVC backup known to Soteria.", metricValues(t.pvcBackupAgeHours)) writeMetricFamily(&b, "pvc_backup_health", "gauge", "PVC backup health according to Soteria: 1=fresh backup within policy, 0=missing/stale/error.", metricValues(t.pvcBackupHealth)) writeMetricFamily(&b, "pvc_backup_health_reason", "gauge", "PVC backup health reason marker with reason label set to 1.", metricValues(t.pvcBackupHealthReason)) writeMetricFamily(&b, "pvc_backup_last_success_timestamp_seconds", "gauge", "Unix timestamp of the latest successful PVC backup known to Soteria.", metricValues(t.pvcBackupLastSuccess)) writeMetricFamily(&b, "pvc_backup_count", "gauge", "Count of backup records discovered for a PVC.", metricValues(t.pvcBackupCount)) writeMetricFamily(&b, "pvc_backup_completed_count", "gauge", "Count of completed backup records discovered for a PVC.", metricValues(t.pvcBackupCompletedCount)) writeMetricFamily(&b, "pvc_backup_last_size_bytes", "gauge", "Size in bytes of the latest 
completed backup for a PVC.", metricValues(t.pvcBackupLastSizeBytes)) writeMetricFamily(&b, "pvc_backup_total_size_bytes", "gauge", "Total bytes across discovered backup records for a PVC.", metricValues(t.pvcBackupTotalSizeBytes)) writeMetricFamily(&b, "soteria_b2_scan_success", "gauge", "Whether the latest B2 consumption scan succeeded (1) or failed (0).", []metricSample{{value: t.b2ScanSuccess}}) writeMetricFamily(&b, "soteria_b2_scan_timestamp_seconds", "gauge", "Unix timestamp of the latest B2 consumption scan attempt.", []metricSample{{value: t.b2ScanTimestamp}}) writeMetricFamily(&b, "soteria_b2_scan_duration_seconds", "gauge", "Duration in seconds of the latest B2 consumption scan attempt.", []metricSample{{value: t.b2ScanDurationSeconds}}) writeMetricFamily(&b, "soteria_b2_account_objects", "gauge", "Total object count discovered across scanned B2 buckets.", []metricSample{{value: t.b2AccountObjects}}) writeMetricFamily(&b, "soteria_b2_account_bytes", "gauge", "Total stored bytes discovered across scanned B2 buckets.", []metricSample{{value: t.b2AccountBytes}}) writeMetricFamily(&b, "soteria_b2_account_recent_objects_24h", "gauge", "Object count with LastModified in the last 24h across scanned B2 buckets.", []metricSample{{value: t.b2AccountRecentObjects}}) writeMetricFamily(&b, "soteria_b2_account_recent_bytes_24h", "gauge", "Bytes with LastModified in the last 24h across scanned B2 buckets.", []metricSample{{value: t.b2AccountRecentBytes}}) writeMetricFamily(&b, "soteria_b2_bucket_objects", "gauge", "Object count discovered per scanned B2 bucket.", metricValues(t.b2BucketObjects)) writeMetricFamily(&b, "soteria_b2_bucket_bytes", "gauge", "Stored bytes discovered per scanned B2 bucket.", metricValues(t.b2BucketBytes)) writeMetricFamily(&b, "soteria_b2_bucket_recent_objects_24h", "gauge", "Object count with LastModified in the last 24h per scanned B2 bucket.", metricValues(t.b2BucketRecentObjects)) writeMetricFamily(&b, 
"soteria_b2_bucket_recent_bytes_24h", "gauge", "Bytes with LastModified in the last 24h per scanned B2 bucket.", metricValues(t.b2BucketRecentBytes)) writeMetricFamily(&b, "soteria_b2_bucket_last_modified_timestamp_seconds", "gauge", "Unix timestamp of the most recent object observed in each scanned B2 bucket.", metricValues(t.b2BucketLastModified)) return b.String() } func metricValues(source map[string]metricSample) []metricSample { keys := make([]string, 0, len(source)) for key := range source { keys = append(keys, key) } sort.Strings(keys) values := make([]metricSample, 0, len(keys)) for _, key := range keys { values = append(values, source[key]) } return values } func writeMetricFamily(b *strings.Builder, name, metricType, help string, samples []metricSample) { b.WriteString("# HELP ") b.WriteString(name) b.WriteString(" ") b.WriteString(help) b.WriteString("\n") b.WriteString("# TYPE ") b.WriteString(name) b.WriteString(" ") b.WriteString(metricType) b.WriteString("\n") for _, sample := range samples { b.WriteString(name) b.WriteString(renderLabels(sample.labels)) b.WriteString(" ") b.WriteString(fmt.Sprintf("%g", sample.value)) b.WriteString("\n") } } func renderLabels(labels map[string]string) string { if len(labels) == 0 { return "" } keys := make([]string, 0, len(labels)) for key := range labels { keys = append(keys, key) } sort.Strings(keys) parts := make([]string, 0, len(keys)) for _, key := range keys { parts = append(parts, fmt.Sprintf("%s=%q", key, labels[key])) } return "{" + strings.Join(parts, ",") + "}" } func metricKey(labels map[string]string) string { return renderLabels(labels) } func incMetric(target map[string]metricSample, labels map[string]string) { key := metricKey(labels) sample, ok := target[key] if !ok { target[key] = metricSample{labels: cloneLabels(labels), value: 1} return } sample.value++ target[key] = sample } func setMetric(target map[string]metricSample, labels map[string]string, value float64) { key := metricKey(labels) 
target[key] = metricSample{labels: cloneLabels(labels), value: value} } func cloneLabels(labels map[string]string) map[string]string { out := make(map[string]string, len(labels)) for key, value := range labels { out[key] = value } return out }