// Listing header: soteria/internal/server/metrics.go (330 lines, 14 KiB, Go).

package server
import (
"fmt"
"net/http"
"sort"
"strings"
"sync"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
)
// metricSample is one exposition sample: a label set together with the
// numeric value reported for it.
type metricSample struct {
	// labels holds the label names and values rendered after the metric
	// name. A nil or empty map renders the sample without a label block.
	labels map[string]string
	// value is the number written at the end of the sample line.
	value float64
}
// telemetry is the in-memory store behind the metrics endpoint. Map fields
// hold labelled samples keyed by their rendered label set; float64 fields hold
// single unlabelled samples. Every method in this file that touches these
// fields does so while holding mu.
type telemetry struct {
	// mu guards all fields below. The Record* methods take the write lock
	// and render takes the read lock.
	mu sync.RWMutex

	// Counters incremented one event at a time by the Record*Request,
	// RecordPolicyBackup and RecordAuthzDenied methods.
	backupRequests          map[string]metricSample
	restoreRequests         map[string]metricSample
	policyBackups           map[string]metricSample
	namespaceBackupRequests map[string]metricSample
	namespaceRestoreReqs    map[string]metricSample
	authzDenials            map[string]metricSample

	// inventoryRefreshFailure counts RecordInventoryFailure calls;
	// inventoryRefreshTime is the Unix time of the latest RecordInventory call.
	inventoryRefreshFailure float64
	inventoryRefreshTime    float64

	// Per-PVC gauges, replaced wholesale on every RecordInventory call.
	pvcBackupAgeHours       map[string]metricSample
	pvcBackupHealth         map[string]metricSample
	pvcBackupHealthReason   map[string]metricSample
	pvcBackupLastSuccess    map[string]metricSample
	pvcBackupCount          map[string]metricSample
	pvcBackupCompletedCount map[string]metricSample
	pvcBackupLastSizeBytes  map[string]metricSample
	pvcBackupTotalSizeBytes map[string]metricSample

	// Per-bucket B2 gauges, replaced wholesale on every RecordB2Usage call.
	b2BucketObjects       map[string]metricSample
	b2BucketBytes         map[string]metricSample
	b2BucketRecentObjects map[string]metricSample
	b2BucketRecentBytes   map[string]metricSample
	b2BucketLastModified  map[string]metricSample

	// Outcome, Unix time and duration (seconds) of the latest B2 scan, as
	// reported to RecordB2Usage.
	b2ScanSuccess         float64
	b2ScanTimestamp       float64
	b2ScanDurationSeconds float64

	// Account-wide B2 totals overwritten by RecordB2Usage.
	b2AccountObjects       float64
	b2AccountBytes         float64
	b2AccountRecentObjects float64
	b2AccountRecentBytes   float64
}
// newTelemetry returns a telemetry store whose labelled-sample maps are all
// allocated and empty, so the Record* methods can write into them right away.
// The unlabelled float64 samples start at zero.
func newTelemetry() *telemetry {
	t := &telemetry{}
	for _, series := range []*map[string]metricSample{
		&t.backupRequests,
		&t.restoreRequests,
		&t.policyBackups,
		&t.namespaceBackupRequests,
		&t.namespaceRestoreReqs,
		&t.authzDenials,
		&t.pvcBackupAgeHours,
		&t.pvcBackupHealth,
		&t.pvcBackupHealthReason,
		&t.pvcBackupLastSuccess,
		&t.pvcBackupCount,
		&t.pvcBackupCompletedCount,
		&t.pvcBackupLastSizeBytes,
		&t.pvcBackupTotalSizeBytes,
		&t.b2BucketObjects,
		&t.b2BucketBytes,
		&t.b2BucketRecentObjects,
		&t.b2BucketRecentBytes,
		&t.b2BucketLastModified,
	} {
		*series = map[string]metricSample{}
	}
	return t
}
// Handler returns an http.Handler that answers every request with the current
// rendered telemetry as plain text, advertising text format version 0.0.4 in
// the Content-Type header.
func (t *telemetry) Handler() http.Handler {
	serve := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
		payload := []byte(t.render())
		// The write error is deliberately discarded: the handler has no
		// other channel through which to report it.
		_, _ = w.Write(payload)
	}
	return http.HandlerFunc(serve)
}
// RecordBackupRequest adds one to the backup request counter for the given
// driver and result label pair (exported as soteria_backup_requests_total).
func (t *telemetry) RecordBackupRequest(driver, result string) {
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.backupRequests, labels)
}
// RecordRestoreRequest adds one to the restore request counter for the given
// driver and result label pair (exported as soteria_restore_requests_total).
func (t *telemetry) RecordRestoreRequest(driver, result string) {
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.restoreRequests, labels)
}
// RecordPolicyBackup adds one to the policy backup counter for the given
// result label (exported as soteria_policy_backups_total).
func (t *telemetry) RecordPolicyBackup(result string) {
	labels := map[string]string{"result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.policyBackups, labels)
}
// RecordNamespaceBackupRequest adds one to the namespace-level backup request
// counter for the given driver and result label pair (exported as
// soteria_namespace_backup_requests_total).
func (t *telemetry) RecordNamespaceBackupRequest(driver, result string) {
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.namespaceBackupRequests, labels)
}
// RecordNamespaceRestoreRequest adds one to the namespace-level restore
// request counter for the given driver and result label pair (exported as
// soteria_namespace_restore_requests_total).
func (t *telemetry) RecordNamespaceRestoreRequest(driver, result string) {
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.namespaceRestoreReqs, labels)
}
// RecordAuthzDenied adds one to the authorization denial counter for the given
// reason label (exported as soteria_authz_denials_total).
func (t *telemetry) RecordAuthzDenied(reason string) {
	labels := map[string]string{"reason": reason}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.authzDenials, labels)
}
// RecordInventoryFailure adds one to the inventory refresh failure count,
// which render exports as soteria_inventory_refresh_failures_total.
func (t *telemetry) RecordInventoryFailure() {
t.mu.Lock()
defer t.mu.Unlock()
t.inventoryRefreshFailure++
}
// RecordInventory replaces every per-PVC backup gauge with values taken from
// inv and sets the inventory refresh timestamp to the current Unix time.
//
// All per-PVC maps are rebuilt from empty, so a PVC that is absent from inv is
// no longer exported. Each PVC is labelled with namespace, pvc, volume and
// driver; the health-reason gauge carries an extra reason label, which falls
// back to "unknown" when the reported reason is blank after trimming. The age
// and last-success gauges are written only for PVCs whose LastBackupAt is
// non-empty, and last-success is further skipped when parseBackupTime reports
// that it could not parse that value.
func (t *telemetry) RecordInventory(inv api.InventoryResponse) {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Start from empty maps so series from an earlier inventory disappear.
	t.pvcBackupAgeHours = make(map[string]metricSample)
	t.pvcBackupHealth = make(map[string]metricSample)
	t.pvcBackupHealthReason = make(map[string]metricSample)
	t.pvcBackupLastSuccess = make(map[string]metricSample)
	t.pvcBackupCount = make(map[string]metricSample)
	t.pvcBackupCompletedCount = make(map[string]metricSample)
	t.pvcBackupLastSizeBytes = make(map[string]metricSample)
	t.pvcBackupTotalSizeBytes = make(map[string]metricSample)

	for _, ns := range inv.Namespaces {
		for _, entry := range ns.PVCs {
			base := map[string]string{
				"namespace": entry.Namespace,
				"pvc":       entry.PVC,
				"volume":    entry.Volume,
				"driver":    entry.Driver,
			}

			// Backup counts and sizes are always reported.
			setMetric(t.pvcBackupCount, base, float64(entry.BackupCount))
			setMetric(t.pvcBackupCompletedCount, base, float64(entry.CompletedBackups))
			setMetric(t.pvcBackupLastSizeBytes, base, entry.LastBackupSizeBytes)
			setMetric(t.pvcBackupTotalSizeBytes, base, entry.TotalBackupSizeBytes)

			// The reason is exported as a label on a constant-1 marker gauge.
			why := strings.TrimSpace(entry.HealthReason)
			if why == "" {
				why = "unknown"
			}
			withReason := cloneLabels(base)
			withReason["reason"] = why
			setMetric(t.pvcBackupHealthReason, withReason, 1)

			healthy := 0.0
			if entry.Healthy {
				healthy = 1
			}
			setMetric(t.pvcBackupHealth, base, healthy)

			// Age and last-success only make sense once a backup exists.
			if entry.LastBackupAt != "" {
				setMetric(t.pvcBackupAgeHours, base, entry.LastBackupAgeHours)
				if when, ok := parseBackupTime(entry.LastBackupAt); ok {
					setMetric(t.pvcBackupLastSuccess, base, float64(when.Unix()))
				}
			}
		}
	}

	t.inventoryRefreshTime = float64(time.Now().Unix())
}
// RecordB2Usage replaces the B2 gauges with the values carried by usage.
//
// The account-wide totals, scan duration (converted from milliseconds to
// seconds) and scan success flag (1 when usage.Available, otherwise 0) are
// overwritten. The scan timestamp is usage.ScannedAt parsed as RFC 3339, or
// the current time when that field is empty or does not parse. Per-bucket maps
// are rebuilt from empty and labelled by bucket name; a bucket's last-modified
// gauge is written only when its LastModifiedAt parses as RFC 3339.
func (t *telemetry) RecordB2Usage(usage api.B2UsageResponse) {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Start from empty maps so buckets missing from this scan disappear.
	t.b2BucketObjects = make(map[string]metricSample)
	t.b2BucketBytes = make(map[string]metricSample)
	t.b2BucketRecentObjects = make(map[string]metricSample)
	t.b2BucketRecentBytes = make(map[string]metricSample)
	t.b2BucketLastModified = make(map[string]metricSample)

	t.b2ScanSuccess = 0
	if usage.Available {
		t.b2ScanSuccess = 1
	}
	t.b2ScanDurationSeconds = float64(usage.ScanDurationMS) / 1000.0
	t.b2AccountObjects = float64(usage.TotalObjects)
	t.b2AccountBytes = float64(usage.TotalBytes)
	t.b2AccountRecentObjects = float64(usage.RecentObjects24h)
	t.b2AccountRecentBytes = float64(usage.RecentBytes24h)

	// time.Parse rejects the empty string, so a missing ScannedAt takes the
	// same fallback as a malformed one.
	scannedAt, err := time.Parse(time.RFC3339, usage.ScannedAt)
	if err != nil {
		scannedAt = time.Now()
	}
	t.b2ScanTimestamp = float64(scannedAt.Unix())

	for _, bkt := range usage.Buckets {
		tag := map[string]string{"bucket": bkt.Name}
		setMetric(t.b2BucketObjects, tag, float64(bkt.ObjectCount))
		setMetric(t.b2BucketBytes, tag, float64(bkt.TotalBytes))
		setMetric(t.b2BucketRecentObjects, tag, float64(bkt.RecentObjects24h))
		setMetric(t.b2BucketRecentBytes, tag, float64(bkt.RecentBytes24h))

		// An empty or malformed LastModifiedAt fails to parse and is skipped.
		if modified, err := time.Parse(time.RFC3339, bkt.LastModifiedAt); err == nil {
			setMetric(t.b2BucketLastModified, tag, float64(modified.Unix()))
		}
	}
}
// render serialises every metric family held by t into one text document.
// Families are written in a fixed order, each with its HELP and TYPE header;
// labelled families list their samples in sorted key order via metricValues,
// and unlabelled families emit exactly one sample. The read lock is held for
// the whole call.
func (t *telemetry) render() string {
	t.mu.RLock()
	defer t.mu.RUnlock()

	var out strings.Builder

	// labelled writes a family whose samples come from a label-keyed map.
	labelled := func(name, kind, help string, series map[string]metricSample) {
		writeMetricFamily(&out, name, kind, help, metricValues(series))
	}
	// scalar writes a family consisting of a single unlabelled sample.
	scalar := func(name, kind, help string, v float64) {
		writeMetricFamily(&out, name, kind, help, []metricSample{{value: v}})
	}

	// Request, policy and authorization counters.
	labelled("soteria_backup_requests_total", "counter", "Backup requests handled by Soteria.", t.backupRequests)
	labelled("soteria_restore_requests_total", "counter", "Restore requests handled by Soteria.", t.restoreRequests)
	labelled("soteria_policy_backups_total", "counter", "Policy scheduler backup execution outcomes.", t.policyBackups)
	labelled("soteria_namespace_backup_requests_total", "counter", "Namespace-level backup request outcomes.", t.namespaceBackupRequests)
	labelled("soteria_namespace_restore_requests_total", "counter", "Namespace-level restore request outcomes.", t.namespaceRestoreReqs)
	labelled("soteria_authz_denials_total", "counter", "Authorization denials emitted by Soteria.", t.authzDenials)

	// Inventory refresh bookkeeping.
	scalar("soteria_inventory_refresh_failures_total", "counter", "Inventory refresh failures while computing PVC backup telemetry.", t.inventoryRefreshFailure)
	scalar("soteria_inventory_refresh_timestamp_seconds", "gauge", "Unix timestamp of the last successful inventory refresh.", t.inventoryRefreshTime)

	// Per-PVC backup gauges.
	labelled("pvc_backup_age_hours", "gauge", "Age in hours of the latest successful PVC backup known to Soteria.", t.pvcBackupAgeHours)
	labelled("pvc_backup_health", "gauge", "PVC backup health according to Soteria: 1=fresh backup within policy, 0=missing/stale/error.", t.pvcBackupHealth)
	labelled("pvc_backup_health_reason", "gauge", "PVC backup health reason marker with reason label set to 1.", t.pvcBackupHealthReason)
	labelled("pvc_backup_last_success_timestamp_seconds", "gauge", "Unix timestamp of the latest successful PVC backup known to Soteria.", t.pvcBackupLastSuccess)
	labelled("pvc_backup_count", "gauge", "Count of backup records discovered for a PVC.", t.pvcBackupCount)
	labelled("pvc_backup_completed_count", "gauge", "Count of completed backup records discovered for a PVC.", t.pvcBackupCompletedCount)
	labelled("pvc_backup_last_size_bytes", "gauge", "Size in bytes of the latest completed backup for a PVC.", t.pvcBackupLastSizeBytes)
	labelled("pvc_backup_total_size_bytes", "gauge", "Total bytes across discovered backup records for a PVC.", t.pvcBackupTotalSizeBytes)

	// B2 scan status and account-wide totals.
	scalar("soteria_b2_scan_success", "gauge", "Whether the latest B2 consumption scan succeeded (1) or failed (0).", t.b2ScanSuccess)
	scalar("soteria_b2_scan_timestamp_seconds", "gauge", "Unix timestamp of the latest B2 consumption scan attempt.", t.b2ScanTimestamp)
	scalar("soteria_b2_scan_duration_seconds", "gauge", "Duration in seconds of the latest B2 consumption scan attempt.", t.b2ScanDurationSeconds)
	scalar("soteria_b2_account_objects", "gauge", "Total object count discovered across scanned B2 buckets.", t.b2AccountObjects)
	scalar("soteria_b2_account_bytes", "gauge", "Total stored bytes discovered across scanned B2 buckets.", t.b2AccountBytes)
	scalar("soteria_b2_account_recent_objects_24h", "gauge", "Object count with LastModified in the last 24h across scanned B2 buckets.", t.b2AccountRecentObjects)
	scalar("soteria_b2_account_recent_bytes_24h", "gauge", "Bytes with LastModified in the last 24h across scanned B2 buckets.", t.b2AccountRecentBytes)

	// Per-bucket B2 gauges.
	labelled("soteria_b2_bucket_objects", "gauge", "Object count discovered per scanned B2 bucket.", t.b2BucketObjects)
	labelled("soteria_b2_bucket_bytes", "gauge", "Stored bytes discovered per scanned B2 bucket.", t.b2BucketBytes)
	labelled("soteria_b2_bucket_recent_objects_24h", "gauge", "Object count with LastModified in the last 24h per scanned B2 bucket.", t.b2BucketRecentObjects)
	labelled("soteria_b2_bucket_recent_bytes_24h", "gauge", "Bytes with LastModified in the last 24h per scanned B2 bucket.", t.b2BucketRecentBytes)
	labelled("soteria_b2_bucket_last_modified_timestamp_seconds", "gauge", "Unix timestamp of the most recent object observed in each scanned B2 bucket.", t.b2BucketLastModified)

	return out.String()
}
// metricValues returns the samples stored in source ordered by their map key,
// so repeated renders list a family's samples in the same order. The result is
// always a non-nil slice, empty when source has no entries.
func metricValues(source map[string]metricSample) []metricSample {
	ordered := make([]string, 0, len(source))
	for k := range source {
		ordered = append(ordered, k)
	}
	sort.Strings(ordered)

	samples := make([]metricSample, len(ordered))
	for i, k := range ordered {
		samples[i] = source[k]
	}
	return samples
}
// writeMetricFamily appends one metric family to b: a "# HELP" line, a
// "# TYPE" line, then one line per sample made of the metric name, the
// rendered label block, a space and the value formatted with %g. The header
// lines are written even when samples is empty.
func writeMetricFamily(b *strings.Builder, name, metricType, help string, samples []metricSample) {
	fmt.Fprintf(b, "# HELP %s %s\n# TYPE %s %s\n", name, help, name, metricType)
	for i := range samples {
		fmt.Fprintf(b, "%s%s %g\n", name, renderLabels(samples[i].labels), samples[i].value)
	}
}
// renderLabels formats labels as a text-exposition label block such as
// {driver="longhorn",result="ok"}, with label names in sorted order so the
// output is deterministic. A nil or empty map yields the empty string.
//
// Label values are escaped the way the Prometheus text format prescribes:
// only backslash, double quote and line feed are rewritten (as \\, \" and \n)
// and every other byte is emitted unchanged. Go's %q verb is not a substitute
// because it also produces escapes such as \t, \r, \x.. and \u...., which
// text-format parsers reject as invalid escape sequences.
func renderLabels(labels map[string]string) string {
	if len(labels) == 0 {
		return ""
	}

	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	var b strings.Builder
	b.WriteByte('{')
	for i, name := range names {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(name)
		b.WriteString(`="`)
		value := labels[name]
		// Byte-wise is safe here: the three escaped characters are ASCII
		// and never occur inside a multi-byte UTF-8 sequence.
		for j := 0; j < len(value); j++ {
			switch c := value[j]; c {
			case '\\':
				b.WriteString(`\\`)
			case '"':
				b.WriteString(`\"`)
			case '\n':
				b.WriteString(`\n`)
			default:
				b.WriteByte(c)
			}
		}
		b.WriteByte('"')
	}
	b.WriteByte('}')
	return b.String()
}
// metricKey returns the map key under which a sample with the given label set
// is stored. The key is the label block produced by renderLabels, so label
// maps with the same names and values share a key regardless of map iteration
// order, and a nil or empty label map yields the empty string.
func metricKey(labels map[string]string) string {
return renderLabels(labels)
}
// incMetric adds one to the sample in target identified by labels. When no
// such sample exists yet it is created with value 1 and a private copy of
// labels, so later changes to the caller's map do not affect the stored
// sample.
func incMetric(target map[string]metricSample, labels map[string]string) {
	key := metricKey(labels)
	if existing, found := target[key]; found {
		existing.value++
		target[key] = existing
		return
	}
	target[key] = metricSample{labels: cloneLabels(labels), value: 1}
}
// setMetric stores value as the sample in target identified by labels,
// replacing any sample already stored under the same key. The stored sample
// keeps a private copy of labels.
func setMetric(target map[string]metricSample, labels map[string]string, value float64) {
	sample := metricSample{
		labels: cloneLabels(labels),
		value:  value,
	}
	target[metricKey(labels)] = sample
}
// cloneLabels returns a new map holding the same entries as labels. The result
// is never nil, even for a nil input, and shares no storage with the argument.
func cloneLabels(labels map[string]string) map[string]string {
	dup := make(map[string]string, len(labels))
	for name := range labels {
		dup[name] = labels[name]
	}
	return dup
}