330 lines
14 KiB
Go
330 lines
14 KiB
Go
package server
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"scm.bstein.dev/bstein/soteria/internal/api"
|
|
)
|
|
|
|
// metricSample is a single exposition sample: one numeric value plus the
// label set that identifies its series. An unlabelled sample leaves labels nil.
type metricSample struct {
	// labels holds the label name/value pairs rendered next to the metric name.
	labels map[string]string
	// value is the number emitted for this series.
	value float64
}
|
|
|
|
type telemetry struct {
|
|
mu sync.RWMutex
|
|
backupRequests map[string]metricSample
|
|
restoreRequests map[string]metricSample
|
|
policyBackups map[string]metricSample
|
|
namespaceBackupRequests map[string]metricSample
|
|
namespaceRestoreReqs map[string]metricSample
|
|
authzDenials map[string]metricSample
|
|
inventoryRefreshFailure float64
|
|
inventoryRefreshTime float64
|
|
pvcBackupAgeHours map[string]metricSample
|
|
pvcBackupHealth map[string]metricSample
|
|
pvcBackupHealthReason map[string]metricSample
|
|
pvcBackupLastSuccess map[string]metricSample
|
|
pvcBackupCount map[string]metricSample
|
|
pvcBackupCompletedCount map[string]metricSample
|
|
pvcBackupLastSizeBytes map[string]metricSample
|
|
pvcBackupTotalSizeBytes map[string]metricSample
|
|
b2BucketObjects map[string]metricSample
|
|
b2BucketBytes map[string]metricSample
|
|
b2BucketRecentObjects map[string]metricSample
|
|
b2BucketRecentBytes map[string]metricSample
|
|
b2BucketLastModified map[string]metricSample
|
|
b2ScanSuccess float64
|
|
b2ScanTimestamp float64
|
|
b2ScanDurationSeconds float64
|
|
b2AccountObjects float64
|
|
b2AccountBytes float64
|
|
b2AccountRecentObjects float64
|
|
b2AccountRecentBytes float64
|
|
}
|
|
|
|
func newTelemetry() *telemetry {
|
|
return &telemetry{
|
|
backupRequests: map[string]metricSample{},
|
|
restoreRequests: map[string]metricSample{},
|
|
policyBackups: map[string]metricSample{},
|
|
namespaceBackupRequests: map[string]metricSample{},
|
|
namespaceRestoreReqs: map[string]metricSample{},
|
|
authzDenials: map[string]metricSample{},
|
|
pvcBackupAgeHours: map[string]metricSample{},
|
|
pvcBackupHealth: map[string]metricSample{},
|
|
pvcBackupHealthReason: map[string]metricSample{},
|
|
pvcBackupLastSuccess: map[string]metricSample{},
|
|
pvcBackupCount: map[string]metricSample{},
|
|
pvcBackupCompletedCount: map[string]metricSample{},
|
|
pvcBackupLastSizeBytes: map[string]metricSample{},
|
|
pvcBackupTotalSizeBytes: map[string]metricSample{},
|
|
b2BucketObjects: map[string]metricSample{},
|
|
b2BucketBytes: map[string]metricSample{},
|
|
b2BucketRecentObjects: map[string]metricSample{},
|
|
b2BucketRecentBytes: map[string]metricSample{},
|
|
b2BucketLastModified: map[string]metricSample{},
|
|
}
|
|
}
|
|
|
|
func (t *telemetry) Handler() http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
|
_, _ = w.Write([]byte(t.render()))
|
|
})
|
|
}
|
|
|
|
func (t *telemetry) RecordBackupRequest(driver, result string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
incMetric(t.backupRequests, map[string]string{"driver": driver, "result": result})
|
|
}
|
|
|
|
func (t *telemetry) RecordRestoreRequest(driver, result string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
incMetric(t.restoreRequests, map[string]string{"driver": driver, "result": result})
|
|
}
|
|
|
|
func (t *telemetry) RecordPolicyBackup(result string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
incMetric(t.policyBackups, map[string]string{"result": result})
|
|
}
|
|
|
|
func (t *telemetry) RecordNamespaceBackupRequest(driver, result string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
incMetric(t.namespaceBackupRequests, map[string]string{"driver": driver, "result": result})
|
|
}
|
|
|
|
func (t *telemetry) RecordNamespaceRestoreRequest(driver, result string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
incMetric(t.namespaceRestoreReqs, map[string]string{"driver": driver, "result": result})
|
|
}
|
|
|
|
func (t *telemetry) RecordAuthzDenied(reason string) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
incMetric(t.authzDenials, map[string]string{"reason": reason})
|
|
}
|
|
|
|
func (t *telemetry) RecordInventoryFailure() {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.inventoryRefreshFailure++
|
|
}
|
|
|
|
func (t *telemetry) RecordInventory(inv api.InventoryResponse) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
t.pvcBackupAgeHours = map[string]metricSample{}
|
|
t.pvcBackupHealth = map[string]metricSample{}
|
|
t.pvcBackupHealthReason = map[string]metricSample{}
|
|
t.pvcBackupLastSuccess = map[string]metricSample{}
|
|
t.pvcBackupCount = map[string]metricSample{}
|
|
t.pvcBackupCompletedCount = map[string]metricSample{}
|
|
t.pvcBackupLastSizeBytes = map[string]metricSample{}
|
|
t.pvcBackupTotalSizeBytes = map[string]metricSample{}
|
|
|
|
for _, namespace := range inv.Namespaces {
|
|
for _, pvc := range namespace.PVCs {
|
|
labels := map[string]string{
|
|
"namespace": pvc.Namespace,
|
|
"pvc": pvc.PVC,
|
|
"volume": pvc.Volume,
|
|
"driver": pvc.Driver,
|
|
}
|
|
setMetric(t.pvcBackupCount, labels, float64(pvc.BackupCount))
|
|
setMetric(t.pvcBackupCompletedCount, labels, float64(pvc.CompletedBackups))
|
|
setMetric(t.pvcBackupLastSizeBytes, labels, pvc.LastBackupSizeBytes)
|
|
setMetric(t.pvcBackupTotalSizeBytes, labels, pvc.TotalBackupSizeBytes)
|
|
reasonLabels := cloneLabels(labels)
|
|
reason := strings.TrimSpace(pvc.HealthReason)
|
|
if reason == "" {
|
|
reason = "unknown"
|
|
}
|
|
reasonLabels["reason"] = reason
|
|
setMetric(t.pvcBackupHealthReason, reasonLabels, 1)
|
|
if pvc.Healthy {
|
|
setMetric(t.pvcBackupHealth, labels, 1)
|
|
} else {
|
|
setMetric(t.pvcBackupHealth, labels, 0)
|
|
}
|
|
if pvc.LastBackupAt == "" {
|
|
continue
|
|
}
|
|
setMetric(t.pvcBackupAgeHours, labels, pvc.LastBackupAgeHours)
|
|
if ts, ok := parseBackupTime(pvc.LastBackupAt); ok {
|
|
setMetric(t.pvcBackupLastSuccess, labels, float64(ts.Unix()))
|
|
}
|
|
}
|
|
}
|
|
|
|
t.inventoryRefreshTime = float64(time.Now().Unix())
|
|
}
|
|
|
|
func (t *telemetry) RecordB2Usage(usage api.B2UsageResponse) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
t.b2BucketObjects = map[string]metricSample{}
|
|
t.b2BucketBytes = map[string]metricSample{}
|
|
t.b2BucketRecentObjects = map[string]metricSample{}
|
|
t.b2BucketRecentBytes = map[string]metricSample{}
|
|
t.b2BucketLastModified = map[string]metricSample{}
|
|
t.b2ScanSuccess = 0
|
|
t.b2ScanDurationSeconds = float64(usage.ScanDurationMS) / 1000.0
|
|
t.b2AccountObjects = float64(usage.TotalObjects)
|
|
t.b2AccountBytes = float64(usage.TotalBytes)
|
|
t.b2AccountRecentObjects = float64(usage.RecentObjects24h)
|
|
t.b2AccountRecentBytes = float64(usage.RecentBytes24h)
|
|
|
|
if usage.Available {
|
|
t.b2ScanSuccess = 1
|
|
}
|
|
if usage.ScannedAt != "" {
|
|
if ts, err := time.Parse(time.RFC3339, usage.ScannedAt); err == nil {
|
|
t.b2ScanTimestamp = float64(ts.Unix())
|
|
} else {
|
|
t.b2ScanTimestamp = float64(time.Now().Unix())
|
|
}
|
|
} else {
|
|
t.b2ScanTimestamp = float64(time.Now().Unix())
|
|
}
|
|
|
|
for _, bucket := range usage.Buckets {
|
|
labels := map[string]string{"bucket": bucket.Name}
|
|
setMetric(t.b2BucketObjects, labels, float64(bucket.ObjectCount))
|
|
setMetric(t.b2BucketBytes, labels, float64(bucket.TotalBytes))
|
|
setMetric(t.b2BucketRecentObjects, labels, float64(bucket.RecentObjects24h))
|
|
setMetric(t.b2BucketRecentBytes, labels, float64(bucket.RecentBytes24h))
|
|
if bucket.LastModifiedAt == "" {
|
|
continue
|
|
}
|
|
if ts, err := time.Parse(time.RFC3339, bucket.LastModifiedAt); err == nil {
|
|
setMetric(t.b2BucketLastModified, labels, float64(ts.Unix()))
|
|
}
|
|
}
|
|
}
|
|
|
|
func (t *telemetry) render() string {
|
|
t.mu.RLock()
|
|
defer t.mu.RUnlock()
|
|
|
|
var b strings.Builder
|
|
writeMetricFamily(&b, "soteria_backup_requests_total", "counter", "Backup requests handled by Soteria.", metricValues(t.backupRequests))
|
|
writeMetricFamily(&b, "soteria_restore_requests_total", "counter", "Restore requests handled by Soteria.", metricValues(t.restoreRequests))
|
|
writeMetricFamily(&b, "soteria_policy_backups_total", "counter", "Policy scheduler backup execution outcomes.", metricValues(t.policyBackups))
|
|
writeMetricFamily(&b, "soteria_namespace_backup_requests_total", "counter", "Namespace-level backup request outcomes.", metricValues(t.namespaceBackupRequests))
|
|
writeMetricFamily(&b, "soteria_namespace_restore_requests_total", "counter", "Namespace-level restore request outcomes.", metricValues(t.namespaceRestoreReqs))
|
|
writeMetricFamily(&b, "soteria_authz_denials_total", "counter", "Authorization denials emitted by Soteria.", metricValues(t.authzDenials))
|
|
writeMetricFamily(&b, "soteria_inventory_refresh_failures_total", "counter", "Inventory refresh failures while computing PVC backup telemetry.", []metricSample{{value: t.inventoryRefreshFailure}})
|
|
writeMetricFamily(&b, "soteria_inventory_refresh_timestamp_seconds", "gauge", "Unix timestamp of the last successful inventory refresh.", []metricSample{{value: t.inventoryRefreshTime}})
|
|
writeMetricFamily(&b, "pvc_backup_age_hours", "gauge", "Age in hours of the latest successful PVC backup known to Soteria.", metricValues(t.pvcBackupAgeHours))
|
|
writeMetricFamily(&b, "pvc_backup_health", "gauge", "PVC backup health according to Soteria: 1=fresh backup within policy, 0=missing/stale/error.", metricValues(t.pvcBackupHealth))
|
|
writeMetricFamily(&b, "pvc_backup_health_reason", "gauge", "PVC backup health reason marker with reason label set to 1.", metricValues(t.pvcBackupHealthReason))
|
|
writeMetricFamily(&b, "pvc_backup_last_success_timestamp_seconds", "gauge", "Unix timestamp of the latest successful PVC backup known to Soteria.", metricValues(t.pvcBackupLastSuccess))
|
|
writeMetricFamily(&b, "pvc_backup_count", "gauge", "Count of backup records discovered for a PVC.", metricValues(t.pvcBackupCount))
|
|
writeMetricFamily(&b, "pvc_backup_completed_count", "gauge", "Count of completed backup records discovered for a PVC.", metricValues(t.pvcBackupCompletedCount))
|
|
writeMetricFamily(&b, "pvc_backup_last_size_bytes", "gauge", "Size in bytes of the latest completed backup for a PVC.", metricValues(t.pvcBackupLastSizeBytes))
|
|
writeMetricFamily(&b, "pvc_backup_total_size_bytes", "gauge", "Total bytes across discovered backup records for a PVC.", metricValues(t.pvcBackupTotalSizeBytes))
|
|
writeMetricFamily(&b, "soteria_b2_scan_success", "gauge", "Whether the latest B2 consumption scan succeeded (1) or failed (0).", []metricSample{{value: t.b2ScanSuccess}})
|
|
writeMetricFamily(&b, "soteria_b2_scan_timestamp_seconds", "gauge", "Unix timestamp of the latest B2 consumption scan attempt.", []metricSample{{value: t.b2ScanTimestamp}})
|
|
writeMetricFamily(&b, "soteria_b2_scan_duration_seconds", "gauge", "Duration in seconds of the latest B2 consumption scan attempt.", []metricSample{{value: t.b2ScanDurationSeconds}})
|
|
writeMetricFamily(&b, "soteria_b2_account_objects", "gauge", "Total object count discovered across scanned B2 buckets.", []metricSample{{value: t.b2AccountObjects}})
|
|
writeMetricFamily(&b, "soteria_b2_account_bytes", "gauge", "Total stored bytes discovered across scanned B2 buckets.", []metricSample{{value: t.b2AccountBytes}})
|
|
writeMetricFamily(&b, "soteria_b2_account_recent_objects_24h", "gauge", "Object count with LastModified in the last 24h across scanned B2 buckets.", []metricSample{{value: t.b2AccountRecentObjects}})
|
|
writeMetricFamily(&b, "soteria_b2_account_recent_bytes_24h", "gauge", "Bytes with LastModified in the last 24h across scanned B2 buckets.", []metricSample{{value: t.b2AccountRecentBytes}})
|
|
writeMetricFamily(&b, "soteria_b2_bucket_objects", "gauge", "Object count discovered per scanned B2 bucket.", metricValues(t.b2BucketObjects))
|
|
writeMetricFamily(&b, "soteria_b2_bucket_bytes", "gauge", "Stored bytes discovered per scanned B2 bucket.", metricValues(t.b2BucketBytes))
|
|
writeMetricFamily(&b, "soteria_b2_bucket_recent_objects_24h", "gauge", "Object count with LastModified in the last 24h per scanned B2 bucket.", metricValues(t.b2BucketRecentObjects))
|
|
writeMetricFamily(&b, "soteria_b2_bucket_recent_bytes_24h", "gauge", "Bytes with LastModified in the last 24h per scanned B2 bucket.", metricValues(t.b2BucketRecentBytes))
|
|
writeMetricFamily(&b, "soteria_b2_bucket_last_modified_timestamp_seconds", "gauge", "Unix timestamp of the most recent object observed in each scanned B2 bucket.", metricValues(t.b2BucketLastModified))
|
|
return b.String()
|
|
}
|
|
|
|
func metricValues(source map[string]metricSample) []metricSample {
|
|
keys := make([]string, 0, len(source))
|
|
for key := range source {
|
|
keys = append(keys, key)
|
|
}
|
|
sort.Strings(keys)
|
|
values := make([]metricSample, 0, len(keys))
|
|
for _, key := range keys {
|
|
values = append(values, source[key])
|
|
}
|
|
return values
|
|
}
|
|
|
|
func writeMetricFamily(b *strings.Builder, name, metricType, help string, samples []metricSample) {
|
|
b.WriteString("# HELP ")
|
|
b.WriteString(name)
|
|
b.WriteString(" ")
|
|
b.WriteString(help)
|
|
b.WriteString("\n")
|
|
b.WriteString("# TYPE ")
|
|
b.WriteString(name)
|
|
b.WriteString(" ")
|
|
b.WriteString(metricType)
|
|
b.WriteString("\n")
|
|
for _, sample := range samples {
|
|
b.WriteString(name)
|
|
b.WriteString(renderLabels(sample.labels))
|
|
b.WriteString(" ")
|
|
b.WriteString(fmt.Sprintf("%g", sample.value))
|
|
b.WriteString("\n")
|
|
}
|
|
}
|
|
|
|
// promLabelValueEscaper applies the only three escapes the Prometheus text
// exposition format defines for label values: backslash, double quote and
// line feed. A strings.Replacer is safe for concurrent use.
var promLabelValueEscaper = strings.NewReplacer(
	`\`, `\\`,
	`"`, `\"`,
	"\n", `\n`,
)

// renderLabels formats labels as a Prometheus label block, e.g.
// {driver="x",result="ok"}, with label names in ascending order so the output
// is deterministic. An empty or nil map renders as the empty string.
//
// Values are escaped per the text exposition format rather than with Go's %q:
// %q emits Go-only escapes such as \t or \x00 for tabs and control characters,
// which a Prometheus parser rejects as invalid escape sequences. The escaping
// is still one-to-one, so the result remains usable as a unique map key.
func renderLabels(labels map[string]string) string {
	if len(labels) == 0 {
		return ""
	}

	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	var out strings.Builder
	out.WriteByte('{')
	for i, name := range names {
		if i > 0 {
			out.WriteByte(',')
		}
		out.WriteString(name)
		out.WriteString(`="`)
		out.WriteString(promLabelValueEscaper.Replace(labels[name]))
		out.WriteByte('"')
	}
	out.WriteByte('}')
	return out.String()
}
|
|
|
|
func metricKey(labels map[string]string) string {
|
|
return renderLabels(labels)
|
|
}
|
|
|
|
func incMetric(target map[string]metricSample, labels map[string]string) {
|
|
key := metricKey(labels)
|
|
sample, ok := target[key]
|
|
if !ok {
|
|
target[key] = metricSample{labels: cloneLabels(labels), value: 1}
|
|
return
|
|
}
|
|
sample.value++
|
|
target[key] = sample
|
|
}
|
|
|
|
func setMetric(target map[string]metricSample, labels map[string]string, value float64) {
|
|
key := metricKey(labels)
|
|
target[key] = metricSample{labels: cloneLabels(labels), value: value}
|
|
}
|
|
|
|
// cloneLabels returns a new map holding the same entries as labels. The
// result is never nil, even when labels is nil or empty.
func cloneLabels(labels map[string]string) map[string]string {
	dup := make(map[string]string, len(labels))
	for name := range labels {
		dup[name] = labels[name]
	}
	return dup
}
|