2026-04-12 11:09:49 -03:00
package server
import (
"fmt"
"net/http"
"sort"
"strings"
"sync"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
)
// metricSample is one exported time series: a label set together with the
// numeric value currently recorded for it.
type metricSample struct {
	// labels maps label name to label value; it is nil for unlabelled series.
	labels map[string]string
	// value is the counter or gauge reading written to the exposition output.
	value float64
}
type telemetry struct {
mu sync . RWMutex
backupRequests map [ string ] metricSample
restoreRequests map [ string ] metricSample
2026-04-12 14:32:39 -03:00
policyBackups map [ string ] metricSample
namespaceBackupRequests map [ string ] metricSample
namespaceRestoreReqs map [ string ] metricSample
2026-04-12 11:09:49 -03:00
authzDenials map [ string ] metricSample
inventoryRefreshFailure float64
inventoryRefreshTime float64
pvcBackupAgeHours map [ string ] metricSample
pvcBackupHealth map [ string ] metricSample
2026-04-12 19:45:23 -03:00
pvcBackupHealthReason map [ string ] metricSample
2026-04-12 11:09:49 -03:00
pvcBackupLastSuccess map [ string ] metricSample
pvcBackupCount map [ string ] metricSample
2026-04-12 19:45:23 -03:00
pvcBackupCompletedCount map [ string ] metricSample
pvcBackupLastSizeBytes map [ string ] metricSample
pvcBackupTotalSizeBytes map [ string ] metricSample
b2BucketObjects map [ string ] metricSample
b2BucketBytes map [ string ] metricSample
b2BucketRecentObjects map [ string ] metricSample
b2BucketRecentBytes map [ string ] metricSample
b2BucketLastModified map [ string ] metricSample
b2ScanSuccess float64
b2ScanTimestamp float64
b2ScanDurationSeconds float64
b2AccountObjects float64
b2AccountBytes float64
b2AccountRecentObjects float64
b2AccountRecentBytes float64
2026-04-12 11:09:49 -03:00
}
func newTelemetry ( ) * telemetry {
return & telemetry {
2026-04-12 14:32:39 -03:00
backupRequests : map [ string ] metricSample { } ,
restoreRequests : map [ string ] metricSample { } ,
policyBackups : map [ string ] metricSample { } ,
namespaceBackupRequests : map [ string ] metricSample { } ,
namespaceRestoreReqs : map [ string ] metricSample { } ,
authzDenials : map [ string ] metricSample { } ,
pvcBackupAgeHours : map [ string ] metricSample { } ,
pvcBackupHealth : map [ string ] metricSample { } ,
2026-04-12 19:45:23 -03:00
pvcBackupHealthReason : map [ string ] metricSample { } ,
2026-04-12 14:32:39 -03:00
pvcBackupLastSuccess : map [ string ] metricSample { } ,
pvcBackupCount : map [ string ] metricSample { } ,
2026-04-12 19:45:23 -03:00
pvcBackupCompletedCount : map [ string ] metricSample { } ,
pvcBackupLastSizeBytes : map [ string ] metricSample { } ,
pvcBackupTotalSizeBytes : map [ string ] metricSample { } ,
b2BucketObjects : map [ string ] metricSample { } ,
b2BucketBytes : map [ string ] metricSample { } ,
b2BucketRecentObjects : map [ string ] metricSample { } ,
b2BucketRecentBytes : map [ string ] metricSample { } ,
b2BucketLastModified : map [ string ] metricSample { } ,
2026-04-12 11:09:49 -03:00
}
}
2026-04-21 06:45:04 -03:00
// Handler exposes the telemetry renderer as an HTTP endpoint.
2026-04-12 11:09:49 -03:00
func (t *telemetry) Handler() http.Handler {
	serve := func(w http.ResponseWriter, _ *http.Request) {
		// The request itself is ignored; every call returns the full snapshot.
		w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
		payload := []byte(t.render())
		// The write result is deliberately discarded.
		_, _ = w.Write(payload)
	}
	return http.HandlerFunc(serve)
}
2026-04-21 06:45:04 -03:00
// RecordBackupRequest counts a backup request outcome by driver.
2026-04-12 11:09:49 -03:00
func (t *telemetry) RecordBackupRequest(driver, result string) {
	// One counter series per (driver, result) pair.
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.backupRequests, labels)
}
2026-04-21 06:45:04 -03:00
// RecordRestoreRequest counts a restore request outcome by driver.
2026-04-12 11:09:49 -03:00
func (t *telemetry) RecordRestoreRequest(driver, result string) {
	// One counter series per (driver, result) pair.
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.restoreRequests, labels)
}
2026-04-21 06:45:04 -03:00
// RecordPolicyBackup counts a policy-cycle backup outcome.
2026-04-12 14:32:39 -03:00
func (t *telemetry) RecordPolicyBackup(result string) {
	// Policy backups are broken down by result only.
	labels := map[string]string{"result": result}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.policyBackups, labels)
}
2026-04-21 06:45:04 -03:00
// RecordNamespaceBackupRequest counts a namespace backup outcome by driver.
2026-04-12 14:32:39 -03:00
func (t *telemetry) RecordNamespaceBackupRequest(driver, result string) {
	// One counter series per (driver, result) pair.
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.namespaceBackupRequests, labels)
}
2026-04-21 06:45:04 -03:00
// RecordNamespaceRestoreRequest counts a namespace restore outcome by driver.
2026-04-12 14:32:39 -03:00
func (t *telemetry) RecordNamespaceRestoreRequest(driver, result string) {
	// One counter series per (driver, result) pair.
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.namespaceRestoreReqs, labels)
}
2026-04-21 06:45:04 -03:00
// RecordAuthzDenied counts authorization denials by reason.
2026-04-12 11:09:49 -03:00
func (t *telemetry) RecordAuthzDenied(reason string) {
	// Denials are broken down by reason only.
	labels := map[string]string{"reason": reason}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.authzDenials, labels)
}
2026-04-21 06:45:04 -03:00
// RecordInventoryFailure increments the inventory refresh failure counter.
2026-04-12 11:09:49 -03:00
func (t *telemetry) RecordInventoryFailure() {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Unlabelled counter, so a plain field is bumped instead of a map entry.
	t.inventoryRefreshFailure += 1
}
2026-04-21 06:45:04 -03:00
// RecordInventory updates PVC backup metrics from the latest inventory snapshot.
2026-04-12 11:09:49 -03:00
func ( t * telemetry ) RecordInventory ( inv api . InventoryResponse ) {
t . mu . Lock ( )
defer t . mu . Unlock ( )
t . pvcBackupAgeHours = map [ string ] metricSample { }
t . pvcBackupHealth = map [ string ] metricSample { }
2026-04-12 19:45:23 -03:00
t . pvcBackupHealthReason = map [ string ] metricSample { }
2026-04-12 11:09:49 -03:00
t . pvcBackupLastSuccess = map [ string ] metricSample { }
t . pvcBackupCount = map [ string ] metricSample { }
2026-04-12 19:45:23 -03:00
t . pvcBackupCompletedCount = map [ string ] metricSample { }
t . pvcBackupLastSizeBytes = map [ string ] metricSample { }
t . pvcBackupTotalSizeBytes = map [ string ] metricSample { }
2026-04-12 11:09:49 -03:00
for _ , namespace := range inv . Namespaces {
for _ , pvc := range namespace . PVCs {
labels := map [ string ] string {
"namespace" : pvc . Namespace ,
"pvc" : pvc . PVC ,
"volume" : pvc . Volume ,
"driver" : pvc . Driver ,
}
setMetric ( t . pvcBackupCount , labels , float64 ( pvc . BackupCount ) )
2026-04-12 19:45:23 -03:00
setMetric ( t . pvcBackupCompletedCount , labels , float64 ( pvc . CompletedBackups ) )
setMetric ( t . pvcBackupLastSizeBytes , labels , pvc . LastBackupSizeBytes )
setMetric ( t . pvcBackupTotalSizeBytes , labels , pvc . TotalBackupSizeBytes )
reasonLabels := cloneLabels ( labels )
reason := strings . TrimSpace ( pvc . HealthReason )
if reason == "" {
reason = "unknown"
}
reasonLabels [ "reason" ] = reason
setMetric ( t . pvcBackupHealthReason , reasonLabels , 1 )
2026-04-12 11:09:49 -03:00
if pvc . Healthy {
setMetric ( t . pvcBackupHealth , labels , 1 )
} else {
setMetric ( t . pvcBackupHealth , labels , 0 )
}
if pvc . LastBackupAt == "" {
continue
}
setMetric ( t . pvcBackupAgeHours , labels , pvc . LastBackupAgeHours )
if ts , ok := parseBackupTime ( pvc . LastBackupAt ) ; ok {
setMetric ( t . pvcBackupLastSuccess , labels , float64 ( ts . Unix ( ) ) )
}
}
}
t . inventoryRefreshTime = float64 ( time . Now ( ) . Unix ( ) )
}
2026-04-21 06:45:04 -03:00
// RecordB2Usage updates the B2 telemetry gauges from the latest scan result.
2026-04-12 19:45:23 -03:00
func (t *telemetry) RecordB2Usage(usage api.B2UsageResponse) {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Scan-level and account-wide gauges are overwritten on every call.
	var success float64
	if usage.Available {
		success = 1
	}
	t.b2ScanSuccess = success
	t.b2ScanDurationSeconds = float64(usage.ScanDurationMS) / 1000.0
	t.b2AccountObjects = float64(usage.TotalObjects)
	t.b2AccountBytes = float64(usage.TotalBytes)
	t.b2AccountRecentObjects = float64(usage.RecentObjects24h)
	t.b2AccountRecentBytes = float64(usage.RecentBytes24h)

	// Use the reported scan time when it parses as RFC 3339; an empty or
	// unparsable value falls back to the current time.
	scanTime := time.Now()
	if usage.ScannedAt != "" {
		if parsed, err := time.Parse(time.RFC3339, usage.ScannedAt); err == nil {
			scanTime = parsed
		}
	}
	t.b2ScanTimestamp = float64(scanTime.Unix())

	// Build the per-bucket families from scratch so buckets missing from this
	// scan are no longer exported.
	objects := map[string]metricSample{}
	bytesStored := map[string]metricSample{}
	recentObjects := map[string]metricSample{}
	recentBytes := map[string]metricSample{}
	lastModified := map[string]metricSample{}

	for _, bucket := range usage.Buckets {
		labels := map[string]string{"bucket": bucket.Name}
		setMetric(objects, labels, float64(bucket.ObjectCount))
		setMetric(bytesStored, labels, float64(bucket.TotalBytes))
		setMetric(recentObjects, labels, float64(bucket.RecentObjects24h))
		setMetric(recentBytes, labels, float64(bucket.RecentBytes24h))

		// The last-modified gauge is only written for a non-empty, valid
		// RFC 3339 timestamp.
		if bucket.LastModifiedAt != "" {
			if parsed, err := time.Parse(time.RFC3339, bucket.LastModifiedAt); err == nil {
				setMetric(lastModified, labels, float64(parsed.Unix()))
			}
		}
	}

	t.b2BucketObjects = objects
	t.b2BucketBytes = bytesStored
	t.b2BucketRecentObjects = recentObjects
	t.b2BucketRecentBytes = recentBytes
	t.b2BucketLastModified = lastModified
}
2026-04-12 11:09:49 -03:00
func ( t * telemetry ) render ( ) string {
t . mu . RLock ( )
defer t . mu . RUnlock ( )
var b strings . Builder
writeMetricFamily ( & b , "soteria_backup_requests_total" , "counter" , "Backup requests handled by Soteria." , metricValues ( t . backupRequests ) )
writeMetricFamily ( & b , "soteria_restore_requests_total" , "counter" , "Restore requests handled by Soteria." , metricValues ( t . restoreRequests ) )
2026-04-12 14:32:39 -03:00
writeMetricFamily ( & b , "soteria_policy_backups_total" , "counter" , "Policy scheduler backup execution outcomes." , metricValues ( t . policyBackups ) )
writeMetricFamily ( & b , "soteria_namespace_backup_requests_total" , "counter" , "Namespace-level backup request outcomes." , metricValues ( t . namespaceBackupRequests ) )
writeMetricFamily ( & b , "soteria_namespace_restore_requests_total" , "counter" , "Namespace-level restore request outcomes." , metricValues ( t . namespaceRestoreReqs ) )
2026-04-12 11:09:49 -03:00
writeMetricFamily ( & b , "soteria_authz_denials_total" , "counter" , "Authorization denials emitted by Soteria." , metricValues ( t . authzDenials ) )
writeMetricFamily ( & b , "soteria_inventory_refresh_failures_total" , "counter" , "Inventory refresh failures while computing PVC backup telemetry." , [ ] metricSample { { value : t . inventoryRefreshFailure } } )
writeMetricFamily ( & b , "soteria_inventory_refresh_timestamp_seconds" , "gauge" , "Unix timestamp of the last successful inventory refresh." , [ ] metricSample { { value : t . inventoryRefreshTime } } )
writeMetricFamily ( & b , "pvc_backup_age_hours" , "gauge" , "Age in hours of the latest successful PVC backup known to Soteria." , metricValues ( t . pvcBackupAgeHours ) )
writeMetricFamily ( & b , "pvc_backup_health" , "gauge" , "PVC backup health according to Soteria: 1=fresh backup within policy, 0=missing/stale/error." , metricValues ( t . pvcBackupHealth ) )
2026-04-12 19:45:23 -03:00
writeMetricFamily ( & b , "pvc_backup_health_reason" , "gauge" , "PVC backup health reason marker with reason label set to 1." , metricValues ( t . pvcBackupHealthReason ) )
2026-04-12 11:09:49 -03:00
writeMetricFamily ( & b , "pvc_backup_last_success_timestamp_seconds" , "gauge" , "Unix timestamp of the latest successful PVC backup known to Soteria." , metricValues ( t . pvcBackupLastSuccess ) )
writeMetricFamily ( & b , "pvc_backup_count" , "gauge" , "Count of backup records discovered for a PVC." , metricValues ( t . pvcBackupCount ) )
2026-04-12 19:45:23 -03:00
writeMetricFamily ( & b , "pvc_backup_completed_count" , "gauge" , "Count of completed backup records discovered for a PVC." , metricValues ( t . pvcBackupCompletedCount ) )
writeMetricFamily ( & b , "pvc_backup_last_size_bytes" , "gauge" , "Size in bytes of the latest completed backup for a PVC." , metricValues ( t . pvcBackupLastSizeBytes ) )
writeMetricFamily ( & b , "pvc_backup_total_size_bytes" , "gauge" , "Total bytes across discovered backup records for a PVC." , metricValues ( t . pvcBackupTotalSizeBytes ) )
writeMetricFamily ( & b , "soteria_b2_scan_success" , "gauge" , "Whether the latest B2 consumption scan succeeded (1) or failed (0)." , [ ] metricSample { { value : t . b2ScanSuccess } } )
writeMetricFamily ( & b , "soteria_b2_scan_timestamp_seconds" , "gauge" , "Unix timestamp of the latest B2 consumption scan attempt." , [ ] metricSample { { value : t . b2ScanTimestamp } } )
writeMetricFamily ( & b , "soteria_b2_scan_duration_seconds" , "gauge" , "Duration in seconds of the latest B2 consumption scan attempt." , [ ] metricSample { { value : t . b2ScanDurationSeconds } } )
writeMetricFamily ( & b , "soteria_b2_account_objects" , "gauge" , "Total object count discovered across scanned B2 buckets." , [ ] metricSample { { value : t . b2AccountObjects } } )
writeMetricFamily ( & b , "soteria_b2_account_bytes" , "gauge" , "Total stored bytes discovered across scanned B2 buckets." , [ ] metricSample { { value : t . b2AccountBytes } } )
writeMetricFamily ( & b , "soteria_b2_account_recent_objects_24h" , "gauge" , "Object count with LastModified in the last 24h across scanned B2 buckets." , [ ] metricSample { { value : t . b2AccountRecentObjects } } )
writeMetricFamily ( & b , "soteria_b2_account_recent_bytes_24h" , "gauge" , "Bytes with LastModified in the last 24h across scanned B2 buckets." , [ ] metricSample { { value : t . b2AccountRecentBytes } } )
writeMetricFamily ( & b , "soteria_b2_bucket_objects" , "gauge" , "Object count discovered per scanned B2 bucket." , metricValues ( t . b2BucketObjects ) )
writeMetricFamily ( & b , "soteria_b2_bucket_bytes" , "gauge" , "Stored bytes discovered per scanned B2 bucket." , metricValues ( t . b2BucketBytes ) )
writeMetricFamily ( & b , "soteria_b2_bucket_recent_objects_24h" , "gauge" , "Object count with LastModified in the last 24h per scanned B2 bucket." , metricValues ( t . b2BucketRecentObjects ) )
writeMetricFamily ( & b , "soteria_b2_bucket_recent_bytes_24h" , "gauge" , "Bytes with LastModified in the last 24h per scanned B2 bucket." , metricValues ( t . b2BucketRecentBytes ) )
writeMetricFamily ( & b , "soteria_b2_bucket_last_modified_timestamp_seconds" , "gauge" , "Unix timestamp of the most recent object observed in each scanned B2 bucket." , metricValues ( t . b2BucketLastModified ) )
2026-04-12 11:09:49 -03:00
return b . String ( )
}
// metricValues returns the samples of source ordered by their map key, so
// repeated renders of the same data list the series in the same order. The
// result is a fresh, non-nil slice even when source is empty.
func metricValues(source map[string]metricSample) []metricSample {
	ordered := make([]string, 0, len(source))
	for k := range source {
		ordered = append(ordered, k)
	}
	sort.Strings(ordered)

	out := make([]metricSample, len(ordered))
	for i, k := range ordered {
		out[i] = source[k]
	}
	return out
}
// writeMetricFamily appends one metric family to b: a "# HELP" line, a
// "# TYPE" line, then one "name{labels} value" line per sample. The header
// lines are written even when samples is empty. Values are formatted with %g.
func writeMetricFamily(b *strings.Builder, name, metricType, help string, samples []metricSample) {
	fmt.Fprintf(b, "# HELP %s %s\n", name, help)
	fmt.Fprintf(b, "# TYPE %s %s\n", name, metricType)
	for i := range samples {
		fmt.Fprintf(b, "%s%s %g\n", name, renderLabels(samples[i].labels), samples[i].value)
	}
}
// promLabelValueEscaper applies the only escapes the Prometheus text
// exposition format defines inside a quoted label value: backslash, double
// quote and line feed. All other bytes are emitted unchanged.
var promLabelValueEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// renderLabels formats labels as a Prometheus label block such as
// {driver="longhorn",result="ok"}, with label names in sorted order so equal
// label sets always render to the same string. It returns "" for a nil or
// empty map.
//
// Values are escaped per the text exposition format rather than with Go's %q.
// %q turns tabs and other control bytes into Go-only escapes (\t, \x1f, ...)
// that the format does not define, which corrupts the value on the scraper.
func renderLabels(labels map[string]string) string {
	if len(labels) == 0 {
		return ""
	}

	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	var b strings.Builder
	b.WriteByte('{')
	for i, name := range names {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(name)
		b.WriteString(`="`)
		b.WriteString(promLabelValueEscaper.Replace(labels[name]))
		b.WriteByte('"')
	}
	b.WriteByte('}')
	return b.String()
}
// metricKey derives the map key under which a label set is stored. It is the
// rendered label block, so two label sets share a key exactly when they
// render identically.
func metricKey(labels map[string]string) string {
	key := renderLabels(labels)
	return key
}
// incMetric adds one to the sample in target identified by labels, creating
// it with value 1 (and a private copy of labels) on first use. target must be
// non-nil; callers are expected to hold whatever lock guards it.
func incMetric(target map[string]metricSample, labels map[string]string) {
	key := metricKey(labels)
	if existing, found := target[key]; found {
		// Map values are not addressable, so bump a copy and store it back.
		existing.value++
		target[key] = existing
		return
	}
	target[key] = metricSample{labels: cloneLabels(labels), value: 1}
}
// setMetric stores value for the series identified by labels, replacing any
// earlier sample with the same label set. The sample keeps its own copy of
// labels, so the caller may keep mutating the map it passed in.
func setMetric(target map[string]metricSample, labels map[string]string, value float64) {
	sample := metricSample{
		labels: cloneLabels(labels),
		value:  value,
	}
	target[metricKey(labels)] = sample
}
// cloneLabels returns an independent copy of labels. The result is always a
// non-nil, writable map, including when labels is nil or empty.
func cloneLabels(labels map[string]string) map[string]string {
	dup := make(map[string]string, len(labels))
	for name := range labels {
		dup[name] = labels[name]
	}
	return dup
}