// soteria/internal/server/metrics.go
//
// 204 lines
// 6.4 KiB
// Go

package server
import (
"fmt"
"net/http"
"sort"
"strings"
"sync"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
)
// metricSample is one exposition sample: a label set together with its
// current numeric value.
type metricSample struct {
// labels holds the label name/value pairs rendered inside the braces of the
// sample line. It may be nil or empty, in which case no braces are rendered.
labels map[string]string
// value is the number written after the metric name and labels.
value float64
}
// telemetry is the in-memory metric store behind the metrics endpoint.
//
// Each map is keyed by the rendered label set (see metricKey), so there is one
// sample per distinct label combination. The Record* methods take the write
// lock; render takes the read lock.
type telemetry struct {
// mu guards every field below.
mu sync.RWMutex
// backupRequests counts backup requests per driver/result pair
// (exported as soteria_backup_requests_total).
backupRequests map[string]metricSample
// restoreRequests counts restore requests per driver/result pair
// (exported as soteria_restore_requests_total).
restoreRequests map[string]metricSample
// authzDenials counts authorization denials per reason
// (exported as soteria_authz_denials_total).
authzDenials map[string]metricSample
// inventoryRefreshFailure is incremented by RecordInventoryFailure
// (exported as soteria_inventory_refresh_failures_total).
inventoryRefreshFailure float64
// inventoryRefreshTime is the Unix time, in seconds, of the most recent
// RecordInventory call; it stays 0 until the first call.
inventoryRefreshTime float64
// The pvcBackup* maps are rebuilt from scratch on every RecordInventory call
// and hold one sample per namespace/pvc/volume/driver label set.
pvcBackupAgeHours map[string]metricSample
pvcBackupHealth map[string]metricSample
pvcBackupLastSuccess map[string]metricSample
pvcBackupCount map[string]metricSample
}
// newTelemetry returns a telemetry store whose sample maps are all allocated
// and empty, so the Record* methods can write to them immediately. The two
// scalar inventory fields start at zero.
func newTelemetry() *telemetry {
	t := new(telemetry)

	// Request and denial counters.
	t.backupRequests = make(map[string]metricSample)
	t.restoreRequests = make(map[string]metricSample)
	t.authzDenials = make(map[string]metricSample)

	// Per-PVC gauges populated by RecordInventory.
	t.pvcBackupAgeHours = make(map[string]metricSample)
	t.pvcBackupHealth = make(map[string]metricSample)
	t.pvcBackupLastSuccess = make(map[string]metricSample)
	t.pvcBackupCount = make(map[string]metricSample)

	return t
}
// Handler returns an http.Handler that answers every request with the current
// metrics snapshot, served as text/plain with the exposition version 0.0.4
// content type.
func (t *telemetry) Handler() http.Handler {
	serve := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
		payload := []byte(t.render())
		// The write result is deliberately discarded, as in the rest of this
		// handler there is no error path to report it on.
		_, _ = w.Write(payload)
	}
	return http.HandlerFunc(serve)
}
// RecordBackupRequest adds one to the backup request counter for the given
// driver/result label pair.
func (t *telemetry) RecordBackupRequest(driver, result string) {
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.backupRequests, labels)
}
// RecordRestoreRequest adds one to the restore request counter for the given
// driver/result label pair.
func (t *telemetry) RecordRestoreRequest(driver, result string) {
	labels := map[string]string{
		"driver": driver,
		"result": result,
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.restoreRequests, labels)
}
// RecordAuthzDenied adds one to the authorization denial counter labelled
// with the given reason.
func (t *telemetry) RecordAuthzDenied(reason string) {
	labels := map[string]string{"reason": reason}

	t.mu.Lock()
	defer t.mu.Unlock()
	incMetric(t.authzDenials, labels)
}
// RecordInventoryFailure adds one to the inventory refresh failure counter.
// It does not touch the per-PVC gauges or the last-refresh timestamp.
func (t *telemetry) RecordInventoryFailure() {
	t.mu.Lock()
	defer t.mu.Unlock()

	t.inventoryRefreshFailure++
}
// RecordInventory replaces all per-PVC backup gauges with values derived from
// inv and stamps the refresh time with the current Unix time.
//
// Every PVC in inv gets a backup-count sample and a health sample (1 when
// pvc.Healthy, 0 otherwise). Age and last-success samples are written only for
// PVCs with a non-empty LastBackupAt, and the last-success sample additionally
// requires parseBackupTime to accept that value. Because the gauge maps are
// rebuilt from scratch, PVCs absent from inv disappear from the output.
func (t *telemetry) RecordInventory(inv api.InventoryResponse) {
	t.mu.Lock()
	defer t.mu.Unlock()

	// Install fresh maps first, then fill them through local aliases.
	counts := map[string]metricSample{}
	health := map[string]metricSample{}
	ages := map[string]metricSample{}
	lastSuccess := map[string]metricSample{}
	t.pvcBackupAgeHours = ages
	t.pvcBackupHealth = health
	t.pvcBackupLastSuccess = lastSuccess
	t.pvcBackupCount = counts

	for _, ns := range inv.Namespaces {
		for _, pvc := range ns.PVCs {
			labels := map[string]string{
				"namespace": pvc.Namespace,
				"pvc":       pvc.PVC,
				"volume":    pvc.Volume,
				"driver":    pvc.Driver,
			}

			setMetric(counts, labels, float64(pvc.BackupCount))

			healthy := 0.0
			if pvc.Healthy {
				healthy = 1
			}
			setMetric(health, labels, healthy)

			// Without a recorded backup there is no age or timestamp to report.
			if pvc.LastBackupAt != "" {
				setMetric(ages, labels, pvc.LastBackupAgeHours)
				if ts, ok := parseBackupTime(pvc.LastBackupAt); ok {
					setMetric(lastSuccess, labels, float64(ts.Unix()))
				}
			}
		}
	}

	t.inventoryRefreshTime = float64(time.Now().Unix())
}
// render returns every metric family as text, in a fixed order, with the
// samples of each map-backed family sorted by their label key. The read lock
// is held for the whole call so the snapshot is internally consistent.
func (t *telemetry) render() string {
	t.mu.RLock()
	defer t.mu.RUnlock()

	families := []struct {
		name    string
		kind    string
		help    string
		samples []metricSample
	}{
		{"soteria_backup_requests_total", "counter", "Backup requests handled by Soteria.", metricValues(t.backupRequests)},
		{"soteria_restore_requests_total", "counter", "Restore requests handled by Soteria.", metricValues(t.restoreRequests)},
		{"soteria_authz_denials_total", "counter", "Authorization denials emitted by Soteria.", metricValues(t.authzDenials)},
		{"soteria_inventory_refresh_failures_total", "counter", "Inventory refresh failures while computing PVC backup telemetry.", []metricSample{{value: t.inventoryRefreshFailure}}},
		{"soteria_inventory_refresh_timestamp_seconds", "gauge", "Unix timestamp of the last successful inventory refresh.", []metricSample{{value: t.inventoryRefreshTime}}},
		{"pvc_backup_age_hours", "gauge", "Age in hours of the latest successful PVC backup known to Soteria.", metricValues(t.pvcBackupAgeHours)},
		{"pvc_backup_health", "gauge", "PVC backup health according to Soteria: 1=fresh backup within policy, 0=missing/stale/error.", metricValues(t.pvcBackupHealth)},
		{"pvc_backup_last_success_timestamp_seconds", "gauge", "Unix timestamp of the latest successful PVC backup known to Soteria.", metricValues(t.pvcBackupLastSuccess)},
		{"pvc_backup_count", "gauge", "Count of backup records discovered for a PVC.", metricValues(t.pvcBackupCount)},
	}

	var out strings.Builder
	for _, family := range families {
		writeMetricFamily(&out, family.name, family.kind, family.help, family.samples)
	}
	return out.String()
}
// metricValues returns the samples of source ordered by ascending map key, so
// repeated renders list them in a stable order. The result is always non-nil.
func metricValues(source map[string]metricSample) []metricSample {
	ordered := make([]string, 0, len(source))
	for k := range source {
		ordered = append(ordered, k)
	}
	sort.Strings(ordered)

	samples := make([]metricSample, len(ordered))
	for i, k := range ordered {
		samples[i] = source[k]
	}
	return samples
}
// writeMetricFamily appends one metric family to b: a "# HELP" line, a
// "# TYPE" line, then one "name{labels} value" line per sample in the order
// given. Values are formatted with %g. The header lines are written even when
// samples is empty.
func writeMetricFamily(b *strings.Builder, name, metricType, help string, samples []metricSample) {
	fmt.Fprintf(b, "# HELP %s %s\n", name, help)
	fmt.Fprintf(b, "# TYPE %s %s\n", name, metricType)
	for i := range samples {
		fmt.Fprintf(b, "%s%s %g\n", name, renderLabels(samples[i].labels), samples[i].value)
	}
}
// labelValueEscaper applies the only escapes the Prometheus text exposition
// format defines for label values: backslash, double quote and line feed.
var labelValueEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// renderLabels formats labels as a brace-enclosed, comma-separated list of
// name="value" pairs sorted by label name, e.g. {driver="x",result="ok"}.
// It returns "" for a nil or empty map.
//
// Label values are escaped according to the text exposition format rather
// than Go string-literal syntax: Go quoting would emit sequences such as \t or
// \x01 that the exposition format does not define. Label names are written
// verbatim and are not validated.
func renderLabels(labels map[string]string) string {
	if len(labels) == 0 {
		return ""
	}

	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	// Map iteration order is random; sort so the output (also used as a map
	// key by metricKey) is deterministic.
	sort.Strings(names)

	var b strings.Builder
	b.WriteByte('{')
	for i, name := range names {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(name)
		b.WriteString(`="`)
		b.WriteString(labelValueEscaper.Replace(labels[name]))
		b.WriteByte('"')
	}
	b.WriteByte('}')
	return b.String()
}
// metricKey returns the map key under which a sample with the given labels is
// stored. The key is the rendered label set, so label sets that render to the
// same text share one sample.
func metricKey(labels map[string]string) string {
	key := renderLabels(labels)
	return key
}
// incMetric adds one to the sample in target identified by labels, creating
// it with value 1 if it does not exist yet. A new sample stores its own copy
// of labels so later changes by the caller do not affect it.
func incMetric(target map[string]metricSample, labels map[string]string) {
	id := metricKey(labels)

	if existing, found := target[id]; found {
		existing.value++
		target[id] = existing
		return
	}

	target[id] = metricSample{labels: cloneLabels(labels), value: 1}
}
// setMetric stores value as the sample in target identified by labels,
// overwriting any previous sample with the same label set. The sample keeps
// its own copy of labels.
func setMetric(target map[string]metricSample, labels map[string]string, value float64) {
	sample := metricSample{
		labels: cloneLabels(labels),
		value:  value,
	}
	target[metricKey(labels)] = sample
}
// cloneLabels returns a new map holding the same name/value pairs as labels.
// The result is never nil, even when labels is nil, and shares no storage
// with the input.
func cloneLabels(labels map[string]string) map[string]string {
	dup := make(map[string]string, len(labels))
	for name := range labels {
		dup[name] = labels[name]
	}
	return dup
}