metis/pkg/service/metrics.go

189 lines
5.6 KiB
Go

package service
import (
"fmt"
"io"
"sort"
"strings"
"sync"
"time"
"metis/pkg/facts"
)
// Metrics is the in-memory registry behind the Prometheus text output that
// Render produces. The counter maps are keyed by counterKey-joined label
// values; all fields are read and written only while mu is held.
type Metrics struct {
	// mu is write-locked by the Record*/SetDriftTargets methods and
	// read-locked by Render.
	mu sync.RWMutex

	// builds counts RecordBuild calls per counterKey(node, status).
	builds map[string]int

	// flashes counts RecordFlash calls per counterKey(node, host, status).
	flashes map[string]int

	// snapshots counts RecordSnapshot calls per counterKey(node, status).
	snapshots map[string]int

	// lastSnapshotUnix maps a node to the Unix-seconds value of the most
	// recent non-zero timestamp handed to RecordSnapshot for it.
	lastSnapshotUnix map[string]float64

	// watches counts RecordWatch calls per counterKey(status).
	watches map[string]int

	// lastWatchSuccess is the Unix-seconds wall clock of the latest
	// RecordWatch("ok") call; it stays 0 until one happens.
	lastWatchSuccess float64

	// classDriftCounts maps a class to its number of populated target
	// fields; SetDriftTargets rebuilds it from scratch on every call.
	classDriftCounts map[string]int

	// lastWatchChangeSize is the `changed` value from the latest
	// SetDriftTargets call.
	lastWatchChangeSize float64
}
// NewMetrics returns an empty registry with every map allocated, so the
// Record* methods can increment counters right away. The two scalar gauges
// start at 0.
func NewMetrics() *Metrics {
	m := new(Metrics)
	m.builds = make(map[string]int)
	m.flashes = make(map[string]int)
	m.snapshots = make(map[string]int)
	m.lastSnapshotUnix = make(map[string]float64)
	m.watches = make(map[string]int)
	m.classDriftCounts = make(map[string]int)
	return m
}
// RecordBuild increments the build counter for the given node and status.
func (m *Metrics) RecordBuild(node, status string) {
	key := counterKey(node, status)

	m.mu.Lock()
	defer m.mu.Unlock()

	m.builds[key]++
}
// RecordFlash increments the flash counter for the given node, host and
// status.
func (m *Metrics) RecordFlash(node, host, status string) {
	key := counterKey(node, host, status)

	m.mu.Lock()
	defer m.mu.Unlock()

	m.flashes[key]++
}
// RecordSnapshot increments the snapshot counter for the given node and
// status. When ts is non-zero it also becomes the node's last snapshot
// timestamp, stored as whole Unix seconds; a zero ts leaves the previous
// timestamp untouched.
func (m *Metrics) RecordSnapshot(node, status string, ts time.Time) {
	key := counterKey(node, status)

	m.mu.Lock()
	defer m.mu.Unlock()

	m.snapshots[key]++
	if ts.IsZero() {
		return
	}
	m.lastSnapshotUnix[node] = float64(ts.Unix())
}
// RecordWatch increments the watch counter for status. Only the exact status
// "ok" also refreshes the last-success timestamp with the current wall clock
// in Unix seconds.
func (m *Metrics) RecordWatch(status string) {
	key := counterKey(status)

	m.mu.Lock()
	defer m.mu.Unlock()

	m.watches[key]++
	if status != "ok" {
		return
	}
	m.lastWatchSuccess = float64(time.Now().UTC().Unix())
}
// SetDriftTargets replaces the per-class field counts and stores changed as
// the last watch change size.
//
// For each class the count is the number of non-blank values among Kernel,
// OSImage, Containerd and K3sVersion (blank meaning empty after trimming
// whitespace) plus len(Packages). Classes missing from targets are dropped,
// because the previous map is discarded rather than updated.
func (m *Metrics) SetDriftTargets(targets map[string]facts.Targets, changed int) {
	m.mu.Lock()
	defer m.mu.Unlock()

	counts := make(map[string]int, len(targets))
	for class, target := range targets {
		populated := len(target.Packages)
		scalars := []string{target.Kernel, target.OSImage, target.Containerd, target.K3sVersion}
		for _, value := range scalars {
			if strings.TrimSpace(value) != "" {
				populated++
			}
		}
		counts[class] = populated
	}

	m.classDriftCounts = counts
	m.lastWatchChangeSize = float64(changed)
}
// promLabelEscaper escapes a label value as the Prometheus text exposition
// format requires: only backslash, line feed and double quote are escaped.
// Go's %q verb is not a substitute, because it also emits Go-only escapes
// such as \t, \r or \x00 that Prometheus parsers reject.
var promLabelEscaper = strings.NewReplacer(`\`, `\\`, "\n", `\n`, `"`, `\"`)

// Render writes a Prometheus text exposition response to w.
//
// The response is assembled in memory while the read lock is held and then
// handed to w in a single write after the lock has been released, so a slow
// writer cannot hold up the Record*/SetDriftTargets methods waiting for the
// write lock. Series inside each metric family are emitted in sorted key
// order. Write errors are dropped because the method has no error result.
func (m *Metrics) Render(w io.Writer) {
	// Best effort by design: there is no way to report a failed write.
	_, _ = io.WriteString(w, m.renderText())
}

// renderText builds the full exposition text from a consistent view of the
// registry. It holds the read lock only for the duration of the in-memory
// formatting.
func (m *Metrics) renderText() string {
	var b strings.Builder
	esc := promLabelEscaper.Replace

	m.mu.RLock()
	defer m.mu.RUnlock()

	fmt.Fprintln(&b, "# HELP metis_builds_total Replacement image builds by node and status")
	fmt.Fprintln(&b, "# TYPE metis_builds_total counter")
	for _, key := range sortedKeys(m.builds) {
		// splitKey pads to the requested length, so the indexing is safe.
		parts := splitKey(key, 2)
		node, status := parts[0], parts[1]
		fmt.Fprintf(&b, "metis_builds_total{node=\"%s\",status=\"%s\"} %d\n", esc(node), esc(status), m.builds[key])
	}

	fmt.Fprintln(&b, "# HELP metis_flashes_total Replacement flashes by node, host, and status")
	fmt.Fprintln(&b, "# TYPE metis_flashes_total counter")
	for _, key := range sortedKeys(m.flashes) {
		parts := splitKey(key, 3)
		node, host, status := parts[0], parts[1], parts[2]
		fmt.Fprintf(&b, "metis_flashes_total{node=\"%s\",host=\"%s\",status=\"%s\"} %d\n", esc(node), esc(host), esc(status), m.flashes[key])
	}

	fmt.Fprintln(&b, "# HELP metis_sentinel_snapshots_total Sentinel snapshots accepted by node and status")
	fmt.Fprintln(&b, "# TYPE metis_sentinel_snapshots_total counter")
	for _, key := range sortedKeys(m.snapshots) {
		parts := splitKey(key, 2)
		node, status := parts[0], parts[1]
		fmt.Fprintf(&b, "metis_sentinel_snapshots_total{node=\"%s\",status=\"%s\"} %d\n", esc(node), esc(status), m.snapshots[key])
	}

	fmt.Fprintln(&b, "# HELP metis_sentinel_snapshot_timestamp_seconds Last accepted sentinel snapshot timestamp by node")
	fmt.Fprintln(&b, "# TYPE metis_sentinel_snapshot_timestamp_seconds gauge")
	for _, node := range sortedKeys(m.lastSnapshotUnix) {
		fmt.Fprintf(&b, "metis_sentinel_snapshot_timestamp_seconds{node=\"%s\"} %.0f\n", esc(node), m.lastSnapshotUnix[node])
	}

	fmt.Fprintln(&b, "# HELP metis_sentinel_watch_total Sentinel watch runs by status")
	fmt.Fprintln(&b, "# TYPE metis_sentinel_watch_total counter")
	for _, key := range sortedKeys(m.watches) {
		status := splitKey(key, 1)[0]
		fmt.Fprintf(&b, "metis_sentinel_watch_total{status=\"%s\"} %d\n", esc(status), m.watches[key])
	}

	fmt.Fprintln(&b, "# HELP metis_sentinel_watch_last_success_timestamp_seconds Last successful sentinel watch timestamp")
	fmt.Fprintln(&b, "# TYPE metis_sentinel_watch_last_success_timestamp_seconds gauge")
	fmt.Fprintf(&b, "metis_sentinel_watch_last_success_timestamp_seconds %.0f\n", m.lastWatchSuccess)

	fmt.Fprintln(&b, "# HELP metis_sentinel_watch_changed_classes Number of class target sets changed by the last watch")
	fmt.Fprintln(&b, "# TYPE metis_sentinel_watch_changed_classes gauge")
	fmt.Fprintf(&b, "metis_sentinel_watch_changed_classes %.0f\n", m.lastWatchChangeSize)

	fmt.Fprintln(&b, "# HELP metis_class_target_fields Count of populated target fields per class")
	fmt.Fprintln(&b, "# TYPE metis_class_target_fields gauge")
	for _, class := range sortedKeys(m.classDriftCounts) {
		fmt.Fprintf(&b, "metis_class_target_fields{class=\"%s\"} %d\n", esc(class), m.classDriftCounts[class])
	}

	return b.String()
}
// counterKey joins label values into a single map key, separating them with
// a NUL byte. splitKey reverses the operation; a label value that itself
// contains a NUL byte cannot be recovered unambiguously.
func counterKey(parts ...string) string {
	const separator = "\x00"
	return strings.Join(parts, separator)
}
// splitKey breaks a counterKey-style key at its NUL separators and pads the
// result with empty strings until it holds at least want elements, so callers
// can index the first want positions without a bounds check. A key with more
// than want fields is returned in full, never truncated.
func splitKey(key string, want int) []string {
	fields := strings.Split(key, "\x00")
	if missing := want - len(fields); missing > 0 {
		fields = append(fields, make([]string, missing)...)
	}
	return fields
}
// sortedKeys returns the keys of m in ascending order, for any value type.
// The result is a fresh, non-nil slice even when m is empty or nil.
func sortedKeys[T any](m map[string]T) []string {
	names := make([]string, len(m))
	next := 0
	for name := range m {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
// sortedFloatKeys returns the keys of a float64-valued map in ascending
// order. It does the same work as the generic sortedKeys, specialised to
// map[string]float64. The result is a fresh, non-nil slice even when m is
// empty or nil.
func sortedFloatKeys(m map[string]float64) []string {
	names := make([]string, 0, len(m))
	for name := range m {
		names = append(names, name)
	}
	sort.Sort(sort.StringSlice(names))
	return names
}
// sortedFloatKeysInt returns the keys of an int-valued map in ascending
// order. Despite the "Float" in its name it takes map[string]int, and it does
// the same work as the generic sortedKeys. The result is a fresh, non-nil
// slice even when m is empty or nil.
func sortedFloatKeysInt(m map[string]int) []string {
	names := make(sort.StringSlice, len(m))
	next := 0
	for name := range m {
		names[next] = name
		next++
	}
	names.Sort()
	return []string(names)
}