199 lines
6.3 KiB
Go
199 lines
6.3 KiB
Go
package service
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"metis/pkg/facts"
|
|
)
|
|
|
|
// Metrics is the in-memory registry behind the Prometheus text endpoint
// exposed by Metis. All fields are guarded by mu; use NewMetrics to obtain a
// value whose maps are ready for writing.
type Metrics struct {
	// mu guards every field below.
	mu sync.RWMutex

	// builds counts build outcomes, keyed by counterKey(node, status).
	builds map[string]int
	// flashes counts flash outcomes, keyed by counterKey(node, host, status).
	flashes map[string]int
	// snapshots counts sentinel snapshots, keyed by counterKey(node, status).
	snapshots map[string]int
	// lastSnapshotUnix maps a node to the Unix-seconds timestamp of the most
	// recent snapshot recorded for it with a non-zero time.
	lastSnapshotUnix map[string]float64
	// watches counts sentinel watch runs, keyed by counterKey(status).
	watches map[string]int
	// lastWatchSuccess is the Unix-seconds time of the latest watch recorded
	// with status "ok"; it stays 0 until one is recorded.
	lastWatchSuccess float64
	// classDriftCounts maps a class to the number of populated target fields
	// computed by SetDriftTargets.
	classDriftCounts map[string]int
	// lastWatchChangeSize is the "changed" value most recently handed to
	// SetDriftTargets.
	lastWatchChangeSize float64
}
|
|
|
|
// NewMetrics builds a zero-value metrics registry.
|
|
func NewMetrics() *Metrics {
|
|
return &Metrics{
|
|
builds: map[string]int{},
|
|
flashes: map[string]int{},
|
|
snapshots: map[string]int{},
|
|
lastSnapshotUnix: map[string]float64{},
|
|
watches: map[string]int{},
|
|
classDriftCounts: map[string]int{},
|
|
}
|
|
}
|
|
|
|
// RecordBuild increments the per-node build counter because the UI and
|
|
// Prometheus graphs need a stable view of build outcomes by node.
|
|
func (m *Metrics) RecordBuild(node, status string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.builds[counterKey(node, status)]++
|
|
}
|
|
|
|
// RecordFlash increments the per-node and per-host flash counter because the
|
|
// replacement workflow needs separate visibility for build and burn stages.
|
|
func (m *Metrics) RecordFlash(node, host, status string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.flashes[counterKey(node, host, status)]++
|
|
}
|
|
|
|
// RecordSnapshot tracks accepted sentinel snapshots because drift detection
|
|
// depends on the last successful push per node.
|
|
func (m *Metrics) RecordSnapshot(node, status string, ts time.Time) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.snapshots[counterKey(node, status)]++
|
|
if !ts.IsZero() {
|
|
m.lastSnapshotUnix[node] = float64(ts.Unix())
|
|
}
|
|
}
|
|
|
|
// RecordWatch increments the sentinel watch outcome counter because the
|
|
// dashboard needs to show whether the latest reconciliation succeeded.
|
|
func (m *Metrics) RecordWatch(status string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.watches[counterKey(status)]++
|
|
if status == "ok" {
|
|
m.lastWatchSuccess = float64(time.Now().UTC().Unix())
|
|
}
|
|
}
|
|
|
|
// SetDriftTargets refreshes the target-count gauge because the UI exposes how
|
|
// much class configuration is already populated versus still missing.
|
|
func (m *Metrics) SetDriftTargets(targets map[string]facts.Targets, changed int) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.classDriftCounts = map[string]int{}
|
|
for class, target := range targets {
|
|
count := 0
|
|
if strings.TrimSpace(target.Kernel) != "" {
|
|
count++
|
|
}
|
|
if strings.TrimSpace(target.OSImage) != "" {
|
|
count++
|
|
}
|
|
if strings.TrimSpace(target.Containerd) != "" {
|
|
count++
|
|
}
|
|
if strings.TrimSpace(target.K3sVersion) != "" {
|
|
count++
|
|
}
|
|
count += len(target.Packages)
|
|
m.classDriftCounts[class] = count
|
|
}
|
|
m.lastWatchChangeSize = float64(changed)
|
|
}
|
|
|
|
// Render writes a Prometheus text exposition response.
|
|
func (m *Metrics) Render(w io.Writer) {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
fmt.Fprintln(w, "# HELP metis_builds_total Replacement image builds by node and status")
|
|
fmt.Fprintln(w, "# TYPE metis_builds_total counter")
|
|
for _, key := range sortedKeys(m.builds) {
|
|
parts := splitKey(key, 2)
|
|
node, status := parts[0], parts[1]
|
|
fmt.Fprintf(w, "metis_builds_total{node=%q,status=%q} %d\n", node, status, m.builds[key])
|
|
}
|
|
|
|
fmt.Fprintln(w, "# HELP metis_flashes_total Replacement flashes by node, host, and status")
|
|
fmt.Fprintln(w, "# TYPE metis_flashes_total counter")
|
|
for _, key := range sortedKeys(m.flashes) {
|
|
parts := splitKey(key, 3)
|
|
node, host, status := parts[0], parts[1], parts[2]
|
|
fmt.Fprintf(w, "metis_flashes_total{node=%q,host=%q,status=%q} %d\n", node, host, status, m.flashes[key])
|
|
}
|
|
|
|
fmt.Fprintln(w, "# HELP metis_sentinel_snapshots_total Sentinel snapshots accepted by node and status")
|
|
fmt.Fprintln(w, "# TYPE metis_sentinel_snapshots_total counter")
|
|
for _, key := range sortedKeys(m.snapshots) {
|
|
parts := splitKey(key, 2)
|
|
node, status := parts[0], parts[1]
|
|
fmt.Fprintf(w, "metis_sentinel_snapshots_total{node=%q,status=%q} %d\n", node, status, m.snapshots[key])
|
|
}
|
|
|
|
fmt.Fprintln(w, "# HELP metis_sentinel_snapshot_timestamp_seconds Last accepted sentinel snapshot timestamp by node")
|
|
fmt.Fprintln(w, "# TYPE metis_sentinel_snapshot_timestamp_seconds gauge")
|
|
for _, node := range sortedFloatKeys(m.lastSnapshotUnix) {
|
|
fmt.Fprintf(w, "metis_sentinel_snapshot_timestamp_seconds{node=%q} %.0f\n", node, m.lastSnapshotUnix[node])
|
|
}
|
|
|
|
fmt.Fprintln(w, "# HELP metis_sentinel_watch_total Sentinel watch runs by status")
|
|
fmt.Fprintln(w, "# TYPE metis_sentinel_watch_total counter")
|
|
for _, key := range sortedKeys(m.watches) {
|
|
status := splitKey(key, 1)[0]
|
|
fmt.Fprintf(w, "metis_sentinel_watch_total{status=%q} %d\n", status, m.watches[key])
|
|
}
|
|
|
|
fmt.Fprintln(w, "# HELP metis_sentinel_watch_last_success_timestamp_seconds Last successful sentinel watch timestamp")
|
|
fmt.Fprintln(w, "# TYPE metis_sentinel_watch_last_success_timestamp_seconds gauge")
|
|
fmt.Fprintf(w, "metis_sentinel_watch_last_success_timestamp_seconds %.0f\n", m.lastWatchSuccess)
|
|
|
|
fmt.Fprintln(w, "# HELP metis_sentinel_watch_changed_classes Number of class target sets changed by the last watch")
|
|
fmt.Fprintln(w, "# TYPE metis_sentinel_watch_changed_classes gauge")
|
|
fmt.Fprintf(w, "metis_sentinel_watch_changed_classes %.0f\n", m.lastWatchChangeSize)
|
|
|
|
fmt.Fprintln(w, "# HELP metis_class_target_fields Count of populated target fields per class")
|
|
fmt.Fprintln(w, "# TYPE metis_class_target_fields gauge")
|
|
for _, class := range sortedFloatKeysInt(m.classDriftCounts) {
|
|
fmt.Fprintf(w, "metis_class_target_fields{class=%q} %d\n", class, m.classDriftCounts[class])
|
|
}
|
|
}
|
|
|
|
// counterKey packs label values into a single map key by joining them with a
// NUL byte. splitKey reverses the packing. With no parts the key is the empty
// string.
func counterKey(parts ...string) string {
	const sep = "\x00"

	var key strings.Builder
	for i, part := range parts {
		if i > 0 {
			key.WriteString(sep)
		}
		key.WriteString(part)
	}
	return key.String()
}
|
|
|
|
// splitKey breaks a key built by counterKey back into its NUL-separated
// components. If the key has fewer than want components, the result is padded
// with empty strings so callers can index the first want entries safely. A
// key with more components than want is returned in full, not truncated.
func splitKey(key string, want int) []string {
	fields := strings.Split(key, "\x00")
	if missing := want - len(fields); missing > 0 {
		fields = append(fields, make([]string, missing)...)
	}
	return fields
}
|
|
|
|
// sortedKeys returns the keys of m in ascending lexical order, whatever the
// value type. The result is always non-nil, and empty for an empty or nil
// map.
func sortedKeys[T any](m map[string]T) []string {
	names := make([]string, len(m))
	next := 0
	for name := range m {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
|
|
|
|
// sortedFloatKeys returns the keys of a float64-valued map in ascending
// lexical order. Despite the name, only the string keys are sorted; the
// float values play no part in the ordering. The result is always non-nil.
func sortedFloatKeys(m map[string]float64) []string {
	names := make([]string, 0, len(m))
	for name := range m {
		names = append(names, name)
	}
	sort.Sort(sort.StringSlice(names))
	return names
}
|
|
|
|
// sortedFloatKeysInt returns the keys of an int-valued map in ascending
// lexical order. Despite the name, no floats are involved; only the string
// keys are sorted. The result is always non-nil.
func sortedFloatKeysInt(m map[string]int) []string {
	ordered := make(sort.StringSlice, 0, len(m))
	for name := range m {
		ordered = append(ordered, name)
	}
	ordered.Sort()
	return []string(ordered)
}
|