// ananke/internal/metrics/exporter.go
//
// 231 lines, 9.1 KiB, Go (repository viewer header).

package metrics
import (
"fmt"
"net/http"
"os"
"sort"
"strings"
"sync"
"time"
)
// Sample is the most recent reading for one UPS source together with the
// shutdown-evaluation state that the exporter publishes for it.
type Sample struct {
	// Name identifies the source; UpdateSample stores the sample under this key
	// and it is exported as the "source" label.
	Name string
	// Target is exported verbatim as the "target" label.
	Target string
	// OnBattery is exported as ananke_ups_on_battery (1 when true).
	OnBattery bool
	// LowBattery is exported as ananke_ups_low_battery (1 when true).
	LowBattery bool
	// RuntimeSecond is exported as ananke_ups_runtime_seconds.
	RuntimeSecond int
	// BatteryCharge is exported as ananke_ups_battery_charge_percent.
	BatteryCharge float64
	// LoadPercent is exported as ananke_ups_load_percent.
	LoadPercent float64
	// PowerNominalW is exported as ananke_ups_power_nominal_watts.
	PowerNominalW float64
	// ThresholdSec is exported as ananke_ups_threshold_seconds.
	ThresholdSec int
	// Trigger is exported as ananke_ups_trigger_active (1 when true).
	Trigger bool
	// BreachCount is exported as ananke_ups_breach_count.
	BreachCount int
	// Status is exported verbatim as the "status" label.
	Status string
	// LastError makes ananke_ups_error report 1 whenever it is non-empty; the
	// text itself is not exported.
	LastError string
	// UpdatedAt is exported as ananke_ups_last_sample_timestamp_seconds;
	// UpdateSample fills it with the current UTC time when it is zero.
	UpdatedAt time.Time
}
// Exporter keeps the shutdown counters and the latest Sample per source and
// serves them over HTTP in the Prometheus text format.
type Exporter struct {
	// mu guards every field below: the Update*/Mark* methods take the write
	// lock and serveMetrics takes the read lock.
	mu sync.RWMutex
	// shutdownBudgetSec is set by UpdateBudget and exported as
	// ananke_shutdown_budget_seconds.
	shutdownBudgetSec int
	// shutdownTriggers counts MarkShutdown calls and is exported as
	// ananke_shutdown_triggers_total.
	shutdownTriggers int
	// lastShutdownReason is the reason passed to the latest MarkShutdown call;
	// it is attached as the "last_reason" label to every ananke_ups_* series.
	lastShutdownReason string
	// lastShutdownAt is the UTC time of the latest MarkShutdown call (zero
	// until the first call).
	lastShutdownAt time.Time
	// samples holds the latest Sample per source, keyed by Sample.Name.
	samples map[string]Sample
}
// New returns an empty Exporter whose sample map is already allocated.
// Signature: New() *Exporter.
// Why: UpdateSample stores into the samples map, so the map has to exist before the first sample arrives.
func New() *Exporter {
	e := new(Exporter)
	e.samples = map[string]Sample{}
	return e
}
// UpdateBudget records the shutdown budget, in seconds, that serveMetrics exports as ananke_shutdown_budget_seconds.
// Signature: (e *Exporter) UpdateBudget(seconds int).
// Why: the value is written under the exporter lock because scrapes read it concurrently.
func (e *Exporter) UpdateBudget(seconds int) {
	e.mu.Lock()
	e.shutdownBudgetSec = seconds
	e.mu.Unlock()
}
// UpdateSample stores s as the latest sample for the source named s.Name, replacing any previous one.
// Signature: (e *Exporter) UpdateSample(s Sample).
// Why: a zero UpdatedAt is replaced with the current UTC time so every stored sample carries a timestamp,
// and the map is allocated on demand so an Exporter that was not built with New does not panic here.
func (e *Exporter) UpdateSample(s Sample) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if s.UpdatedAt.IsZero() {
		s.UpdatedAt = time.Now().UTC()
	}
	// Assigning into a nil map panics; a zero-value Exporter has a nil map.
	if e.samples == nil {
		e.samples = make(map[string]Sample)
	}
	e.samples[s.Name] = s
}
// MarkShutdown counts one shutdown trigger and remembers its reason and the current UTC time.
// Signature: (e *Exporter) MarkShutdown(reason string).
// Why: the counter, reason and timestamp are updated together under the lock so a scrape never sees a partial update.
func (e *Exporter) MarkShutdown(reason string) {
	e.mu.Lock()
	e.shutdownTriggers++
	e.lastShutdownReason, e.lastShutdownAt = reason, time.Now().UTC()
	e.mu.Unlock()
}
// Handler returns a mux that serves the metrics at path ("/metrics" when path is empty) and a liveness probe at /healthz.
// Signature: (e *Exporter) Handler(path string) http.Handler.
// Why: a dedicated mux keeps the exporter routes off http.DefaultServeMux, so callers decide where to mount them.
func (e *Exporter) Handler(path string) http.Handler {
	if path == "" {
		path = "/metrics"
	}
	healthz := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok\n"))
	}

	mux := http.NewServeMux()
	mux.Handle(path, http.HandlerFunc(e.serveMetrics))
	mux.Handle("/healthz", http.HandlerFunc(healthz))
	return mux
}
// serveMetrics answers a scrape with the exporter state followed by the quality-gate series, in Prometheus text format.
// Signature: (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request).
// Why: the exporter lock is held only while the in-memory state is formatted; the quality-gate file read and the
// network write happen after it is released, so a slow disk or scraper cannot stall UpdateSample or MarkShutdown.
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
	var b strings.Builder
	e.writeStateMetrics(&b)
	appendQualityGateMetrics(&b)

	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
	// Best effort: a failed write means the scraper went away; there is nobody left to report it to.
	_, _ = w.Write([]byte(b.String()))
}

// writeStateMetrics appends the shutdown series and one set of ananke_ups_* series per stored sample to b.
// Signature: (e *Exporter) writeStateMetrics(b *strings.Builder).
// Why: samples are emitted in sorted source order so consecutive scrapes are stable, and label values are escaped
// exactly once with the escapes the text format defines (\\, \" and \n); %q is not used because it would escape the
// output of safe a second time and emit Go-only escapes such as \t or \x00.
func (e *Exporter) writeStateMetrics(b *strings.Builder) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
	b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n")
	fmt.Fprintf(b, "ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)

	b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
	b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n")
	fmt.Fprintf(b, "ananke_shutdown_triggers_total %d\n", e.shutdownTriggers)

	b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
	b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n")
	// A zero time.Time has a large negative Unix value, so "never triggered" is reported as 0 instead.
	var lastTrigger int64
	if !e.lastShutdownAt.IsZero() {
		lastTrigger = e.lastShutdownAt.Unix()
	}
	fmt.Fprintf(b, "ananke_shutdown_last_trigger_timestamp_seconds %d\n", lastTrigger)

	const upsHeaders = "# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n" +
		"# TYPE ananke_ups_on_battery gauge\n" +
		"# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n" +
		"# TYPE ananke_ups_low_battery gauge\n" +
		"# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n" +
		"# TYPE ananke_ups_runtime_seconds gauge\n" +
		"# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n" +
		"# TYPE ananke_ups_battery_charge_percent gauge\n" +
		"# HELP ananke_ups_load_percent UPS output load percentage.\n" +
		"# TYPE ananke_ups_load_percent gauge\n" +
		"# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n" +
		"# TYPE ananke_ups_power_nominal_watts gauge\n" +
		"# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n" +
		"# TYPE ananke_ups_threshold_seconds gauge\n" +
		"# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n" +
		"# TYPE ananke_ups_trigger_active gauge\n" +
		"# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n" +
		"# TYPE ananke_ups_breach_count gauge\n" +
		"# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n" +
		"# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n" +
		"# HELP ananke_ups_error Whether the last sample had an error.\n" +
		"# TYPE ananke_ups_error gauge\n"
	b.WriteString(upsHeaders)

	// label renders v as the inside of a quoted label value: invalid UTF-8 is replaced, safe escapes backslashes
	// and quotes, and newlines are escaped last so the backslash they introduce is not doubled.
	label := func(v string) string {
		return strings.ReplaceAll(safe(strings.ToValidUTF8(v, "\uFFFD")), "\n", `\n`)
	}
	lastReason := label(e.lastShutdownReason) // identical for every sample, so escape it once

	names := make([]string, 0, len(e.samples))
	for name := range e.samples {
		names = append(names, name)
	}
	sort.Strings(names)

	for _, name := range names {
		s := e.samples[name]
		labels := fmt.Sprintf(`{source="%s",target="%s",status="%s",last_reason="%s"}`,
			label(name), label(s.Target), label(s.Status), lastReason)

		var updated int64
		if !s.UpdatedAt.IsZero() {
			updated = s.UpdatedAt.Unix()
		}

		fmt.Fprintf(b, "ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))
		fmt.Fprintf(b, "ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))
		fmt.Fprintf(b, "ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)
		fmt.Fprintf(b, "ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge)
		fmt.Fprintf(b, "ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent)
		fmt.Fprintf(b, "ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW)
		fmt.Fprintf(b, "ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)
		fmt.Fprintf(b, "ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))
		fmt.Fprintf(b, "ananke_ups_breach_count%s %d\n", labels, s.BreachCount)
		fmt.Fprintf(b, "ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, updated)
		fmt.Fprintf(b, "ananke_ups_error%s %d\n", labels, boolNum(s.LastError != ""))
	}
}
// appendQualityGateMetrics appends the contents of the quality-gate metrics file to dst, or baseline series when
// the file cannot be read or holds only whitespace.
// Signature: appendQualityGateMetrics(dst *strings.Builder).
// Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
// Grafana can track Ananke suite health over time.
func appendQualityGateMetrics(dst *strings.Builder) {
	// The file location comes from ANANKE_QUALITY_METRICS_FILE, with a fixed default when unset or blank.
	path := strings.TrimSpace(os.Getenv("ANANKE_QUALITY_METRICS_FILE"))
	if path == "" {
		path = "/var/lib/ananke/quality-gate.prom"
	}

	raw, err := os.ReadFile(path)
	text := strings.TrimSpace(string(raw))
	// Deliberately best effort: any read error is treated like a missing file.
	if err != nil || text == "" {
		appendDefaultQualityGateMetrics(dst)
		return
	}

	// A blank line separates the file contents from whatever dst already holds.
	if dst.Len() > 0 {
		dst.WriteString("\n")
	}
	dst.WriteString(text)
	// text was trimmed above, so it never ends in a newline; always terminate the last line.
	dst.WriteString("\n")
}
// appendDefaultQualityGateMetrics appends zero-valued ananke_quality_gate_runs_total series for the "ok" and
// "failed" statuses to dst, preceded by a blank line when dst already has content.
// Signature: appendDefaultQualityGateMetrics(dst *strings.Builder).
// Why: Grafana should always have baseline Ananke quality-gate series so
// overview panels never show "no data" before the first host-side gate run.
func appendDefaultQualityGateMetrics(dst *strings.Builder) {
	const baseline = "# HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.\n" +
		"# TYPE ananke_quality_gate_runs_total counter\n" +
		"ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"} 0\n" +
		"ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"} 0\n"

	if dst.Len() != 0 {
		dst.WriteString("\n")
	}
	dst.WriteString(baseline)
}
// boolNum converts a bool to the 0/1 integer used as a metric value.
// Signature: boolNum(v bool) int.
// Why: the metric lines are formatted with %d, which cannot print a bool, so flags are converted first.
func boolNum(v bool) int {
	n := 0
	if v {
		n = 1
	}
	return n
}
// safe backslash-escapes every backslash and double quote in in; all other bytes, newlines included, pass through.
// Signature: safe(in string) string.
// Why: label values are written between double quotes, so these two characters must not appear unescaped.
// NOTE(review): the result is already escaped, so formatting it with fmt's %q verb escapes it a second time.
func safe(in string) string {
	return strings.NewReplacer(`\`, `\\`, `"`, `\"`).Replace(in)
}