151 lines
5.2 KiB
Go
151 lines
5.2 KiB
Go
package metrics
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Sample is one observation of a single UPS source. The Exporter keeps the
// most recent Sample per Name and renders it on the metrics endpoint.
type Sample struct {
	// Name identifies the UPS source. It is the key under which the sample
	// is stored and the value of the "source" label.
	Name string
	// Target is exported as the "target" label.
	Target string
	// OnBattery is exported as hecate_ups_on_battery (1 when true).
	OnBattery bool
	// LowBattery is exported as hecate_ups_low_battery (1 when true).
	LowBattery bool
	// RuntimeSecond is exported as hecate_ups_runtime_seconds.
	RuntimeSecond int
	// ThresholdSec is exported as hecate_ups_threshold_seconds.
	ThresholdSec int
	// Trigger is exported as hecate_ups_trigger_active (1 when true).
	Trigger bool
	// BreachCount is exported as hecate_ups_breach_count.
	BreachCount int
	// Status is exported as the "status" label.
	Status string
	// LastError, when non-empty, makes hecate_ups_error report 1.
	LastError string
	// UpdatedAt is exported as hecate_ups_last_sample_timestamp_seconds.
	// UpdateSample fills it with the current UTC time when it is zero.
	UpdatedAt time.Time
}
|
|
|
|
type Exporter struct {
|
|
mu sync.RWMutex
|
|
shutdownBudgetSec int
|
|
shutdownTriggers int
|
|
lastShutdownReason string
|
|
lastShutdownAt time.Time
|
|
samples map[string]Sample
|
|
}
|
|
|
|
func New() *Exporter {
|
|
return &Exporter{
|
|
samples: make(map[string]Sample),
|
|
}
|
|
}
|
|
|
|
func (e *Exporter) UpdateBudget(seconds int) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
e.shutdownBudgetSec = seconds
|
|
}
|
|
|
|
func (e *Exporter) UpdateSample(s Sample) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
if s.UpdatedAt.IsZero() {
|
|
s.UpdatedAt = time.Now().UTC()
|
|
}
|
|
e.samples[s.Name] = s
|
|
}
|
|
|
|
func (e *Exporter) MarkShutdown(reason string) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
e.shutdownTriggers++
|
|
e.lastShutdownReason = reason
|
|
e.lastShutdownAt = time.Now().UTC()
|
|
}
|
|
|
|
func (e *Exporter) Handler(path string) http.Handler {
|
|
mux := http.NewServeMux()
|
|
metricsPath := path
|
|
if metricsPath == "" {
|
|
metricsPath = "/metrics"
|
|
}
|
|
mux.HandleFunc(metricsPath, e.serveMetrics)
|
|
mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
_, _ = w.Write([]byte("ok\n"))
|
|
})
|
|
return mux
|
|
}
|
|
|
|
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
|
|
e.mu.RLock()
|
|
defer e.mu.RUnlock()
|
|
|
|
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
|
var b strings.Builder
|
|
b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
|
|
b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n")
|
|
b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
|
|
b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
|
|
b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n")
|
|
b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers))
|
|
b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
|
|
b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n")
|
|
if e.lastShutdownAt.IsZero() {
|
|
b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n")
|
|
} else {
|
|
b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
|
|
}
|
|
b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n")
|
|
b.WriteString("# TYPE hecate_ups_on_battery gauge\n")
|
|
b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n")
|
|
b.WriteString("# TYPE hecate_ups_low_battery gauge\n")
|
|
b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
|
|
b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n")
|
|
b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
|
|
b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n")
|
|
b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
|
|
b.WriteString("# TYPE hecate_ups_trigger_active gauge\n")
|
|
b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n")
|
|
b.WriteString("# TYPE hecate_ups_breach_count gauge\n")
|
|
b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
|
|
b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n")
|
|
b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n")
|
|
b.WriteString("# TYPE hecate_ups_error gauge\n")
|
|
|
|
names := make([]string, 0, len(e.samples))
|
|
for name := range e.samples {
|
|
names = append(names, name)
|
|
}
|
|
sort.Strings(names)
|
|
for _, name := range names {
|
|
s := e.samples[name]
|
|
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
|
|
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
|
|
b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
|
|
b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
|
|
b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
|
|
b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
|
|
b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
|
|
b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount))
|
|
if s.UpdatedAt.IsZero() {
|
|
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels))
|
|
} else {
|
|
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
|
|
}
|
|
b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
|
|
}
|
|
|
|
_, _ = w.Write([]byte(b.String()))
|
|
}
|
|
|
|
// boolNum converts a boolean into the numeric form used for metric values:
// 1 for true, 0 for false.
func boolNum(v bool) int {
	if v {
		return 1
	}
	return 0
}
|
|
|
|
// safe escapes in for use inside a double-quoted label value: every
// backslash becomes \\ and every double quote becomes \". Backslashes are
// handled first so the ones inserted in front of quotes are not doubled.
// No other character is changed; in particular line feeds pass through.
//
// NOTE(review): the result is already escaped, so formatting it with %q
// escapes it a second time.
func safe(in string) string {
	out := strings.ReplaceAll(in, `\`, `\\`)
	return strings.ReplaceAll(out, `"`, `\"`)
}
|