403 lines
18 KiB
Go
403 lines
18 KiB
Go
package metrics
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Sample is one UPS reading plus the shutdown-evaluation state that
// serveMetrics exports for that source.
type Sample struct {
	// Name keys the sample inside Exporter and is exported as the "source"
	// label; Target is exported as the "target" label.
	Name, Target string
	// OnBattery and LowBattery back the ananke_ups_on_battery and
	// ananke_ups_low_battery gauges.
	OnBattery, LowBattery bool
	// RuntimeSecond is the remaining battery runtime, exported as
	// ananke_ups_runtime_seconds.
	RuntimeSecond int
	// BatteryCharge and LoadPercent are exported as percentages; PowerNominalW
	// is exported as the nominal power rating in watts.
	BatteryCharge, LoadPercent, PowerNominalW float64
	// ThresholdSec is the red-line runtime threshold exported as
	// ananke_ups_threshold_seconds.
	ThresholdSec int
	// Trigger backs ananke_ups_trigger_active.
	Trigger bool
	// BreachCount backs ananke_ups_breach_count.
	BreachCount int
	// Status is exported as the "status" label; a non-empty LastError makes
	// ananke_ups_error report 1.
	Status, LastError string
	// UpdatedAt is when the sample was taken; UpdateSample fills it in when it
	// is left zero.
	UpdatedAt time.Time
}
|
|
|
|
type GitOpsSnapshot struct {
|
|
UpdatedAt time.Time
|
|
ScrapeSuccess bool
|
|
Errors map[string]string
|
|
FluxSources []GitOpsFluxSource
|
|
Kustomizations []GitOpsKustomization
|
|
HelmReleases []GitOpsHelmRelease
|
|
}
|
|
|
|
// GitOpsFluxSource describes one Flux GitRepository source as exported by the
// ananke_gitops_flux_source_* metrics.
type GitOpsFluxSource struct {
	// Namespace and Name identify the object; URL, Branch and Revision are
	// exported as labels on the info metric.
	Namespace, Name, URL, Branch, Revision string
	// Ready backs the ready gauge and the "ready" info label.
	Ready bool
	// Reason is exported as the "reason" info label ("unknown" when blank).
	Reason string
	// Suspended backs the suspended gauge.
	Suspended bool
}
|
|
|
|
// GitOpsKustomization describes one Flux Kustomization as exported by the
// ananke_gitops_kustomization_* metrics.
type GitOpsKustomization struct {
	// Namespace and Name identify the object; Path is exported as the "path"
	// info label.
	Namespace, Name, Path string
	// SourceNamespace and SourceName are exported as the "source_namespace"
	// and "source_name" info labels.
	SourceNamespace, SourceName string
	// Revision is exported as the "revision" info label.
	Revision string
	// Ready backs the ready gauge and the "ready" info label.
	Ready bool
	// Reason is exported as the "reason" info label ("unknown" when blank).
	Reason string
	// Suspended backs the suspended gauge.
	Suspended bool
}
|
|
|
|
// GitOpsHelmRelease describes one Flux HelmRelease as exported by the
// ananke_gitops_helmrelease_* metrics.
type GitOpsHelmRelease struct {
	// Namespace and Name identify the object.
	Namespace, Name string
	// Chart, Version, AppVersion and Revision are exported as the "chart",
	// "version", "app_version" and "revision" info labels.
	Chart, Version, AppVersion, Revision string
	// Ready backs the ready gauge and the "ready" info label.
	Ready bool
	// Reason is exported as the "reason" info label ("unknown" when blank).
	Reason string
	// Suspended backs the suspended gauge.
	Suspended bool
}
|
|
|
|
type Exporter struct {
|
|
mu sync.RWMutex
|
|
shutdownBudgetSec int
|
|
shutdownTriggers int
|
|
lastShutdownReason string
|
|
lastShutdownAt time.Time
|
|
samples map[string]Sample
|
|
gitops GitOpsSnapshot
|
|
}
|
|
|
|
// New runs one orchestration or CLI step.
|
|
// Signature: New() *Exporter.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func New() *Exporter {
|
|
return &Exporter{
|
|
samples: make(map[string]Sample),
|
|
}
|
|
}
|
|
|
|
// UpdateBudget runs one orchestration or CLI step.
|
|
// Signature: (e *Exporter) UpdateBudget(seconds int).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (e *Exporter) UpdateBudget(seconds int) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
e.shutdownBudgetSec = seconds
|
|
}
|
|
|
|
// UpdateSample runs one orchestration or CLI step.
|
|
// Signature: (e *Exporter) UpdateSample(s Sample).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (e *Exporter) UpdateSample(s Sample) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
if s.UpdatedAt.IsZero() {
|
|
s.UpdatedAt = time.Now().UTC()
|
|
}
|
|
e.samples[s.Name] = s
|
|
}
|
|
|
|
// UpdateGitOpsSnapshot records the most recent Flux object-state scrape.
|
|
// Signature: (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot).
|
|
// Why: Grafana needs object readiness and branch/revision state, while Flux's
|
|
// controller metrics only expose controller health in this cluster.
|
|
func (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
if snapshot.UpdatedAt.IsZero() {
|
|
snapshot.UpdatedAt = time.Now().UTC()
|
|
}
|
|
if snapshot.Errors == nil {
|
|
snapshot.Errors = map[string]string{}
|
|
}
|
|
e.gitops = snapshot
|
|
}
|
|
|
|
// MarkShutdown runs one orchestration or CLI step.
|
|
// Signature: (e *Exporter) MarkShutdown(reason string).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (e *Exporter) MarkShutdown(reason string) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
e.shutdownTriggers++
|
|
e.lastShutdownReason = reason
|
|
e.lastShutdownAt = time.Now().UTC()
|
|
}
|
|
|
|
// Handler runs one orchestration or CLI step.
|
|
// Signature: (e *Exporter) Handler(path string) http.Handler.
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (e *Exporter) Handler(path string) http.Handler {
|
|
mux := http.NewServeMux()
|
|
metricsPath := path
|
|
if metricsPath == "" {
|
|
metricsPath = "/metrics"
|
|
}
|
|
mux.HandleFunc(metricsPath, e.serveMetrics)
|
|
mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
_, _ = w.Write([]byte("ok\n"))
|
|
})
|
|
return mux
|
|
}
|
|
|
|
// serveMetrics runs one orchestration or CLI step.
|
|
// Signature: (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request).
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
|
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
|
|
e.mu.RLock()
|
|
defer e.mu.RUnlock()
|
|
|
|
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
|
var b strings.Builder
|
|
b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
|
|
b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n")
|
|
b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
|
|
b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
|
|
b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n")
|
|
b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers))
|
|
b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
|
|
b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n")
|
|
if e.lastShutdownAt.IsZero() {
|
|
b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n")
|
|
} else {
|
|
b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
|
|
}
|
|
b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n")
|
|
b.WriteString("# TYPE ananke_ups_on_battery gauge\n")
|
|
b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n")
|
|
b.WriteString("# TYPE ananke_ups_low_battery gauge\n")
|
|
b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
|
|
b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n")
|
|
b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
|
|
b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n")
|
|
b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n")
|
|
b.WriteString("# TYPE ananke_ups_load_percent gauge\n")
|
|
b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n")
|
|
b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n")
|
|
b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
|
|
b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n")
|
|
b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
|
|
b.WriteString("# TYPE ananke_ups_trigger_active gauge\n")
|
|
b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n")
|
|
b.WriteString("# TYPE ananke_ups_breach_count gauge\n")
|
|
b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
|
|
b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n")
|
|
b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n")
|
|
b.WriteString("# TYPE ananke_ups_error gauge\n")
|
|
|
|
names := make([]string, 0, len(e.samples))
|
|
for name := range e.samples {
|
|
names = append(names, name)
|
|
}
|
|
sort.Strings(names)
|
|
for _, name := range names {
|
|
s := e.samples[name]
|
|
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
|
|
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
|
|
b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, s.BreachCount))
|
|
if s.UpdatedAt.IsZero() {
|
|
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels))
|
|
} else {
|
|
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
|
|
}
|
|
b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
|
|
}
|
|
appendGitOpsMetrics(&b, e.gitops)
|
|
appendQualityGateMetrics(&b)
|
|
|
|
_, _ = w.Write([]byte(b.String()))
|
|
}
|
|
|
|
// appendGitOpsMetrics writes Flux object-state metrics collected by the
|
|
// long-running daemon loop.
|
|
// Signature: appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot).
|
|
// Why: this keeps the expensive Kubernetes API reads out of the HTTP scrape path
|
|
// while still making current GitOps health cheap for Grafana to query.
|
|
func appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot) {
|
|
if dst.Len() > 0 {
|
|
dst.WriteString("\n")
|
|
}
|
|
dst.WriteString("# HELP ananke_gitops_last_scrape_timestamp_seconds Unix timestamp of the latest GitOps object-state scrape.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_last_scrape_timestamp_seconds gauge\n")
|
|
if snapshot.UpdatedAt.IsZero() {
|
|
dst.WriteString("ananke_gitops_last_scrape_timestamp_seconds 0\n")
|
|
} else {
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_last_scrape_timestamp_seconds %d\n", snapshot.UpdatedAt.Unix()))
|
|
}
|
|
dst.WriteString("# HELP ananke_gitops_scrape_success Whether the latest GitOps object-state scrape completed without errors.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_scrape_success gauge\n")
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_success %d\n", boolNum(snapshot.ScrapeSuccess)))
|
|
dst.WriteString("# HELP ananke_gitops_scrape_error Whether a GitOps resource family failed during the latest scrape.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_scrape_error gauge\n")
|
|
resources := []string{"gitrepository", "kustomization", "helmrelease"}
|
|
for _, resource := range resources {
|
|
_, failed := snapshot.Errors[resource]
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_error{resource=%q} %d\n", resource, boolNum(failed)))
|
|
}
|
|
|
|
dst.WriteString("# HELP ananke_gitops_flux_source_info Current Flux GitRepository source metadata.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_flux_source_info gauge\n")
|
|
dst.WriteString("# HELP ananke_gitops_flux_source_ready Whether a Flux GitRepository source is Ready.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_flux_source_ready gauge\n")
|
|
dst.WriteString("# HELP ananke_gitops_flux_source_suspended Whether a Flux GitRepository source is suspended.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_flux_source_suspended gauge\n")
|
|
sort.Slice(snapshot.FluxSources, func(i, j int) bool {
|
|
return snapshot.FluxSources[i].Namespace+"/"+snapshot.FluxSources[i].Name < snapshot.FluxSources[j].Namespace+"/"+snapshot.FluxSources[j].Name
|
|
})
|
|
for _, source := range snapshot.FluxSources {
|
|
infoLabels := fmt.Sprintf("{namespace=%q,name=%q,url=%q,branch=%q,revision=%q,ready=%q,reason=%q}",
|
|
safe(source.Namespace), safe(source.Name), safe(source.URL), safe(source.Branch),
|
|
safe(source.Revision), readyLabel(source.Ready), safe(defaultLabel(source.Reason, "unknown")))
|
|
baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(source.Namespace), safe(source.Name))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_info%s 1\n", infoLabels))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_ready%s %d\n", baseLabels, boolNum(source.Ready)))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_suspended%s %d\n", baseLabels, boolNum(source.Suspended)))
|
|
}
|
|
|
|
dst.WriteString("# HELP ananke_gitops_kustomization_info Current Flux Kustomization metadata.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_kustomization_info gauge\n")
|
|
dst.WriteString("# HELP ananke_gitops_kustomization_ready Whether a Flux Kustomization is Ready.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_kustomization_ready gauge\n")
|
|
dst.WriteString("# HELP ananke_gitops_kustomization_suspended Whether a Flux Kustomization is suspended.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_kustomization_suspended gauge\n")
|
|
sort.Slice(snapshot.Kustomizations, func(i, j int) bool {
|
|
return snapshot.Kustomizations[i].Namespace+"/"+snapshot.Kustomizations[i].Name < snapshot.Kustomizations[j].Namespace+"/"+snapshot.Kustomizations[j].Name
|
|
})
|
|
for _, kustomization := range snapshot.Kustomizations {
|
|
infoLabels := fmt.Sprintf("{namespace=%q,name=%q,path=%q,source_namespace=%q,source_name=%q,revision=%q,ready=%q,reason=%q}",
|
|
safe(kustomization.Namespace), safe(kustomization.Name), safe(kustomization.Path),
|
|
safe(kustomization.SourceNamespace), safe(kustomization.SourceName), safe(kustomization.Revision),
|
|
readyLabel(kustomization.Ready), safe(defaultLabel(kustomization.Reason, "unknown")))
|
|
baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(kustomization.Namespace), safe(kustomization.Name))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_info%s 1\n", infoLabels))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_ready%s %d\n", baseLabels, boolNum(kustomization.Ready)))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_suspended%s %d\n", baseLabels, boolNum(kustomization.Suspended)))
|
|
}
|
|
|
|
dst.WriteString("# HELP ananke_gitops_helmrelease_info Current Flux HelmRelease metadata.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_helmrelease_info gauge\n")
|
|
dst.WriteString("# HELP ananke_gitops_helmrelease_ready Whether a Flux HelmRelease is Ready.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_helmrelease_ready gauge\n")
|
|
dst.WriteString("# HELP ananke_gitops_helmrelease_suspended Whether a Flux HelmRelease is suspended.\n")
|
|
dst.WriteString("# TYPE ananke_gitops_helmrelease_suspended gauge\n")
|
|
sort.Slice(snapshot.HelmReleases, func(i, j int) bool {
|
|
return snapshot.HelmReleases[i].Namespace+"/"+snapshot.HelmReleases[i].Name < snapshot.HelmReleases[j].Namespace+"/"+snapshot.HelmReleases[j].Name
|
|
})
|
|
for _, release := range snapshot.HelmReleases {
|
|
infoLabels := fmt.Sprintf("{namespace=%q,name=%q,chart=%q,version=%q,app_version=%q,revision=%q,ready=%q,reason=%q}",
|
|
safe(release.Namespace), safe(release.Name), safe(release.Chart), safe(release.Version),
|
|
safe(release.AppVersion), safe(release.Revision), readyLabel(release.Ready),
|
|
safe(defaultLabel(release.Reason, "unknown")))
|
|
baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(release.Namespace), safe(release.Name))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_info%s 1\n", infoLabels))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_ready%s %d\n", baseLabels, boolNum(release.Ready)))
|
|
dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_suspended%s %d\n", baseLabels, boolNum(release.Suspended)))
|
|
}
|
|
}
|
|
|
|
// appendQualityGateMetrics runs one orchestration or CLI step.
|
|
// Signature: appendQualityGateMetrics(dst *strings.Builder).
|
|
// Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
|
|
// Grafana can track Ananke suite health over time.
|
|
func appendQualityGateMetrics(dst *strings.Builder) {
|
|
path := strings.TrimSpace(os.Getenv("ANANKE_QUALITY_METRICS_FILE"))
|
|
if path == "" {
|
|
path = "/var/lib/ananke/quality-gate.prom"
|
|
}
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
appendDefaultQualityGateMetrics(dst)
|
|
return
|
|
}
|
|
text := strings.TrimSpace(string(raw))
|
|
if text == "" {
|
|
appendDefaultQualityGateMetrics(dst)
|
|
return
|
|
}
|
|
if dst.Len() > 0 {
|
|
dst.WriteString("\n")
|
|
}
|
|
dst.WriteString(text)
|
|
if !strings.HasSuffix(text, "\n") {
|
|
dst.WriteString("\n")
|
|
}
|
|
}
|
|
|
|
// appendDefaultQualityGateMetrics appends zero-valued quality-gate counters to
// dst, preceded by a blank line when dst already has content.
// Signature: appendDefaultQualityGateMetrics(dst *strings.Builder).
// Why: Grafana should always have baseline Ananke quality-gate series so
// overview panels never show "no data" before the first host-side gate run.
func appendDefaultQualityGateMetrics(dst *strings.Builder) {
	const baseline = "# HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.\n" +
		"# TYPE ananke_quality_gate_runs_total counter\n" +
		"ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"} 0\n" +
		"ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"} 0\n"

	if dst.Len() != 0 {
		dst.WriteString("\n")
	}
	dst.WriteString(baseline)
}
|
|
|
|
// boolNum converts a boolean to 1 (true) or 0 (false).
// Signature: boolNum(v bool) int.
// Why: Prometheus samples are numeric, so boolean state is exported as 0/1.
func boolNum(v bool) int {
	n := 0
	if v {
		n = 1
	}
	return n
}
|
|
|
|
// safe escapes backslashes and double quotes in a string with a leading
// backslash; every other character is returned unchanged.
// Signature: safe(in string) string.
// Why: label values are embedded between double quotes in the metrics output,
// so these two characters must not appear unescaped.
func safe(in string) string {
	// Single pass: each backslash is doubled and each quote gains a backslash,
	// which matches escaping backslashes first and quotes second.
	return strings.NewReplacer(`\`, `\\`, `"`, `\"`).Replace(in)
}
|
|
|
|
// readyLabel returns "true" when ready is set and "false" otherwise.
// Signature: readyLabel(ready bool) string.
// Why: info metrics need a human-readable status label without changing the
// numeric ready gauges used for alerting and aggregation.
func readyLabel(ready bool) string {
	label := "false"
	if ready {
		label = "true"
	}
	return label
}
|
|
|
|
// defaultLabel returns value with surrounding whitespace removed, or fallback
// when nothing is left after trimming.
// Signature: defaultLabel(value string, fallback string) string.
// Why: Flux status fields can be absent during startup, but Prometheus labels
// should remain explicit rather than silently becoming empty strings.
func defaultLabel(value string, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}
|