// Package metrics renders UPS, shutdown, and GitOps state as Prometheus
// text-format metrics and serves them over HTTP.
package metrics

import (
	"fmt"
	"net/http"
	"os"
	"sort"
	"strings"
	"sync"
	"time"
)

// Sample is one UPS reading as exported through the ananke_ups_* metrics.
type Sample struct {
	// Name identifies the UPS source; UpdateSample stores the sample under it.
	Name string
	// Target is exported as the "target" label.
	Target string
	// OnBattery reports whether the UPS source is currently on battery.
	OnBattery bool
	// LowBattery reports whether the UPS source currently reports low battery.
	LowBattery bool
	// RuntimeSecond is the battery runtime remaining reported by the UPS, in seconds.
	RuntimeSecond int
	// BatteryCharge is the battery charge percentage reported by the UPS.
	BatteryCharge float64
	// LoadPercent is the UPS output load percentage.
	LoadPercent float64
	// PowerNominalW is the UPS nominal power rating in watts.
	PowerNominalW float64
	// ThresholdSec is the red-line threshold for runtime-based shutdown, in seconds.
	ThresholdSec int
	// Trigger reports whether this source currently breaches shutdown trigger conditions.
	Trigger bool
	// BreachCount is the current debounce breach count for this source.
	BreachCount int
	// Status is exported as the "status" label.
	Status string
	// LastError is the error of the last sample; any non-empty value sets
	// ananke_ups_error to 1. The text itself is not exported.
	LastError string
	// UpdatedAt is the sample time; UpdateSample fills in the current UTC time
	// when it is zero.
	UpdatedAt time.Time
}

// GitOpsSnapshot is the result of one GitOps object-state scrape as exported
// through the ananke_gitops_* metrics.
type GitOpsSnapshot struct {
	// UpdatedAt is the scrape time; UpdateGitOpsSnapshot fills in the current
	// UTC time when it is zero.
	UpdatedAt time.Time
	// ScrapeSuccess reports whether the scrape completed without errors.
	ScrapeSuccess bool
	// Errors marks failed resource families. The metrics look up the keys
	// "gitrepository", "kustomization", and "helmrelease"; only key presence
	// is exported, not the message.
	Errors map[string]string
	// FluxSources lists the Flux GitRepository sources seen by the scrape.
	FluxSources []GitOpsFluxSource
	// Kustomizations lists the Flux Kustomizations seen by the scrape.
	Kustomizations []GitOpsKustomization
	// HelmReleases lists the Flux HelmReleases seen by the scrape.
	HelmReleases []GitOpsHelmRelease
}

// GitOpsFluxSource describes one Flux GitRepository source. Every string field
// becomes a label on ananke_gitops_flux_source_info; Ready and Suspended are
// also exported as their own 0/1 gauges.
type GitOpsFluxSource struct {
	Namespace string
	Name      string
	URL       string
	Branch    string
	Revision  string
	Ready     bool
	Reason    string
	Suspended bool
}

// GitOpsKustomization describes one Flux Kustomization. Every string field
// becomes a label on ananke_gitops_kustomization_info; Ready and Suspended are
// also exported as their own 0/1 gauges.
type GitOpsKustomization struct {
	Namespace       string
	Name            string
	Path            string
	SourceNamespace string
	SourceName      string
	Revision        string
	Ready           bool
	Reason          string
	Suspended       bool
}

// GitOpsHelmRelease describes one Flux HelmRelease. Every string field becomes
// a label on ananke_gitops_helmrelease_info; Ready and Suspended are also
// exported as their own 0/1 gauges.
type GitOpsHelmRelease struct {
	Namespace  string
	Name       string
	Chart      string
	Version    string
	AppVersion string
	Revision   string
	Ready      bool
	Reason     string
	Suspended  bool
}

// Exporter holds the state rendered by the metrics endpoint. It contains a
// mutex and must be shared by pointer, never copied.
type Exporter struct {
	// mu guards every field below.
	mu sync.RWMutex
	// shutdownBudgetSec backs ananke_shutdown_budget_seconds.
	shutdownBudgetSec int
	// shutdownTriggers backs ananke_shutdown_triggers_total.
	shutdownTriggers int
	// lastShutdownReason is attached to every UPS series as "last_reason".
	lastShutdownReason string
	// lastShutdownAt backs ananke_shutdown_last_trigger_timestamp_seconds.
	lastShutdownAt time.Time
	// samples holds the latest Sample per source name.
	samples map[string]Sample
	// gitops is the most recent snapshot passed to UpdateGitOpsSnapshot.
	gitops GitOpsSnapshot
}

// New returns an Exporter with an empty, ready-to-use sample map.
// Signature: New() *Exporter.
// Why: allocating the map up front lets callers record samples immediately.
func New() *Exporter {
	return &Exporter{
		samples: make(map[string]Sample),
	}
}

// UpdateBudget stores the shutdown budget, in seconds, that is exported as
// ananke_shutdown_budget_seconds.
// Signature: (e *Exporter) UpdateBudget(seconds int).
// Why: serveMetrics reads the value under the same mutex, so the write must
// hold the lock.
func (e *Exporter) UpdateBudget(seconds int) {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.shutdownBudgetSec = seconds
}

// UpdateSample stores s as the latest sample for s.Name, replacing any earlier
// sample recorded under that name.
// Signature: (e *Exporter) UpdateSample(s Sample).
// Why: a zero UpdatedAt is replaced with the current UTC time so every stored
// sample carries a timestamp.
func (e *Exporter) UpdateSample(s Sample) { e.mu.Lock() defer e.mu.Unlock() if s.UpdatedAt.IsZero() { s.UpdatedAt = time.Now().UTC() } e.samples[s.Name] = s } // UpdateGitOpsSnapshot records the most recent Flux object-state scrape. // Signature: (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot). // Why: Grafana needs object readiness and branch/revision state, while Flux's // controller metrics only expose controller health in this cluster. func (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot) { e.mu.Lock() defer e.mu.Unlock() if snapshot.UpdatedAt.IsZero() { snapshot.UpdatedAt = time.Now().UTC() } if snapshot.Errors == nil { snapshot.Errors = map[string]string{} } e.gitops = snapshot } // MarkShutdown runs one orchestration or CLI step. // Signature: (e *Exporter) MarkShutdown(reason string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (e *Exporter) MarkShutdown(reason string) { e.mu.Lock() defer e.mu.Unlock() e.shutdownTriggers++ e.lastShutdownReason = reason e.lastShutdownAt = time.Now().UTC() } // Handler runs one orchestration or CLI step. // Signature: (e *Exporter) Handler(path string) http.Handler. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (e *Exporter) Handler(path string) http.Handler { mux := http.NewServeMux() metricsPath := path if metricsPath == "" { metricsPath = "/metrics" } mux.HandleFunc(metricsPath, e.serveMetrics) mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte("ok\n")) }) return mux } // serveMetrics runs one orchestration or CLI step. // Signature: (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) { e.mu.RLock() defer e.mu.RUnlock() w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") var b strings.Builder b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n") b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n") b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)) b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n") b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n") b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers)) b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n") b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n") if e.lastShutdownAt.IsZero() { b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n") } else { b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix())) } b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n") b.WriteString("# TYPE ananke_ups_on_battery gauge\n") b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n") b.WriteString("# TYPE ananke_ups_low_battery gauge\n") b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n") b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n") b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n") b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n") b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n") b.WriteString("# TYPE ananke_ups_load_percent gauge\n") b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n") 
b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n") b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n") b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n") b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n") b.WriteString("# TYPE ananke_ups_trigger_active gauge\n") b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n") b.WriteString("# TYPE ananke_ups_breach_count gauge\n") b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n") b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n") b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n") b.WriteString("# TYPE ananke_ups_error gauge\n") names := make([]string, 0, len(e.samples)) for name := range e.samples { names = append(names, name) } sort.Strings(names) for _, name := range names { s := e.samples[name] labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}", safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason)) b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))) b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))) b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)) b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge)) b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent)) b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW)) b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)) b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))) b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, 
s.BreachCount)) if s.UpdatedAt.IsZero() { b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels)) } else { b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix())) } b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != ""))) } appendGitOpsMetrics(&b, e.gitops) appendQualityGateMetrics(&b) _, _ = w.Write([]byte(b.String())) } // appendGitOpsMetrics writes Flux object-state metrics collected by the // long-running daemon loop. // Signature: appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot). // Why: this keeps the expensive Kubernetes API reads out of the HTTP scrape path // while still making current GitOps health cheap for Grafana to query. func appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot) { if dst.Len() > 0 { dst.WriteString("\n") } dst.WriteString("# HELP ananke_gitops_last_scrape_timestamp_seconds Unix timestamp of the latest GitOps object-state scrape.\n") dst.WriteString("# TYPE ananke_gitops_last_scrape_timestamp_seconds gauge\n") if snapshot.UpdatedAt.IsZero() { dst.WriteString("ananke_gitops_last_scrape_timestamp_seconds 0\n") } else { dst.WriteString(fmt.Sprintf("ananke_gitops_last_scrape_timestamp_seconds %d\n", snapshot.UpdatedAt.Unix())) } dst.WriteString("# HELP ananke_gitops_scrape_success Whether the latest GitOps object-state scrape completed without errors.\n") dst.WriteString("# TYPE ananke_gitops_scrape_success gauge\n") dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_success %d\n", boolNum(snapshot.ScrapeSuccess))) dst.WriteString("# HELP ananke_gitops_scrape_error Whether a GitOps resource family failed during the latest scrape.\n") dst.WriteString("# TYPE ananke_gitops_scrape_error gauge\n") resources := []string{"gitrepository", "kustomization", "helmrelease"} for _, resource := range resources { _, failed := snapshot.Errors[resource] 
dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_error{resource=%q} %d\n", resource, boolNum(failed))) } dst.WriteString("# HELP ananke_gitops_flux_source_info Current Flux GitRepository source metadata.\n") dst.WriteString("# TYPE ananke_gitops_flux_source_info gauge\n") dst.WriteString("# HELP ananke_gitops_flux_source_ready Whether a Flux GitRepository source is Ready.\n") dst.WriteString("# TYPE ananke_gitops_flux_source_ready gauge\n") dst.WriteString("# HELP ananke_gitops_flux_source_suspended Whether a Flux GitRepository source is suspended.\n") dst.WriteString("# TYPE ananke_gitops_flux_source_suspended gauge\n") sort.Slice(snapshot.FluxSources, func(i, j int) bool { return snapshot.FluxSources[i].Namespace+"/"+snapshot.FluxSources[i].Name < snapshot.FluxSources[j].Namespace+"/"+snapshot.FluxSources[j].Name }) for _, source := range snapshot.FluxSources { infoLabels := fmt.Sprintf("{namespace=%q,name=%q,url=%q,branch=%q,revision=%q,ready=%q,reason=%q}", safe(source.Namespace), safe(source.Name), safe(source.URL), safe(source.Branch), safe(source.Revision), readyLabel(source.Ready), safe(defaultLabel(source.Reason, "unknown"))) baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(source.Namespace), safe(source.Name)) dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_info%s 1\n", infoLabels)) dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_ready%s %d\n", baseLabels, boolNum(source.Ready))) dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_suspended%s %d\n", baseLabels, boolNum(source.Suspended))) } dst.WriteString("# HELP ananke_gitops_kustomization_info Current Flux Kustomization metadata.\n") dst.WriteString("# TYPE ananke_gitops_kustomization_info gauge\n") dst.WriteString("# HELP ananke_gitops_kustomization_ready Whether a Flux Kustomization is Ready.\n") dst.WriteString("# TYPE ananke_gitops_kustomization_ready gauge\n") dst.WriteString("# HELP ananke_gitops_kustomization_suspended Whether a Flux Kustomization is suspended.\n") 
dst.WriteString("# TYPE ananke_gitops_kustomization_suspended gauge\n") sort.Slice(snapshot.Kustomizations, func(i, j int) bool { return snapshot.Kustomizations[i].Namespace+"/"+snapshot.Kustomizations[i].Name < snapshot.Kustomizations[j].Namespace+"/"+snapshot.Kustomizations[j].Name }) for _, kustomization := range snapshot.Kustomizations { infoLabels := fmt.Sprintf("{namespace=%q,name=%q,path=%q,source_namespace=%q,source_name=%q,revision=%q,ready=%q,reason=%q}", safe(kustomization.Namespace), safe(kustomization.Name), safe(kustomization.Path), safe(kustomization.SourceNamespace), safe(kustomization.SourceName), safe(kustomization.Revision), readyLabel(kustomization.Ready), safe(defaultLabel(kustomization.Reason, "unknown"))) baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(kustomization.Namespace), safe(kustomization.Name)) dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_info%s 1\n", infoLabels)) dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_ready%s %d\n", baseLabels, boolNum(kustomization.Ready))) dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_suspended%s %d\n", baseLabels, boolNum(kustomization.Suspended))) } dst.WriteString("# HELP ananke_gitops_helmrelease_info Current Flux HelmRelease metadata.\n") dst.WriteString("# TYPE ananke_gitops_helmrelease_info gauge\n") dst.WriteString("# HELP ananke_gitops_helmrelease_ready Whether a Flux HelmRelease is Ready.\n") dst.WriteString("# TYPE ananke_gitops_helmrelease_ready gauge\n") dst.WriteString("# HELP ananke_gitops_helmrelease_suspended Whether a Flux HelmRelease is suspended.\n") dst.WriteString("# TYPE ananke_gitops_helmrelease_suspended gauge\n") sort.Slice(snapshot.HelmReleases, func(i, j int) bool { return snapshot.HelmReleases[i].Namespace+"/"+snapshot.HelmReleases[i].Name < snapshot.HelmReleases[j].Namespace+"/"+snapshot.HelmReleases[j].Name }) for _, release := range snapshot.HelmReleases { infoLabels := 
fmt.Sprintf("{namespace=%q,name=%q,chart=%q,version=%q,app_version=%q,revision=%q,ready=%q,reason=%q}", safe(release.Namespace), safe(release.Name), safe(release.Chart), safe(release.Version), safe(release.AppVersion), safe(release.Revision), readyLabel(release.Ready), safe(defaultLabel(release.Reason, "unknown"))) baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(release.Namespace), safe(release.Name)) dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_info%s 1\n", infoLabels)) dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_ready%s %d\n", baseLabels, boolNum(release.Ready))) dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_suspended%s %d\n", baseLabels, boolNum(release.Suspended))) } } // appendQualityGateMetrics runs one orchestration or CLI step. // Signature: appendQualityGateMetrics(dst *strings.Builder). // Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so // Grafana can track Ananke suite health over time. func appendQualityGateMetrics(dst *strings.Builder) { path := strings.TrimSpace(os.Getenv("ANANKE_QUALITY_METRICS_FILE")) if path == "" { path = "/var/lib/ananke/quality-gate.prom" } raw, err := os.ReadFile(path) if err != nil { appendDefaultQualityGateMetrics(dst) return } text := strings.TrimSpace(string(raw)) if text == "" { appendDefaultQualityGateMetrics(dst) return } if dst.Len() > 0 { dst.WriteString("\n") } dst.WriteString(text) if !strings.HasSuffix(text, "\n") { dst.WriteString("\n") } } // appendDefaultQualityGateMetrics runs one orchestration or CLI step. // Signature: appendDefaultQualityGateMetrics(dst *strings.Builder). // Why: Grafana should always have baseline Ananke quality-gate series so // overview panels never show "no data" before the first host-side gate run. 
func appendDefaultQualityGateMetrics(dst *strings.Builder) {
	// The baseline is fixed text, so it is kept as a single constant.
	const baseline = "# HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.\n" +
		"# TYPE ananke_quality_gate_runs_total counter\n" +
		"ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"} 0\n" +
		"ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"} 0\n"

	// A blank line separates this section from anything already written.
	if dst.Len() != 0 {
		dst.WriteString("\n")
	}
	dst.WriteString(baseline)
}

// boolNum converts a boolean to the 0/1 value used for gauge samples.
// Signature: boolNum(v bool) int.
// Why: Prometheus samples are numeric, so flags are exported as 0 or 1.
func boolNum(v bool) int {
	n := 0
	if v {
		n = 1
	}
	return n
}

// safe returns in with every backslash and double quote preceded by a
// backslash. All other bytes are copied unchanged.
// Signature: safe(in string) string.
// Why: both characters are single ASCII bytes and never occur inside a
// multi-byte UTF-8 sequence, so one byte-wise pass is sufficient.
func safe(in string) string {
	var out strings.Builder
	out.Grow(len(in))
	for i := 0; i < len(in); i++ {
		c := in[i]
		if c == '\\' || c == '"' {
			out.WriteByte('\\')
		}
		out.WriteByte(c)
	}
	return out.String()
}

// readyLabel formats a boolean readiness state as "true" or "false".
// Signature: readyLabel(ready bool) string.
// Why: info metrics need a human-readable status label without changing the
// numeric ready gauges used for alerting and aggregation.
func readyLabel(ready bool) string {
	label := "false"
	if ready {
		label = "true"
	}
	return label
}

// defaultLabel returns value with surrounding whitespace removed, or fallback
// when nothing remains after trimming.
// Signature: defaultLabel(value string, fallback string) string.
// Why: Flux status fields can be absent during startup, but Prometheus labels
// should remain explicit rather than silently becoming empty strings.
func defaultLabel(value string, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}