monitoring: export gitops state from ananke

This commit is contained in:
codex 2026-05-15 19:36:58 -03:00
parent 3cc980795a
commit 087728d481
17 changed files with 1117 additions and 11 deletions

4
Jenkinsfile vendored
View File

@ -311,6 +311,10 @@ PY
import re
from pathlib import Path
coverage_path = Path("build/coverage-percent.txt")
if coverage_path.exists():
print(coverage_path.read_text(encoding="utf-8", errors="ignore").strip() or "0.0")
raise SystemExit(0)
log_path = Path("build/quality-gate.out")
text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]

View File

@ -197,6 +197,9 @@ metrics:
enabled: true
bind_addr: 0.0.0.0:9560
path: /metrics
gitops:
enabled: true
poll_seconds: 60
state:
dir: /var/lib/ananke
reports_dir: /var/lib/ananke/reports

View File

@ -329,6 +329,9 @@ metrics:
enabled: true
bind_addr: 0.0.0.0:9560
path: /metrics
gitops:
enabled: true
poll_seconds: 60
state:
dir: /var/lib/ananke
reports_dir: /var/lib/ananke/reports

View File

@ -329,6 +329,9 @@ metrics:
enabled: true
bind_addr: 0.0.0.0:9560
path: /metrics
gitops:
enabled: true
poll_seconds: 60
state:
dir: /var/lib/ananke
reports_dir: /var/lib/ananke/reports

View File

@ -270,6 +270,9 @@ func (c *Config) applyDefaults() {
if c.Metrics.Path == "" {
c.Metrics.Path = "/metrics"
}
if c.GitOps.PollSeconds <= 0 {
c.GitOps.PollSeconds = 60
}
if c.State.Dir == "" {
c.State.Dir = "/var/lib/ananke"
}

View File

@ -156,6 +156,10 @@ func defaults() Config {
BindAddr: "0.0.0.0:9560",
Path: "/metrics",
},
GitOps: GitOps{
Enabled: true,
PollSeconds: 60,
},
State: State{
Dir: "/var/lib/ananke",
ReportsDir: "/var/lib/ananke/reports",

View File

@ -23,6 +23,7 @@ type Config struct {
UPS UPS `yaml:"ups"`
Coordination Coordination `yaml:"coordination"`
Metrics Metrics `yaml:"metrics"`
GitOps GitOps `yaml:"gitops"`
State State `yaml:"state"`
}
@ -174,6 +175,11 @@ type Metrics struct {
Path string `yaml:"path"`
}
type GitOps struct {
Enabled bool `yaml:"enabled"`
PollSeconds int `yaml:"poll_seconds"`
}
type State struct {
Dir string `yaml:"dir"`
ReportsDir string `yaml:"reports_dir"`

View File

@ -27,6 +27,50 @@ type Sample struct {
UpdatedAt time.Time
}
type GitOpsSnapshot struct {
UpdatedAt time.Time
ScrapeSuccess bool
Errors map[string]string
FluxSources []GitOpsFluxSource
Kustomizations []GitOpsKustomization
HelmReleases []GitOpsHelmRelease
}
type GitOpsFluxSource struct {
Namespace string
Name string
URL string
Branch string
Revision string
Ready bool
Reason string
Suspended bool
}
type GitOpsKustomization struct {
Namespace string
Name string
Path string
SourceNamespace string
SourceName string
Revision string
Ready bool
Reason string
Suspended bool
}
type GitOpsHelmRelease struct {
Namespace string
Name string
Chart string
Version string
AppVersion string
Revision string
Ready bool
Reason string
Suspended bool
}
type Exporter struct {
mu sync.RWMutex
shutdownBudgetSec int
@ -34,6 +78,7 @@ type Exporter struct {
lastShutdownReason string
lastShutdownAt time.Time
samples map[string]Sample
gitops GitOpsSnapshot
}
// New runs one orchestration or CLI step.
@ -66,6 +111,22 @@ func (e *Exporter) UpdateSample(s Sample) {
e.samples[s.Name] = s
}
// UpdateGitOpsSnapshot records the most recent Flux object-state scrape.
// Signature: (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot).
// Why: Grafana needs object readiness and branch/revision state, while Flux's
// controller metrics only expose controller health in this cluster.
func (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot) {
e.mu.Lock()
defer e.mu.Unlock()
if snapshot.UpdatedAt.IsZero() {
snapshot.UpdatedAt = time.Now().UTC()
}
if snapshot.Errors == nil {
snapshot.Errors = map[string]string{}
}
e.gitops = snapshot
}
// MarkShutdown runs one orchestration or CLI step.
// Signature: (e *Exporter) MarkShutdown(reason string).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -164,11 +225,99 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
}
b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
}
appendGitOpsMetrics(&b, e.gitops)
appendQualityGateMetrics(&b)
_, _ = w.Write([]byte(b.String()))
}
// appendGitOpsMetrics writes Flux object-state metrics collected by the
// long-running daemon loop.
// Signature: appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot).
// Why: this keeps the expensive Kubernetes API reads out of the HTTP scrape path
// while still making current GitOps health cheap for Grafana to query.
func appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot) {
if dst.Len() > 0 {
dst.WriteString("\n")
}
dst.WriteString("# HELP ananke_gitops_last_scrape_timestamp_seconds Unix timestamp of the latest GitOps object-state scrape.\n")
dst.WriteString("# TYPE ananke_gitops_last_scrape_timestamp_seconds gauge\n")
if snapshot.UpdatedAt.IsZero() {
dst.WriteString("ananke_gitops_last_scrape_timestamp_seconds 0\n")
} else {
dst.WriteString(fmt.Sprintf("ananke_gitops_last_scrape_timestamp_seconds %d\n", snapshot.UpdatedAt.Unix()))
}
dst.WriteString("# HELP ananke_gitops_scrape_success Whether the latest GitOps object-state scrape completed without errors.\n")
dst.WriteString("# TYPE ananke_gitops_scrape_success gauge\n")
dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_success %d\n", boolNum(snapshot.ScrapeSuccess)))
dst.WriteString("# HELP ananke_gitops_scrape_error Whether a GitOps resource family failed during the latest scrape.\n")
dst.WriteString("# TYPE ananke_gitops_scrape_error gauge\n")
resources := []string{"gitrepository", "kustomization", "helmrelease"}
for _, resource := range resources {
_, failed := snapshot.Errors[resource]
dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_error{resource=%q} %d\n", resource, boolNum(failed)))
}
dst.WriteString("# HELP ananke_gitops_flux_source_info Current Flux GitRepository source metadata.\n")
dst.WriteString("# TYPE ananke_gitops_flux_source_info gauge\n")
dst.WriteString("# HELP ananke_gitops_flux_source_ready Whether a Flux GitRepository source is Ready.\n")
dst.WriteString("# TYPE ananke_gitops_flux_source_ready gauge\n")
dst.WriteString("# HELP ananke_gitops_flux_source_suspended Whether a Flux GitRepository source is suspended.\n")
dst.WriteString("# TYPE ananke_gitops_flux_source_suspended gauge\n")
sort.Slice(snapshot.FluxSources, func(i, j int) bool {
return snapshot.FluxSources[i].Namespace+"/"+snapshot.FluxSources[i].Name < snapshot.FluxSources[j].Namespace+"/"+snapshot.FluxSources[j].Name
})
for _, source := range snapshot.FluxSources {
infoLabels := fmt.Sprintf("{namespace=%q,name=%q,url=%q,branch=%q,revision=%q,ready=%q,reason=%q}",
safe(source.Namespace), safe(source.Name), safe(source.URL), safe(source.Branch),
safe(source.Revision), readyLabel(source.Ready), safe(defaultLabel(source.Reason, "unknown")))
baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(source.Namespace), safe(source.Name))
dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_info%s 1\n", infoLabels))
dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_ready%s %d\n", baseLabels, boolNum(source.Ready)))
dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_suspended%s %d\n", baseLabels, boolNum(source.Suspended)))
}
dst.WriteString("# HELP ananke_gitops_kustomization_info Current Flux Kustomization metadata.\n")
dst.WriteString("# TYPE ananke_gitops_kustomization_info gauge\n")
dst.WriteString("# HELP ananke_gitops_kustomization_ready Whether a Flux Kustomization is Ready.\n")
dst.WriteString("# TYPE ananke_gitops_kustomization_ready gauge\n")
dst.WriteString("# HELP ananke_gitops_kustomization_suspended Whether a Flux Kustomization is suspended.\n")
dst.WriteString("# TYPE ananke_gitops_kustomization_suspended gauge\n")
sort.Slice(snapshot.Kustomizations, func(i, j int) bool {
return snapshot.Kustomizations[i].Namespace+"/"+snapshot.Kustomizations[i].Name < snapshot.Kustomizations[j].Namespace+"/"+snapshot.Kustomizations[j].Name
})
for _, kustomization := range snapshot.Kustomizations {
infoLabels := fmt.Sprintf("{namespace=%q,name=%q,path=%q,source_namespace=%q,source_name=%q,revision=%q,ready=%q,reason=%q}",
safe(kustomization.Namespace), safe(kustomization.Name), safe(kustomization.Path),
safe(kustomization.SourceNamespace), safe(kustomization.SourceName), safe(kustomization.Revision),
readyLabel(kustomization.Ready), safe(defaultLabel(kustomization.Reason, "unknown")))
baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(kustomization.Namespace), safe(kustomization.Name))
dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_info%s 1\n", infoLabels))
dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_ready%s %d\n", baseLabels, boolNum(kustomization.Ready)))
dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_suspended%s %d\n", baseLabels, boolNum(kustomization.Suspended)))
}
dst.WriteString("# HELP ananke_gitops_helmrelease_info Current Flux HelmRelease metadata.\n")
dst.WriteString("# TYPE ananke_gitops_helmrelease_info gauge\n")
dst.WriteString("# HELP ananke_gitops_helmrelease_ready Whether a Flux HelmRelease is Ready.\n")
dst.WriteString("# TYPE ananke_gitops_helmrelease_ready gauge\n")
dst.WriteString("# HELP ananke_gitops_helmrelease_suspended Whether a Flux HelmRelease is suspended.\n")
dst.WriteString("# TYPE ananke_gitops_helmrelease_suspended gauge\n")
sort.Slice(snapshot.HelmReleases, func(i, j int) bool {
return snapshot.HelmReleases[i].Namespace+"/"+snapshot.HelmReleases[i].Name < snapshot.HelmReleases[j].Namespace+"/"+snapshot.HelmReleases[j].Name
})
for _, release := range snapshot.HelmReleases {
infoLabels := fmt.Sprintf("{namespace=%q,name=%q,chart=%q,version=%q,app_version=%q,revision=%q,ready=%q,reason=%q}",
safe(release.Namespace), safe(release.Name), safe(release.Chart), safe(release.Version),
safe(release.AppVersion), safe(release.Revision), readyLabel(release.Ready),
safe(defaultLabel(release.Reason, "unknown")))
baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(release.Namespace), safe(release.Name))
dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_info%s 1\n", infoLabels))
dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_ready%s %d\n", baseLabels, boolNum(release.Ready)))
dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_suspended%s %d\n", baseLabels, boolNum(release.Suspended)))
}
}
// appendQualityGateMetrics runs one orchestration or CLI step.
// Signature: appendQualityGateMetrics(dst *strings.Builder).
// Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
@ -228,3 +377,26 @@ func safe(in string) string {
out := strings.ReplaceAll(in, "\\", "\\\\")
return strings.ReplaceAll(out, "\"", "\\\"")
}
// readyLabel formats a boolean readiness state as a stable label value.
// Signature: readyLabel(ready bool) string.
// Why: info metrics need a human-readable status label without changing the
// numeric ready gauges used for alerting and aggregation.
func readyLabel(ready bool) string {
if ready {
return "true"
}
return "false"
}
// defaultLabel returns a safe fallback for empty metric label values.
// Signature: defaultLabel(value string, fallback string) string.
// Why: Flux status fields can be absent during startup, but Prometheus labels
// should remain explicit rather than silently becoming empty strings.
func defaultLabel(value string, fallback string) string {
value = strings.TrimSpace(value)
if value == "" {
return fallback
}
return value
}

View File

@ -97,6 +97,9 @@ func (d *Daemon) Run(ctx context.Context) error {
onBatterySince := map[string]time.Time{}
breachCount := map[string]int{}
lastAutoHeal := time.Time{}
lastGitOpsPoll := time.Time{}
gitOpsPollRunning := false
gitOpsDone := make(chan struct{}, 1)
for _, t := range d.targets {
lastGood[t.Name] = time.Now()
}
@ -108,6 +111,8 @@ func (d *Daemon) Run(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-gitOpsDone:
gitOpsPollRunning = false
case <-t.C:
budget := d.orch.EstimatedEmergencyShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
@ -201,6 +206,7 @@ func (d *Daemon) Run(ctx context.Context) error {
}
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
gitOpsPollRunning = d.maybeStartGitOpsSnapshot(ctx, &lastGitOpsPoll, gitOpsPollRunning, gitOpsDone)
}
}
}

View File

@ -0,0 +1,355 @@
package service
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"sort"
"strings"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/metrics"
)
var gitOpsKubectlOutput = runGitOpsKubectlOutput
type gitOpsResourceList struct {
Items []gitOpsResource `json:"items"`
}
type gitOpsResource struct {
Metadata struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
} `json:"metadata"`
Spec struct {
Suspend bool `json:"suspend"`
Ref struct {
Branch string `json:"branch"`
Tag string `json:"tag"`
SemVer string `json:"semver"`
} `json:"ref"`
URL string `json:"url"`
Path string `json:"path"`
SourceRef struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
} `json:"sourceRef"`
Chart struct {
Spec struct {
Chart string `json:"chart"`
Version string `json:"version"`
} `json:"spec"`
} `json:"chart"`
} `json:"spec"`
Status struct {
Artifact struct {
Revision string `json:"revision"`
} `json:"artifact"`
LastAppliedRevision string `json:"lastAppliedRevision"`
LastAttemptedRevision string `json:"lastAttemptedRevision"`
Conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
Reason string `json:"reason"`
} `json:"conditions"`
History []struct {
ChartName string `json:"chartName"`
ChartVersion string `json:"chartVersion"`
AppVersion string `json:"appVersion"`
Digest string `json:"digest"`
} `json:"history"`
} `json:"status"`
}
// maybeStartGitOpsSnapshot starts a bounded background scrape when the GitOps
// sample interval has elapsed.
// Signature: (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool.
// Why: Kubernetes API reads must not block UPS polling, especially during
// emergency power events when the daemon's first job is still safe shutdown.
func (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool {
if !d.cfg.GitOps.Enabled || d.exporter == nil || running {
return running
}
interval := time.Duration(d.cfg.GitOps.PollSeconds) * time.Second
if interval <= 0 {
interval = 60 * time.Second
}
now := time.Now()
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
return running
}
if lastRun != nil {
*lastRun = now
}
cfg := d.cfg
exporter := d.exporter
logger := d.log
go func() {
defer func() {
select {
case done <- struct{}{}:
default:
}
}()
scrapeCtx, cancel := context.WithTimeout(ctx, 25*time.Second)
defer cancel()
snapshot := collectGitOpsSnapshot(scrapeCtx, cfg)
exporter.UpdateGitOpsSnapshot(snapshot)
if !snapshot.ScrapeSuccess && logger != nil {
logger.Printf("warning: gitops metrics scrape partial: %s", gitOpsErrorSummary(snapshot.Errors))
}
}()
return true
}
// collectGitOpsSnapshot queries Flux custom resources and converts them into
// Prometheus-safe metric state.
// Signature: collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot.
// Why: the Grafana overview needs current branch/readiness/suspend state, which
// is object status rather than controller operational telemetry.
func collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot {
snapshot := metrics.GitOpsSnapshot{
UpdatedAt: time.Now().UTC(),
ScrapeSuccess: true,
Errors: map[string]string{},
}
if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "gitrepositories.source.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
snapshot.Errors["gitrepository"] = err.Error()
} else if sources, err := parseGitRepositories(raw); err != nil {
snapshot.Errors["gitrepository"] = err.Error()
} else {
snapshot.FluxSources = sources
}
if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "kustomizations.kustomize.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
snapshot.Errors["kustomization"] = err.Error()
} else if kustomizations, err := parseKustomizations(raw); err != nil {
snapshot.Errors["kustomization"] = err.Error()
} else {
snapshot.Kustomizations = kustomizations
}
if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "helmreleases.helm.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
snapshot.Errors["helmrelease"] = err.Error()
} else if releases, err := parseHelmReleases(raw); err != nil {
snapshot.Errors["helmrelease"] = err.Error()
} else {
snapshot.HelmReleases = releases
}
snapshot.ScrapeSuccess = len(snapshot.Errors) == 0
return snapshot
}
// parseGitRepositories extracts Git source branch/revision/readiness from Flux
// GitRepository JSON.
// Signature: parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error).
// Why: keeping status parsing isolated makes it testable without a live Flux
// API and protects the daemon loop from brittle JSON traversal code.
func parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) {
var list gitOpsResourceList
if err := json.Unmarshal(raw, &list); err != nil {
return nil, fmt.Errorf("decode gitrepositories: %w", err)
}
out := make([]metrics.GitOpsFluxSource, 0, len(list.Items))
for _, item := range list.Items {
ready, reason := fluxReady(item.Status.Conditions)
out = append(out, metrics.GitOpsFluxSource{
Namespace: defaultString(item.Metadata.Namespace, "default"),
Name: item.Metadata.Name,
URL: item.Spec.URL,
Branch: sourceBranch(item),
Revision: item.Status.Artifact.Revision,
Ready: ready,
Reason: reason,
Suspended: item.Spec.Suspend,
})
}
sort.Slice(out, func(i, j int) bool {
return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
})
return out, nil
}
// parseKustomizations extracts Kustomization health and source metadata from
// Flux Kustomization JSON.
// Signature: parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error).
// Why: Kustomization status drives both the overview summary and the detailed
// GitOps table, so the parser keeps the metric contract in one place.
func parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) {
var list gitOpsResourceList
if err := json.Unmarshal(raw, &list); err != nil {
return nil, fmt.Errorf("decode kustomizations: %w", err)
}
out := make([]metrics.GitOpsKustomization, 0, len(list.Items))
for _, item := range list.Items {
ready, reason := fluxReady(item.Status.Conditions)
namespace := defaultString(item.Metadata.Namespace, "default")
out = append(out, metrics.GitOpsKustomization{
Namespace: namespace,
Name: item.Metadata.Name,
Path: item.Spec.Path,
SourceNamespace: defaultString(item.Spec.SourceRef.Namespace, namespace),
SourceName: item.Spec.SourceRef.Name,
Revision: firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision),
Ready: ready,
Reason: reason,
Suspended: item.Spec.Suspend,
})
}
sort.Slice(out, func(i, j int) bool {
return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
})
return out, nil
}
// parseHelmReleases extracts HelmRelease health and chart metadata from Flux
// HelmRelease JSON.
// Signature: parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error).
// Why: HelmRelease status has different chart fields than Kustomizations, and
// isolating it keeps dashboard metrics stable as Flux payloads evolve.
func parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) {
var list gitOpsResourceList
if err := json.Unmarshal(raw, &list); err != nil {
return nil, fmt.Errorf("decode helmreleases: %w", err)
}
out := make([]metrics.GitOpsHelmRelease, 0, len(list.Items))
for _, item := range list.Items {
ready, reason := fluxReady(item.Status.Conditions)
chart := item.Spec.Chart.Spec.Chart
version := item.Spec.Chart.Spec.Version
appVersion := ""
revision := firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision)
if len(item.Status.History) > 0 {
latest := item.Status.History[0]
chart = firstNonEmpty(latest.ChartName, chart)
version = firstNonEmpty(latest.ChartVersion, version)
appVersion = latest.AppVersion
revision = firstNonEmpty(latest.Digest, revision)
}
out = append(out, metrics.GitOpsHelmRelease{
Namespace: defaultString(item.Metadata.Namespace, "default"),
Name: item.Metadata.Name,
Chart: chart,
Version: version,
AppVersion: appVersion,
Revision: revision,
Ready: ready,
Reason: reason,
Suspended: item.Spec.Suspend,
})
}
sort.Slice(out, func(i, j int) bool {
return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
})
return out, nil
}
// gitOpsKubectlJSON runs kubectl with the configured kubeconfig and a short
// request timeout.
// Signature: gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error).
// Why: every GitOps scrape should fail quickly instead of stealing time from
// UPS safety polling during outage recovery.
func gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error) {
finalArgs := []string{}
if strings.TrimSpace(cfg.Kubeconfig) != "" {
finalArgs = append(finalArgs, "--kubeconfig", cfg.Kubeconfig)
}
finalArgs = append(finalArgs, "--request-timeout=8s")
finalArgs = append(finalArgs, args...)
return gitOpsKubectlOutput(ctx, "kubectl", finalArgs...)
}
// runGitOpsKubectlOutput is the production command runner for GitOps snapshot
// collection.
// Signature: runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error).
// Why: preserving stderr in errors makes live diagnosis possible when Flux CRD
// reads fail on a recovering control plane.
func runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) {
cmd := exec.CommandContext(ctx, name, args...)
out, err := cmd.CombinedOutput()
if err != nil {
trimmed := strings.TrimSpace(string(out))
if trimmed == "" {
return nil, err
}
return nil, fmt.Errorf("%w: %s", err, trimmed)
}
return out, nil
}
// fluxReady resolves a Flux Ready condition into a boolean and reason.
// Signature: fluxReady(conditions []struct { Type string; Status string; Reason string }) (bool, string).
// Why: Flux resources all use Ready conditions, so one helper keeps status
// interpretation consistent across GitRepository, Kustomization, and HelmRelease.
func fluxReady(conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
Reason string `json:"reason"`
}) (bool, string) {
for _, condition := range conditions {
if condition.Type == "Ready" {
return strings.EqualFold(condition.Status, "True"), defaultString(condition.Reason, "ReadyUnknown")
}
}
return false, "ReadyMissing"
}
// sourceBranch returns the most useful source reference label for a
// GitRepository.
// Signature: sourceBranch(item gitOpsResource) string.
// Why: branch is preferred for Atlas, but tags and semver refs should still
// render clearly if Flux is ever pinned differently.
func sourceBranch(item gitOpsResource) string {
return firstNonEmpty(item.Spec.Ref.Branch, item.Spec.Ref.Tag, item.Spec.Ref.SemVer, "unknown")
}
// defaultString trims a value and returns a fallback when it is empty.
// Signature: defaultString(value string, fallback string) string.
// Why: Flux omits some optional fields; dashboards are easier to read with
// explicit fallback labels.
func defaultString(value string, fallback string) string {
value = strings.TrimSpace(value)
if value == "" {
return fallback
}
return value
}
// firstNonEmpty returns the first non-empty trimmed value.
// Signature: firstNonEmpty(values ...string) string.
// Why: Flux has several revision fields depending on resource kind and
// reconciliation stage, so callers can express preferred fallback order.
func firstNonEmpty(values ...string) string {
for _, value := range values {
value = strings.TrimSpace(value)
if value != "" {
return value
}
}
return ""
}
// gitOpsErrorSummary renders resource-family scrape errors for daemon logs.
// Signature: gitOpsErrorSummary(errors map[string]string) string.
// Why: a compact sorted summary keeps recurring scrape warnings readable in
// journald during partial cluster recovery.
func gitOpsErrorSummary(errors map[string]string) string {
if len(errors) == 0 {
return "none"
}
keys := make([]string, 0, len(errors))
for key := range errors {
keys = append(keys, key)
}
sort.Strings(keys)
parts := make([]string, 0, len(keys))
for _, key := range keys {
parts = append(parts, key+"="+errors[key])
}
return strings.Join(parts, "; ")
}

View File

@ -0,0 +1,127 @@
package service
import (
"context"
"log"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/metrics"
)
type TestHookGitOpsRunner func(context.Context, string, ...string) ([]byte, error)
// TestHookParseGitRepositories exposes GitRepository parsing to the top-level
// testing module.
// Signature: TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error).
// Why: Ananke keeps split-module tests outside internal packages, but the
// parser needs direct contract coverage without a live cluster.
func TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) {
return parseGitRepositories(raw)
}
// TestHookParseKustomizations exposes Kustomization parsing to the top-level
// testing module.
// Signature: TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error).
// Why: dashboard-facing labels should be verified from representative Flux JSON
// without relying on the current production cluster shape.
func TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) {
return parseKustomizations(raw)
}
// TestHookParseHelmReleases exposes HelmRelease parsing to the top-level
// testing module.
// Signature: TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error).
// Why: Helm status fields vary by reconciliation phase, and split-module tests
// need a stable seam for parser coverage.
func TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) {
return parseHelmReleases(raw)
}
// TestHookCollectGitOpsSnapshotWithRunner runs snapshot collection with an
// injected kubectl runner.
// Signature: TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot.
// Why: this covers success and partial-failure collection paths without shelling
// out to kubectl from unit tests.
func TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot {
original := gitOpsKubectlOutput
defer func() { gitOpsKubectlOutput = original }()
gitOpsKubectlOutput = runner
return collectGitOpsSnapshot(ctx, cfg)
}
// TestHookMaybeStartGitOpsSnapshot starts the daemon's background GitOps scrape
// from split-module tests.
// Signature: TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool.
// Why: the daemon intentionally keeps scraping asynchronous so GitOps telemetry
// cannot delay UPS polling; the behavior needs direct coverage.
func TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool {
d := &Daemon{cfg: cfg, exporter: exporter, log: logger}
return d.maybeStartGitOpsSnapshot(ctx, lastRun, running, done)
}
// TestHookMaybeStartGitOpsSnapshotWithRunner starts a background GitOps scrape
// with an injected runner and restores the production runner after completion.
// Signature: TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool.
// Why: the scrape is asynchronous, so split-module tests need a seam that keeps
// fake kubectl behavior installed until the goroutine exits.
func TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool {
original := gitOpsKubectlOutput
gitOpsKubectlOutput = runner
proxyDone := make(chan struct{}, 1)
d := &Daemon{cfg: cfg, exporter: exporter, log: logger}
started := d.maybeStartGitOpsSnapshot(ctx, lastRun, running, proxyDone)
if !started || running {
gitOpsKubectlOutput = original
return started
}
go func() {
<-proxyDone
gitOpsKubectlOutput = original
select {
case done <- struct{}{}:
default:
}
}()
return started
}
// TestHookRunGitOpsKubectlOutput exposes the production command runner to
// split-module tests.
// Signature: TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error).
// Why: stderr preservation and empty-stderr failure behavior are part of the
// operator-facing diagnostics contract.
func TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) {
return runGitOpsKubectlOutput(ctx, name, args...)
}
// TestHookGitOpsDefaultString exposes defaultString to split-module tests.
// Signature: TestHookGitOpsDefaultString(value string, fallback string) string.
// Why: fallback label behavior is intentionally tiny but important for readable
// Grafana tables during startup.
func TestHookGitOpsDefaultString(value string, fallback string) string {
return defaultString(value, fallback)
}
// TestHookGitOpsFirstNonEmpty exposes firstNonEmpty to split-module tests.
// Signature: TestHookGitOpsFirstNonEmpty(values ...string) string.
// Why: revision fallback order should remain deterministic as Flux status
// payloads change.
func TestHookGitOpsFirstNonEmpty(values ...string) string {
return firstNonEmpty(values...)
}
// TestHookGitOpsErrorSummary exposes gitOpsErrorSummary to split-module tests.
// Signature: TestHookGitOpsErrorSummary(errors map[string]string) string.
// Why: sorted scrape warnings are easier to inspect in journald during recovery.
func TestHookGitOpsErrorSummary(errors map[string]string) string {
return gitOpsErrorSummary(errors)
}
// TestHookFluxReadyMissing exposes the missing Ready-condition branch to tests.
// Signature: TestHookFluxReadyMissing() (bool, string).
// Why: absent Ready conditions should be treated as not-ready and labelled
// explicitly instead of being mistaken for healthy.
func TestHookFluxReadyMissing() (bool, string) {
return fluxReady(nil)
}

View File

@ -172,6 +172,14 @@ def _read_coverage_percent(path: str) -> float:
return 0.0
def _resolve_repo_path(repo_root: Path, path: str) -> Path:
"""Resolve quality-gate artifact paths relative to the repository root."""
candidate = Path(path)
if candidate.is_absolute():
return candidate
return repo_root / candidate
def _iter_source_files(repo_root: Path):
for rel_root in SOURCE_SCAN_ROOTS:
base = repo_root / rel_root
@ -367,7 +375,9 @@ def main(argv: list[str] | None = None) -> int:
args = parse_args(argv or sys.argv[1:])
repo_root = Path(__file__).resolve().parents[1]
build_dir = repo_root / "build"
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
gate_rc = _read_exit_code(
_resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc")))
)
current_ok = 1 if gate_rc == 0 else 0
current_failed = 0 if gate_rc == 0 else 1
@ -420,13 +430,18 @@ def main(argv: list[str] | None = None) -> int:
elif not already_recorded:
resolved_ok += current_ok
resolved_failed += current_failed
coverage_percent = _read_coverage_percent(args.coverage_percent_file)
coverage_percent = _read_coverage_percent(str(_resolve_repo_path(repo_root, args.coverage_percent_file)))
source_files_total = _count_source_files(repo_root)
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
quality_output = _resolve_repo_path(
repo_root,
os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")),
)
tests = _parse_go_test_counts(quality_output)
test_cases = _parse_go_test_cases(quality_output)
docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
docs_status = _read_status(
_resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status")))
)
unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
checks = {
"tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",

View File

@ -61,6 +61,18 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.server.server_close()
self.thread.join(timeout=5)
def test_relative_artifact_paths_are_repo_rooted(self) -> None:
repo_root = Path("/tmp/ananke-repo")
self.assertEqual(
publisher._resolve_repo_path(repo_root, "build/coverage-percent.txt"),
repo_root / "build" / "coverage-percent.txt",
)
self.assertEqual(
publisher._resolve_repo_path(repo_root, "/tmp/coverage-percent.txt"),
Path("/tmp/coverage-percent.txt"),
)
def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
tmp_dir = tempfile.TemporaryDirectory()
self.addCleanup(tmp_dir.cleanup)

View File

@ -56,12 +56,13 @@ read_quality_counter() {
write_quality_metrics() {
local exit_code="$1"
local metrics_dir state_dir
local metrics_dir state_dir write_metrics
metrics_dir="$(dirname "${QUALITY_METRICS_FILE}")"
state_dir="$(dirname "${QUALITY_STATE_FILE}")"
write_metrics="${QUALITY_METRICS_ENABLED}"
mkdir -p "${state_dir}" >/dev/null 2>&1 || return 0
if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
mkdir -p "${metrics_dir}" >/dev/null 2>&1 || return 0
if [[ "${write_metrics}" == "1" ]]; then
mkdir -p "${metrics_dir}" >/dev/null 2>&1 || write_metrics=0
fi
local ok failed total last_success now success_percent
@ -84,12 +85,12 @@ write_quality_metrics() {
QUALITY_SUCCESS_PERCENT="${success_percent}"
local tmp_metrics="" tmp_state
if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
if [[ "${write_metrics}" == "1" ]]; then
tmp_metrics="$(mktemp "${metrics_dir}/quality-gate.prom.XXXXXX")"
fi
tmp_state="$(mktemp "${state_dir}/quality-gate.state.XXXXXX")"
if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
if [[ "${write_metrics}" == "1" ]]; then
cat > "${tmp_metrics}" <<EOF
# HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.
# TYPE ananke_quality_gate_runs_total counter
@ -115,7 +116,7 @@ last_run=${now}
EOF
mv -f "${tmp_state}" "${QUALITY_STATE_FILE}"
if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
if [[ "${write_metrics}" == "1" ]]; then
mv -f "${tmp_metrics}" "${QUALITY_METRICS_FILE}"
fi
}
@ -146,6 +147,8 @@ publish_quality_metrics() {
quality_gate_finalize() {
local exit_code="$1"
set +e
printf '%s\n' "${exit_code}" > "${BUILD_DIR}/quality-gate.rc" 2>/dev/null || true
cd "${REPO_DIR}" 2>/dev/null || true
write_quality_metrics "${exit_code}" || true
publish_quality_metrics || true
exit "${exit_code}"
@ -193,4 +196,7 @@ printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
echo "[quality] per-file coverage gate (95%)"
cd testing
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
ANANKE_ENFORCE_COVERAGE=1 \
ANANKE_PER_FILE_COVERAGE_TARGET=95 \
ANANKE_PER_FILE_COVERAGE_PERCENT_FILE="${COVERAGE_PERCENT_FILE}" \
go test ./coverage -run TestPerFileCoverageReport -count=1 -v

View File

@ -121,6 +121,7 @@ func TestPerFileCoverageReport(t *testing.T) {
"./metrics",
"./hygiene",
"./orchestrator",
"./service",
"./state",
"./sshutil",
"./ups",
@ -167,17 +168,26 @@ func TestPerFileCoverageReport(t *testing.T) {
var report bytes.Buffer
report.WriteString("per-file coverage\n")
under := make([]string, 0)
minPercent := 100.0
for _, file := range keys {
fc := byFile[file]
pct := 100.0
if fc.total > 0 {
pct = (fc.covered / fc.total) * 100.0
}
if pct < minPercent {
minPercent = pct
}
report.WriteString(fmt.Sprintf("- %s: %.1f%%\n", file, pct))
if pct < target {
under = append(under, fmt.Sprintf("%s (%.1f%% < %.1f%%)", file, pct, target))
}
}
if outputPath := strings.TrimSpace(os.Getenv("ANANKE_PER_FILE_COVERAGE_PERCENT_FILE")); outputPath != "" {
if err := os.WriteFile(outputPath, []byte(fmt.Sprintf("%.1f\n", minPercent)), 0o644); err != nil {
t.Fatalf("write per-file coverage percent: %v", err)
}
}
t.Log(report.String())
if len(under) > 0 && enforce {

View File

@ -104,3 +104,64 @@ func TestExporterHelperContracts(t *testing.T) {
t.Fatalf("unexpected escaped string: %q", got)
}
}
// TestExporterEmitsGitOpsMetrics runs one orchestration or CLI step.
// Signature: TestExporterEmitsGitOpsMetrics(t *testing.T).
// Why: the overview dashboard depends on Ananke-owned Flux object-state metrics
// rather than the narrower controller metrics exposed by Flux itself.
func TestExporterEmitsGitOpsMetrics(t *testing.T) {
exporter := metrics.New()
exporter.UpdateGitOpsSnapshot(metrics.GitOpsSnapshot{
UpdatedAt: time.Unix(1710000100, 0).UTC(),
ScrapeSuccess: true,
FluxSources: []metrics.GitOpsFluxSource{{
Namespace: "flux-system",
Name: "flux-system",
URL: "ssh://git@example/repo.git",
Branch: "main",
Revision: "main@sha1:abc123",
Ready: true,
Reason: "Succeeded",
}},
Kustomizations: []metrics.GitOpsKustomization{{
Namespace: "flux-system",
Name: "monitoring",
Path: "./services/monitoring",
SourceNamespace: "flux-system",
SourceName: "flux-system",
Revision: "main@sha1:abc123",
Ready: true,
Reason: "ReconciliationSucceeded",
}},
HelmReleases: []metrics.GitOpsHelmRelease{{
Namespace: "monitoring",
Name: "grafana",
Chart: "grafana",
Version: "8.5.0",
AppVersion: "11.4.0",
Revision: "sha256:abc123",
Ready: false,
Reason: "Progressing",
Suspended: true,
}},
})
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
rr := httptest.NewRecorder()
exporter.Handler("/metrics").ServeHTTP(rr, req)
body := rr.Body.String()
mustContain := []string{
"ananke_gitops_last_scrape_timestamp_seconds 1710000100",
"ananke_gitops_scrape_success 1",
`ananke_gitops_flux_source_info{namespace="flux-system",name="flux-system",url="ssh://git@example/repo.git",branch="main",revision="main@sha1:abc123",ready="true",reason="Succeeded"} 1`,
`ananke_gitops_kustomization_ready{namespace="flux-system",name="monitoring"} 1`,
`ananke_gitops_helmrelease_info{namespace="monitoring",name="grafana",chart="grafana",version="8.5.0",app_version="11.4.0",revision="sha256:abc123",ready="false",reason="Progressing"} 1`,
`ananke_gitops_helmrelease_suspended{namespace="monitoring",name="grafana"} 1`,
}
for _, fragment := range mustContain {
if !strings.Contains(body, fragment) {
t.Fatalf("missing GitOps metric fragment %q in output:\n%s", fragment, body)
}
}
}

View File

@ -0,0 +1,316 @@
package servicequality
import (
"context"
"errors"
"io"
"log"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/metrics"
"scm.bstein.dev/bstein/ananke/internal/service"
)
// TestParseGitRepositories runs one orchestration or CLI step.
// Signature: TestParseGitRepositories(t *testing.T).
// Why: branch, revision, and Ready status are the labels the Grafana GitOps
// panels rely on for the current Flux source.
func TestParseGitRepositories(t *testing.T) {
raw := []byte(`{
"items": [{
"metadata": {"name": "flux-system", "namespace": "flux-system"},
"spec": {"url": "ssh://git@example/repo.git", "ref": {"branch": "main"}},
"status": {
"artifact": {"revision": "main@sha1:abc123"},
"conditions": [{"type": "Ready", "status": "True", "reason": "Succeeded"}]
}
}]
}`)
got, err := service.TestHookParseGitRepositories(raw)
if err != nil {
t.Fatalf("parseGitRepositories failed: %v", err)
}
if len(got) != 1 || got[0].Branch != "main" || got[0].Revision != "main@sha1:abc123" || !got[0].Ready {
t.Fatalf("unexpected GitRepository parse result: %+v", got)
}
}
// TestParseKustomizations runs one orchestration or CLI step.
// Signature: TestParseKustomizations(t *testing.T).
// Why: Kustomization source and suspend labels power the detailed GitOps table.
func TestParseKustomizations(t *testing.T) {
raw := []byte(`{
"items": [{
"metadata": {"name": "monitoring", "namespace": "flux-system"},
"spec": {
"path": "./services/monitoring",
"sourceRef": {"name": "flux-system"},
"suspend": true
},
"status": {
"lastAppliedRevision": "main@sha1:def456",
"conditions": [{"type": "Ready", "status": "False", "reason": "DependencyNotReady"}]
}
}]
}`)
got, err := service.TestHookParseKustomizations(raw)
if err != nil {
t.Fatalf("parseKustomizations failed: %v", err)
}
if len(got) != 1 || got[0].SourceNamespace != "flux-system" || got[0].Ready || !got[0].Suspended {
t.Fatalf("unexpected Kustomization parse result: %+v", got)
}
}
// TestParseHelmReleases runs one orchestration or CLI step.
// Signature: TestParseHelmReleases(t *testing.T).
// Why: HelmRelease chart fields have separate status fallbacks from plain Flux
// Kustomizations and need their own contract coverage.
func TestParseHelmReleases(t *testing.T) {
raw := []byte(`{
"items": [{
"metadata": {"name": "grafana", "namespace": "monitoring"},
"spec": {
"chart": {"spec": {"chart": "grafana", "version": "~8.5.0"}}
},
"status": {
"lastAttemptedRevision": "8.5.8",
"history": [{"chartName": "grafana", "chartVersion": "8.5.8", "appVersion": "11.4.0", "digest": "sha256:abc"}],
"conditions": [{"type": "Ready", "status": "True", "reason": "InstallSucceeded"}]
}
}]
}`)
got, err := service.TestHookParseHelmReleases(raw)
if err != nil {
t.Fatalf("parseHelmReleases failed: %v", err)
}
if len(got) != 1 || got[0].Chart != "grafana" || got[0].Version != "8.5.8" || got[0].Revision != "sha256:abc" || !got[0].Ready {
t.Fatalf("unexpected HelmRelease parse result: %+v", got)
}
}
// TestCollectGitOpsSnapshotPartialFailure runs one orchestration or CLI step.
// Signature: TestCollectGitOpsSnapshotPartialFailure(t *testing.T).
// Why: a recovering cluster may serve some Flux resource families before
// others, and Ananke should publish a partial snapshot instead of failing closed.
func TestCollectGitOpsSnapshotPartialFailure(t *testing.T) {
runner := func(_ context.Context, _ string, args ...string) ([]byte, error) {
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "gitrepositories"):
return []byte(`{"items":[]}`), nil
case strings.Contains(joined, "kustomizations"):
return nil, errors.New("api unavailable")
case strings.Contains(joined, "helmreleases"):
return []byte(`{"items":[]}`), nil
default:
return nil, errors.New("unexpected command")
}
}
snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{Kubeconfig: "/tmp/kubeconfig"}, runner)
if snapshot.ScrapeSuccess {
t.Fatalf("expected partial failure")
}
if snapshot.Errors["kustomization"] == "" {
t.Fatalf("expected kustomization error, got %+v", snapshot.Errors)
}
if summary := service.TestHookGitOpsErrorSummary(snapshot.Errors); !strings.Contains(summary, "kustomization=api unavailable") {
t.Fatalf("unexpected error summary: %s", summary)
}
}
// TestCollectGitOpsSnapshotParseFailures runs one orchestration or CLI step.
// Signature: TestCollectGitOpsSnapshotParseFailures(t *testing.T).
// Why: malformed API payloads should be surfaced per resource family so the
// exporter can distinguish data errors from transport failures.
func TestCollectGitOpsSnapshotParseFailures(t *testing.T) {
runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
return []byte(`not-json`), nil
}
snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner)
if snapshot.ScrapeSuccess {
t.Fatalf("expected parse failures")
}
for _, key := range []string{"gitrepository", "kustomization", "helmrelease"} {
if snapshot.Errors[key] == "" {
t.Fatalf("expected %s parse error, got %+v", key, snapshot.Errors)
}
}
}
// TestCollectGitOpsSnapshotSuccess runs one orchestration or CLI step.
// Signature: TestCollectGitOpsSnapshotSuccess(t *testing.T).
// Why: successful collection should preserve non-branch Git refs as the source
// label rather than hiding them as unknown.
func TestCollectGitOpsSnapshotSuccess(t *testing.T) {
runner := func(_ context.Context, _ string, args ...string) ([]byte, error) {
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "gitrepositories"):
return []byte(`{"items":[{"metadata":{"name":"flux-system","namespace":"flux-system"},"spec":{"ref":{"tag":"v1.0.0"}}}]}`), nil
case strings.Contains(joined, "kustomizations"):
return []byte(`{"items":[]}`), nil
case strings.Contains(joined, "helmreleases"):
return []byte(`{"items":[]}`), nil
default:
return nil, errors.New("unexpected command")
}
}
snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner)
if !snapshot.ScrapeSuccess || len(snapshot.FluxSources) != 1 || snapshot.FluxSources[0].Branch != "v1.0.0" {
t.Fatalf("unexpected snapshot: %+v", snapshot)
}
}
// TestMaybeStartGitOpsSnapshot runs one orchestration or CLI step.
// Signature: TestMaybeStartGitOpsSnapshot(t *testing.T).
// Why: Ananke must collect GitOps status asynchronously so a slow Kubernetes API
// cannot delay UPS polling.
func TestMaybeStartGitOpsSnapshot(t *testing.T) {
runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
return []byte(`{"items":[]}`), nil
}
exporter := metrics.New()
done := make(chan struct{}, 1)
lastRun := time.Time{}
cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}}
withRunner := func() bool {
return service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner).ScrapeSuccess
}
if !withRunner() {
t.Fatalf("expected injected runner sanity check to pass")
}
if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, false, done, runner); !running {
t.Fatalf("expected scrape to start")
}
select {
case <-done:
case <-time.After(2 * time.Second):
t.Fatalf("timed out waiting for scrape")
}
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
rr := httptest.NewRecorder()
exporter.Handler("/metrics").ServeHTTP(rr, req)
if !strings.Contains(rr.Body.String(), "ananke_gitops_scrape_success") {
t.Fatalf("expected GitOps scrape metric, got:\n%s", rr.Body.String())
}
if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, true, done); !running {
t.Fatalf("running scrape should stay marked running")
}
cfg.GitOps.Enabled = false
if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, nil, &lastRun, false, done); running {
t.Fatalf("disabled GitOps scrape should not start")
}
}
// TestRunGitOpsKubectlOutput runs one orchestration or CLI step.
// Signature: TestRunGitOpsKubectlOutput(t *testing.T).
// Why: command diagnostics should preserve stderr for operator-visible errors.
func TestRunGitOpsKubectlOutput(t *testing.T) {
out, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "printf ok")
if err != nil || string(out) != "ok" {
t.Fatalf("unexpected command success result: out=%q err=%v", out, err)
}
if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "echo bad >&2; exit 7"); err == nil || !strings.Contains(err.Error(), "bad") {
t.Fatalf("expected stderr to be preserved in error, got %v", err)
}
if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "exit 8"); err == nil {
t.Fatalf("expected empty-stderr command failure")
}
}
// TestGitOpsParseErrors runs one orchestration or CLI step.
// Signature: TestGitOpsParseErrors(t *testing.T).
// Why: invalid JSON should return parse errors for every Flux resource parser.
func TestGitOpsParseErrors(t *testing.T) {
if _, err := service.TestHookParseGitRepositories([]byte(`not-json`)); err == nil {
t.Fatalf("expected GitRepository parse error")
}
if _, err := service.TestHookParseKustomizations([]byte(`not-json`)); err == nil {
t.Fatalf("expected Kustomization parse error")
}
if _, err := service.TestHookParseHelmReleases([]byte(`not-json`)); err == nil {
t.Fatalf("expected HelmRelease parse error")
}
}
// TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter runs one
// orchestration or CLI step.
// Signature: TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T).
// Why: disabled or already-fresh scrapes should be cheap no-ops in the daemon
// loop.
func TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T) {
cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 60}}
done := make(chan struct{}, 1)
lastRun := time.Now()
if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, nil, nil, &lastRun, false, done); running {
t.Fatalf("nil exporter should not start scrape")
}
if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done); running {
t.Fatalf("fresh sample should not start another scrape")
}
runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
return []byte(`{"items":[]}`), nil
}
if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, true, done, runner); !running {
t.Fatalf("already-running scrape should remain marked running")
}
cfg.GitOps.Enabled = false
lastRun = time.Time{}
if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); running {
t.Fatalf("disabled scrape should not start through runner hook")
}
}
// TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger runs one
// orchestration or CLI step.
// Signature: TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T).
// Why: scrape failure handling should not depend on a logger being present.
func TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T) {
done := make(chan struct{}, 1)
lastRun := time.Time{}
cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}}
runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
return nil, errors.New("api down")
}
if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); !running {
t.Fatalf("expected scrape to start")
}
select {
case <-done:
case <-time.After(2 * time.Second):
t.Fatalf("timed out waiting for failed scrape")
}
}
// TestGitOpsHelpers runs one orchestration or CLI step.
// Signature: TestGitOpsHelpers(t *testing.T).
// Why: tiny label helpers carry dashboard readability contracts and are worth
// locking down.
func TestGitOpsHelpers(t *testing.T) {
if got := service.TestHookGitOpsDefaultString(" ", "fallback"); got != "fallback" {
t.Fatalf("unexpected defaultString fallback: %q", got)
}
if got := service.TestHookGitOpsFirstNonEmpty(" ", "second"); got != "second" {
t.Fatalf("unexpected firstNonEmpty result: %q", got)
}
if got := service.TestHookGitOpsFirstNonEmpty(" ", ""); got != "" {
t.Fatalf("unexpected empty firstNonEmpty result: %q", got)
}
if got := service.TestHookGitOpsErrorSummary(nil); got != "none" {
t.Fatalf("unexpected empty summary: %q", got)
}
ready, reason := service.TestHookFluxReadyMissing()
if ready || reason != "ReadyMissing" {
t.Fatalf("unexpected empty Ready condition parse: ready=%t reason=%s", ready, reason)
}
}