monitoring: export gitops state from ananke

2026-05-15 19:36:58 -03:00 · 2026-05-15 19:36:58 -03:00 · 087728d481
commit 087728d481
parent 3cc980795a
17 changed files with 1117 additions and 11 deletions
--- a/4
+++ b/4
@ -311,6 +311,10 @@ PY
 import re
 from pathlib import Path
 coverage_path = Path("build/coverage-percent.txt")
 if coverage_path.exists():
    print(coverage_path.read_text(encoding="utf-8", errors="ignore").strip() or "0.0")
    raise SystemExit(0)
 log_path = Path("build/quality-gate.out")
 text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
 values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
--- a/configs/ananke.example.yaml
+++ b/configs/ananke.example.yaml
@ -197,6 +197,9 @@ metrics:
  enabled: true
  bind_addr: 0.0.0.0:9560
  path: /metrics
 gitops:
  enabled: true
  poll_seconds: 60
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
--- a/configs/ananke.tethys.yaml
+++ b/configs/ananke.tethys.yaml
@ -329,6 +329,9 @@ metrics:
  enabled: true
  bind_addr: 0.0.0.0:9560
  path: /metrics
 gitops:
  enabled: true
  poll_seconds: 60
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
--- a/configs/ananke.titan-db.yaml
+++ b/configs/ananke.titan-db.yaml
@ -329,6 +329,9 @@ metrics:
  enabled: true
  bind_addr: 0.0.0.0:9560
  path: /metrics
 gitops:
  enabled: true
  poll_seconds: 60
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
--- a/internal/config/apply_defaults.go
+++ b/internal/config/apply_defaults.go
@ -270,6 +270,9 @@ func (c *Config) applyDefaults() {
 	if c.Metrics.Path == "" {
 		c.Metrics.Path = "/metrics"
 	}
 	if c.GitOps.PollSeconds <= 0 {
 		c.GitOps.PollSeconds = 60
 	}
 	if c.State.Dir == "" {
 		c.State.Dir = "/var/lib/ananke"
 	}
--- a/internal/config/defaults.go
+++ b/internal/config/defaults.go
@ -156,6 +156,10 @@ func defaults() Config {
 			BindAddr: "0.0.0.0:9560",
 			Path:     "/metrics",
 		},
 		GitOps: GitOps{
 			Enabled:     true,
 			PollSeconds: 60,
 		},
 		State: State{
 			Dir:            "/var/lib/ananke",
 			ReportsDir:     "/var/lib/ananke/reports",
--- a/internal/config/types.go
+++ b/internal/config/types.go
@ -23,6 +23,7 @@ type Config struct {
 	UPS                 UPS               `yaml:"ups"`
 	Coordination        Coordination      `yaml:"coordination"`
 	Metrics             Metrics           `yaml:"metrics"`
 	GitOps              GitOps            `yaml:"gitops"`
 	State               State             `yaml:"state"`
 }
@ -174,6 +175,11 @@ type Metrics struct {
 	Path     string `yaml:"path"`
 }
 type GitOps struct {
 	Enabled     bool `yaml:"enabled"`
 	PollSeconds int  `yaml:"poll_seconds"`
 }
 type State struct {
 	Dir            string `yaml:"dir"`
 	ReportsDir     string `yaml:"reports_dir"`
--- a/internal/metrics/exporter.go
+++ b/internal/metrics/exporter.go
@ -27,6 +27,50 @@ type Sample struct {
 	UpdatedAt     time.Time
 }
 type GitOpsSnapshot struct {
 	UpdatedAt      time.Time
 	ScrapeSuccess  bool
 	Errors         map[string]string
 	FluxSources    []GitOpsFluxSource
 	Kustomizations []GitOpsKustomization
 	HelmReleases   []GitOpsHelmRelease
 }
 type GitOpsFluxSource struct {
 	Namespace string
 	Name      string
 	URL       string
 	Branch    string
 	Revision  string
 	Ready     bool
 	Reason    string
 	Suspended bool
 }
 type GitOpsKustomization struct {
 	Namespace       string
 	Name            string
 	Path            string
 	SourceNamespace string
 	SourceName      string
 	Revision        string
 	Ready           bool
 	Reason          string
 	Suspended       bool
 }
 type GitOpsHelmRelease struct {
 	Namespace  string
 	Name       string
 	Chart      string
 	Version    string
 	AppVersion string
 	Revision   string
 	Ready      bool
 	Reason     string
 	Suspended  bool
 }
 type Exporter struct {
 	mu                 sync.RWMutex
 	shutdownBudgetSec  int
@ -34,6 +78,7 @@ type Exporter struct {
 	lastShutdownReason string
 	lastShutdownAt     time.Time
 	samples            map[string]Sample
 	gitops             GitOpsSnapshot
 }
 // New runs one orchestration or CLI step.
@ -66,6 +111,22 @@ func (e *Exporter) UpdateSample(s Sample) {
 	e.samples[s.Name] = s
 }
 // UpdateGitOpsSnapshot records the most recent Flux object-state scrape.
 // Signature: (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot).
 // Why: Grafana needs object readiness and branch/revision state, while Flux's
 // controller metrics only expose controller health in this cluster.
 func (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot) {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 	if snapshot.UpdatedAt.IsZero() {
 		snapshot.UpdatedAt = time.Now().UTC()
 	}
 	if snapshot.Errors == nil {
 		snapshot.Errors = map[string]string{}
 	}
 	e.gitops = snapshot
 }
 // MarkShutdown runs one orchestration or CLI step.
 // Signature: (e *Exporter) MarkShutdown(reason string).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -164,11 +225,99 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
 		}
 		b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
 	}
 	appendGitOpsMetrics(&b, e.gitops)
 	appendQualityGateMetrics(&b)
 	_, _ = w.Write([]byte(b.String()))
 }
 // appendGitOpsMetrics writes Flux object-state metrics collected by the
 // long-running daemon loop.
 // Signature: appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot).
 // Why: this keeps the expensive Kubernetes API reads out of the HTTP scrape path
 // while still making current GitOps health cheap for Grafana to query.
 func appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot) {
 	if dst.Len() > 0 {
 		dst.WriteString("\n")
 	}
 	dst.WriteString("# HELP ananke_gitops_last_scrape_timestamp_seconds Unix timestamp of the latest GitOps object-state scrape.\n")
 	dst.WriteString("# TYPE ananke_gitops_last_scrape_timestamp_seconds gauge\n")
 	if snapshot.UpdatedAt.IsZero() {
 		dst.WriteString("ananke_gitops_last_scrape_timestamp_seconds 0\n")
 	} else {
 		dst.WriteString(fmt.Sprintf("ananke_gitops_last_scrape_timestamp_seconds %d\n", snapshot.UpdatedAt.Unix()))
 	}
 	dst.WriteString("# HELP ananke_gitops_scrape_success Whether the latest GitOps object-state scrape completed without errors.\n")
 	dst.WriteString("# TYPE ananke_gitops_scrape_success gauge\n")
 	dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_success %d\n", boolNum(snapshot.ScrapeSuccess)))
 	dst.WriteString("# HELP ananke_gitops_scrape_error Whether a GitOps resource family failed during the latest scrape.\n")
 	dst.WriteString("# TYPE ananke_gitops_scrape_error gauge\n")
 	resources := []string{"gitrepository", "kustomization", "helmrelease"}
 	for _, resource := range resources {
 		_, failed := snapshot.Errors[resource]
 		dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_error{resource=%q} %d\n", resource, boolNum(failed)))
 	}
 	dst.WriteString("# HELP ananke_gitops_flux_source_info Current Flux GitRepository source metadata.\n")
 	dst.WriteString("# TYPE ananke_gitops_flux_source_info gauge\n")
 	dst.WriteString("# HELP ananke_gitops_flux_source_ready Whether a Flux GitRepository source is Ready.\n")
 	dst.WriteString("# TYPE ananke_gitops_flux_source_ready gauge\n")
 	dst.WriteString("# HELP ananke_gitops_flux_source_suspended Whether a Flux GitRepository source is suspended.\n")
 	dst.WriteString("# TYPE ananke_gitops_flux_source_suspended gauge\n")
 	sort.Slice(snapshot.FluxSources, func(i, j int) bool {
 		return snapshot.FluxSources[i].Namespace+"/"+snapshot.FluxSources[i].Name < snapshot.FluxSources[j].Namespace+"/"+snapshot.FluxSources[j].Name
 	})
 	for _, source := range snapshot.FluxSources {
 		infoLabels := fmt.Sprintf("{namespace=%q,name=%q,url=%q,branch=%q,revision=%q,ready=%q,reason=%q}",
 			safe(source.Namespace), safe(source.Name), safe(source.URL), safe(source.Branch),
 			safe(source.Revision), readyLabel(source.Ready), safe(defaultLabel(source.Reason, "unknown")))
 		baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(source.Namespace), safe(source.Name))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_info%s 1\n", infoLabels))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_ready%s %d\n", baseLabels, boolNum(source.Ready)))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_suspended%s %d\n", baseLabels, boolNum(source.Suspended)))
 	}
 	dst.WriteString("# HELP ananke_gitops_kustomization_info Current Flux Kustomization metadata.\n")
 	dst.WriteString("# TYPE ananke_gitops_kustomization_info gauge\n")
 	dst.WriteString("# HELP ananke_gitops_kustomization_ready Whether a Flux Kustomization is Ready.\n")
 	dst.WriteString("# TYPE ananke_gitops_kustomization_ready gauge\n")
 	dst.WriteString("# HELP ananke_gitops_kustomization_suspended Whether a Flux Kustomization is suspended.\n")
 	dst.WriteString("# TYPE ananke_gitops_kustomization_suspended gauge\n")
 	sort.Slice(snapshot.Kustomizations, func(i, j int) bool {
 		return snapshot.Kustomizations[i].Namespace+"/"+snapshot.Kustomizations[i].Name < snapshot.Kustomizations[j].Namespace+"/"+snapshot.Kustomizations[j].Name
 	})
 	for _, kustomization := range snapshot.Kustomizations {
 		infoLabels := fmt.Sprintf("{namespace=%q,name=%q,path=%q,source_namespace=%q,source_name=%q,revision=%q,ready=%q,reason=%q}",
 			safe(kustomization.Namespace), safe(kustomization.Name), safe(kustomization.Path),
 			safe(kustomization.SourceNamespace), safe(kustomization.SourceName), safe(kustomization.Revision),
 			readyLabel(kustomization.Ready), safe(defaultLabel(kustomization.Reason, "unknown")))
 		baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(kustomization.Namespace), safe(kustomization.Name))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_info%s 1\n", infoLabels))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_ready%s %d\n", baseLabels, boolNum(kustomization.Ready)))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_suspended%s %d\n", baseLabels, boolNum(kustomization.Suspended)))
 	}
 	dst.WriteString("# HELP ananke_gitops_helmrelease_info Current Flux HelmRelease metadata.\n")
 	dst.WriteString("# TYPE ananke_gitops_helmrelease_info gauge\n")
 	dst.WriteString("# HELP ananke_gitops_helmrelease_ready Whether a Flux HelmRelease is Ready.\n")
 	dst.WriteString("# TYPE ananke_gitops_helmrelease_ready gauge\n")
 	dst.WriteString("# HELP ananke_gitops_helmrelease_suspended Whether a Flux HelmRelease is suspended.\n")
 	dst.WriteString("# TYPE ananke_gitops_helmrelease_suspended gauge\n")
 	sort.Slice(snapshot.HelmReleases, func(i, j int) bool {
 		return snapshot.HelmReleases[i].Namespace+"/"+snapshot.HelmReleases[i].Name < snapshot.HelmReleases[j].Namespace+"/"+snapshot.HelmReleases[j].Name
 	})
 	for _, release := range snapshot.HelmReleases {
 		infoLabels := fmt.Sprintf("{namespace=%q,name=%q,chart=%q,version=%q,app_version=%q,revision=%q,ready=%q,reason=%q}",
 			safe(release.Namespace), safe(release.Name), safe(release.Chart), safe(release.Version),
 			safe(release.AppVersion), safe(release.Revision), readyLabel(release.Ready),
 			safe(defaultLabel(release.Reason, "unknown")))
 		baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(release.Namespace), safe(release.Name))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_info%s 1\n", infoLabels))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_ready%s %d\n", baseLabels, boolNum(release.Ready)))
 		dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_suspended%s %d\n", baseLabels, boolNum(release.Suspended)))
 	}
 }
 // appendQualityGateMetrics runs one orchestration or CLI step.
 // Signature: appendQualityGateMetrics(dst *strings.Builder).
 // Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
@ -228,3 +377,26 @@ func safe(in string) string {
 	out := strings.ReplaceAll(in, "\\", "\\\\")
 	return strings.ReplaceAll(out, "\"", "\\\"")
 }
 // readyLabel formats a boolean readiness state as a stable label value.
 // Signature: readyLabel(ready bool) string.
 // Why: info metrics need a human-readable status label without changing the
 // numeric ready gauges used for alerting and aggregation.
 func readyLabel(ready bool) string {
 	if ready {
 		return "true"
 	}
 	return "false"
 }
 // defaultLabel returns a safe fallback for empty metric label values.
 // Signature: defaultLabel(value string, fallback string) string.
 // Why: Flux status fields can be absent during startup, but Prometheus labels
 // should remain explicit rather than silently becoming empty strings.
 func defaultLabel(value string, fallback string) string {
 	value = strings.TrimSpace(value)
 	if value == "" {
 		return fallback
 	}
 	return value
 }
--- a/internal/service/daemon.go
+++ b/internal/service/daemon.go
@ -97,6 +97,9 @@ func (d *Daemon) Run(ctx context.Context) error {
 	onBatterySince := map[string]time.Time{}
 	breachCount := map[string]int{}
 	lastAutoHeal := time.Time{}
 	lastGitOpsPoll := time.Time{}
 	gitOpsPollRunning := false
 	gitOpsDone := make(chan struct{}, 1)
 	for _, t := range d.targets {
 		lastGood[t.Name] = time.Now()
 	}
@ -108,6 +111,8 @@ func (d *Daemon) Run(ctx context.Context) error {
 		select {
 		case <-ctx.Done():
 			return ctx.Err()
 		case <-gitOpsDone:
 			gitOpsPollRunning = false
 		case <-t.C:
 			budget := d.orch.EstimatedEmergencyShutdownSeconds()
 			threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
@ -201,6 +206,7 @@ func (d *Daemon) Run(ctx context.Context) error {
 			}
 			d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
 			gitOpsPollRunning = d.maybeStartGitOpsSnapshot(ctx, &lastGitOpsPoll, gitOpsPollRunning, gitOpsDone)
 		}
 	}
 }
--- a/internal/service/gitops_snapshot.go
+++ b/internal/service/gitops_snapshot.go
@ -0,0 +1,355 @@
 package service
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"os/exec"
 	"sort"
 	"strings"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 	"scm.bstein.dev/bstein/ananke/internal/metrics"
 )
 var gitOpsKubectlOutput = runGitOpsKubectlOutput
 type gitOpsResourceList struct {
 	Items []gitOpsResource `json:"items"`
 }
 type gitOpsResource struct {
 	Metadata struct {
 		Name      string `json:"name"`
 		Namespace string `json:"namespace"`
 	} `json:"metadata"`
 	Spec struct {
 		Suspend bool `json:"suspend"`
 		Ref     struct {
 			Branch string `json:"branch"`
 			Tag    string `json:"tag"`
 			SemVer string `json:"semver"`
 		} `json:"ref"`
 		URL       string `json:"url"`
 		Path      string `json:"path"`
 		SourceRef struct {
 			Name      string `json:"name"`
 			Namespace string `json:"namespace"`
 		} `json:"sourceRef"`
 		Chart struct {
 			Spec struct {
 				Chart   string `json:"chart"`
 				Version string `json:"version"`
 			} `json:"spec"`
 		} `json:"chart"`
 	} `json:"spec"`
 	Status struct {
 		Artifact struct {
 			Revision string `json:"revision"`
 		} `json:"artifact"`
 		LastAppliedRevision   string `json:"lastAppliedRevision"`
 		LastAttemptedRevision string `json:"lastAttemptedRevision"`
 		Conditions            []struct {
 			Type   string `json:"type"`
 			Status string `json:"status"`
 			Reason string `json:"reason"`
 		} `json:"conditions"`
 		History []struct {
 			ChartName    string `json:"chartName"`
 			ChartVersion string `json:"chartVersion"`
 			AppVersion   string `json:"appVersion"`
 			Digest       string `json:"digest"`
 		} `json:"history"`
 	} `json:"status"`
 }
 // maybeStartGitOpsSnapshot starts a bounded background scrape when the GitOps
 // sample interval has elapsed.
 // Signature: (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool.
 // Why: Kubernetes API reads must not block UPS polling, especially during
 // emergency power events when the daemon's first job is still safe shutdown.
 func (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool {
 	if !d.cfg.GitOps.Enabled || d.exporter == nil || running {
 		return running
 	}
 	interval := time.Duration(d.cfg.GitOps.PollSeconds) * time.Second
 	if interval <= 0 {
 		interval = 60 * time.Second
 	}
 	now := time.Now()
 	if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
 		return running
 	}
 	if lastRun != nil {
 		*lastRun = now
 	}
 	cfg := d.cfg
 	exporter := d.exporter
 	logger := d.log
 	go func() {
 		defer func() {
 			select {
 			case done <- struct{}{}:
 			default:
 			}
 		}()
 		scrapeCtx, cancel := context.WithTimeout(ctx, 25*time.Second)
 		defer cancel()
 		snapshot := collectGitOpsSnapshot(scrapeCtx, cfg)
 		exporter.UpdateGitOpsSnapshot(snapshot)
 		if !snapshot.ScrapeSuccess && logger != nil {
 			logger.Printf("warning: gitops metrics scrape partial: %s", gitOpsErrorSummary(snapshot.Errors))
 		}
 	}()
 	return true
 }
 // collectGitOpsSnapshot queries Flux custom resources and converts them into
 // Prometheus-safe metric state.
 // Signature: collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot.
 // Why: the Grafana overview needs current branch/readiness/suspend state, which
 // is object status rather than controller operational telemetry.
 func collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot {
 	snapshot := metrics.GitOpsSnapshot{
 		UpdatedAt:     time.Now().UTC(),
 		ScrapeSuccess: true,
 		Errors:        map[string]string{},
 	}
 	if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "gitrepositories.source.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
 		snapshot.Errors["gitrepository"] = err.Error()
 	} else if sources, err := parseGitRepositories(raw); err != nil {
 		snapshot.Errors["gitrepository"] = err.Error()
 	} else {
 		snapshot.FluxSources = sources
 	}
 	if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "kustomizations.kustomize.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
 		snapshot.Errors["kustomization"] = err.Error()
 	} else if kustomizations, err := parseKustomizations(raw); err != nil {
 		snapshot.Errors["kustomization"] = err.Error()
 	} else {
 		snapshot.Kustomizations = kustomizations
 	}
 	if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "helmreleases.helm.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
 		snapshot.Errors["helmrelease"] = err.Error()
 	} else if releases, err := parseHelmReleases(raw); err != nil {
 		snapshot.Errors["helmrelease"] = err.Error()
 	} else {
 		snapshot.HelmReleases = releases
 	}
 	snapshot.ScrapeSuccess = len(snapshot.Errors) == 0
 	return snapshot
 }
 // parseGitRepositories extracts Git source branch/revision/readiness from Flux
 // GitRepository JSON.
 // Signature: parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error).
 // Why: keeping status parsing isolated makes it testable without a live Flux
 // API and protects the daemon loop from brittle JSON traversal code.
 func parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) {
 	var list gitOpsResourceList
 	if err := json.Unmarshal(raw, &list); err != nil {
 		return nil, fmt.Errorf("decode gitrepositories: %w", err)
 	}
 	out := make([]metrics.GitOpsFluxSource, 0, len(list.Items))
 	for _, item := range list.Items {
 		ready, reason := fluxReady(item.Status.Conditions)
 		out = append(out, metrics.GitOpsFluxSource{
 			Namespace: defaultString(item.Metadata.Namespace, "default"),
 			Name:      item.Metadata.Name,
 			URL:       item.Spec.URL,
 			Branch:    sourceBranch(item),
 			Revision:  item.Status.Artifact.Revision,
 			Ready:     ready,
 			Reason:    reason,
 			Suspended: item.Spec.Suspend,
 		})
 	}
 	sort.Slice(out, func(i, j int) bool {
 		return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
 	})
 	return out, nil
 }
 // parseKustomizations extracts Kustomization health and source metadata from
 // Flux Kustomization JSON.
 // Signature: parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error).
 // Why: Kustomization status drives both the overview summary and the detailed
 // GitOps table, so the parser keeps the metric contract in one place.
 func parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) {
 	var list gitOpsResourceList
 	if err := json.Unmarshal(raw, &list); err != nil {
 		return nil, fmt.Errorf("decode kustomizations: %w", err)
 	}
 	out := make([]metrics.GitOpsKustomization, 0, len(list.Items))
 	for _, item := range list.Items {
 		ready, reason := fluxReady(item.Status.Conditions)
 		namespace := defaultString(item.Metadata.Namespace, "default")
 		out = append(out, metrics.GitOpsKustomization{
 			Namespace:       namespace,
 			Name:            item.Metadata.Name,
 			Path:            item.Spec.Path,
 			SourceNamespace: defaultString(item.Spec.SourceRef.Namespace, namespace),
 			SourceName:      item.Spec.SourceRef.Name,
 			Revision:        firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision),
 			Ready:           ready,
 			Reason:          reason,
 			Suspended:       item.Spec.Suspend,
 		})
 	}
 	sort.Slice(out, func(i, j int) bool {
 		return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
 	})
 	return out, nil
 }
 // parseHelmReleases extracts HelmRelease health and chart metadata from Flux
 // HelmRelease JSON.
 // Signature: parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error).
 // Why: HelmRelease status has different chart fields than Kustomizations, and
 // isolating it keeps dashboard metrics stable as Flux payloads evolve.
 func parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) {
 	var list gitOpsResourceList
 	if err := json.Unmarshal(raw, &list); err != nil {
 		return nil, fmt.Errorf("decode helmreleases: %w", err)
 	}
 	out := make([]metrics.GitOpsHelmRelease, 0, len(list.Items))
 	for _, item := range list.Items {
 		ready, reason := fluxReady(item.Status.Conditions)
 		chart := item.Spec.Chart.Spec.Chart
 		version := item.Spec.Chart.Spec.Version
 		appVersion := ""
 		revision := firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision)
 		if len(item.Status.History) > 0 {
 			latest := item.Status.History[0]
 			chart = firstNonEmpty(latest.ChartName, chart)
 			version = firstNonEmpty(latest.ChartVersion, version)
 			appVersion = latest.AppVersion
 			revision = firstNonEmpty(latest.Digest, revision)
 		}
 		out = append(out, metrics.GitOpsHelmRelease{
 			Namespace:  defaultString(item.Metadata.Namespace, "default"),
 			Name:       item.Metadata.Name,
 			Chart:      chart,
 			Version:    version,
 			AppVersion: appVersion,
 			Revision:   revision,
 			Ready:      ready,
 			Reason:     reason,
 			Suspended:  item.Spec.Suspend,
 		})
 	}
 	sort.Slice(out, func(i, j int) bool {
 		return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
 	})
 	return out, nil
 }
 // gitOpsKubectlJSON runs kubectl with the configured kubeconfig and a short
 // request timeout.
 // Signature: gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error).
 // Why: every GitOps scrape should fail quickly instead of stealing time from
 // UPS safety polling during outage recovery.
 func gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error) {
 	finalArgs := []string{}
 	if strings.TrimSpace(cfg.Kubeconfig) != "" {
 		finalArgs = append(finalArgs, "--kubeconfig", cfg.Kubeconfig)
 	}
 	finalArgs = append(finalArgs, "--request-timeout=8s")
 	finalArgs = append(finalArgs, args...)
 	return gitOpsKubectlOutput(ctx, "kubectl", finalArgs...)
 }
 // runGitOpsKubectlOutput is the production command runner for GitOps snapshot
 // collection.
 // Signature: runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error).
 // Why: preserving stderr in errors makes live diagnosis possible when Flux CRD
 // reads fail on a recovering control plane.
 func runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) {
 	cmd := exec.CommandContext(ctx, name, args...)
 	out, err := cmd.CombinedOutput()
 	if err != nil {
 		trimmed := strings.TrimSpace(string(out))
 		if trimmed == "" {
 			return nil, err
 		}
 		return nil, fmt.Errorf("%w: %s", err, trimmed)
 	}
 	return out, nil
 }
 // fluxReady resolves a Flux Ready condition into a boolean and reason.
 // Signature: fluxReady(conditions []struct { Type string; Status string; Reason string }) (bool, string).
 // Why: Flux resources all use Ready conditions, so one helper keeps status
 // interpretation consistent across GitRepository, Kustomization, and HelmRelease.
 func fluxReady(conditions []struct {
 	Type   string `json:"type"`
 	Status string `json:"status"`
 	Reason string `json:"reason"`
 }) (bool, string) {
 	for _, condition := range conditions {
 		if condition.Type == "Ready" {
 			return strings.EqualFold(condition.Status, "True"), defaultString(condition.Reason, "ReadyUnknown")
 		}
 	}
 	return false, "ReadyMissing"
 }
 // sourceBranch returns the most useful source reference label for a
 // GitRepository.
 // Signature: sourceBranch(item gitOpsResource) string.
 // Why: branch is preferred for Atlas, but tags and semver refs should still
 // render clearly if Flux is ever pinned differently.
 func sourceBranch(item gitOpsResource) string {
 	return firstNonEmpty(item.Spec.Ref.Branch, item.Spec.Ref.Tag, item.Spec.Ref.SemVer, "unknown")
 }
 // defaultString trims a value and returns a fallback when it is empty.
 // Signature: defaultString(value string, fallback string) string.
 // Why: Flux omits some optional fields; dashboards are easier to read with
 // explicit fallback labels.
 func defaultString(value string, fallback string) string {
 	value = strings.TrimSpace(value)
 	if value == "" {
 		return fallback
 	}
 	return value
 }
 // firstNonEmpty returns the first non-empty trimmed value.
 // Signature: firstNonEmpty(values ...string) string.
 // Why: Flux has several revision fields depending on resource kind and
 // reconciliation stage, so callers can express preferred fallback order.
 func firstNonEmpty(values ...string) string {
 	for _, value := range values {
 		value = strings.TrimSpace(value)
 		if value != "" {
 			return value
 		}
 	}
 	return ""
 }
 // gitOpsErrorSummary renders resource-family scrape errors for daemon logs.
 // Signature: gitOpsErrorSummary(errors map[string]string) string.
 // Why: a compact sorted summary keeps recurring scrape warnings readable in
 // journald during partial cluster recovery.
 func gitOpsErrorSummary(errors map[string]string) string {
 	if len(errors) == 0 {
 		return "none"
 	}
 	keys := make([]string, 0, len(errors))
 	for key := range errors {
 		keys = append(keys, key)
 	}
 	sort.Strings(keys)
 	parts := make([]string, 0, len(keys))
 	for _, key := range keys {
 		parts = append(parts, key+"="+errors[key])
 	}
 	return strings.Join(parts, "; ")
 }
--- a/internal/service/testing_hooks_gitops.go
+++ b/internal/service/testing_hooks_gitops.go
@ -0,0 +1,127 @@
 package service
 import (
 	"context"
 	"log"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 	"scm.bstein.dev/bstein/ananke/internal/metrics"
 )
 type TestHookGitOpsRunner func(context.Context, string, ...string) ([]byte, error)
 // TestHookParseGitRepositories exposes GitRepository parsing to the top-level
 // testing module.
 // Signature: TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error).
 // Why: Ananke keeps split-module tests outside internal packages, but the
 // parser needs direct contract coverage without a live cluster.
 func TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) {
 	return parseGitRepositories(raw)
 }
 // TestHookParseKustomizations exposes Kustomization parsing to the top-level
 // testing module.
 // Signature: TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error).
 // Why: dashboard-facing labels should be verified from representative Flux JSON
 // without relying on the current production cluster shape.
 func TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) {
 	return parseKustomizations(raw)
 }
 // TestHookParseHelmReleases exposes HelmRelease parsing to the top-level
 // testing module.
 // Signature: TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error).
 // Why: Helm status fields vary by reconciliation phase, and split-module tests
 // need a stable seam for parser coverage.
 func TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) {
 	return parseHelmReleases(raw)
 }
 // TestHookCollectGitOpsSnapshotWithRunner runs snapshot collection with an
 // injected kubectl runner.
 // Signature: TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot.
 // Why: this covers success and partial-failure collection paths without shelling
 // out to kubectl from unit tests.
 func TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot {
 	original := gitOpsKubectlOutput
 	defer func() { gitOpsKubectlOutput = original }()
 	gitOpsKubectlOutput = runner
 	return collectGitOpsSnapshot(ctx, cfg)
 }
 // TestHookMaybeStartGitOpsSnapshot starts the daemon's background GitOps scrape
 // from split-module tests.
 // Signature: TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool.
 // Why: the daemon intentionally keeps scraping asynchronous so GitOps telemetry
 // cannot delay UPS polling; the behavior needs direct coverage.
 func TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool {
 	d := &Daemon{cfg: cfg, exporter: exporter, log: logger}
 	return d.maybeStartGitOpsSnapshot(ctx, lastRun, running, done)
 }
 // TestHookMaybeStartGitOpsSnapshotWithRunner starts a background GitOps scrape
 // with an injected runner and restores the production runner after completion.
 // Signature: TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool.
 // Why: the scrape is asynchronous, so split-module tests need a seam that keeps
 // fake kubectl behavior installed until the goroutine exits.
 func TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool {
 	original := gitOpsKubectlOutput
 	gitOpsKubectlOutput = runner
 	proxyDone := make(chan struct{}, 1)
 	d := &Daemon{cfg: cfg, exporter: exporter, log: logger}
 	started := d.maybeStartGitOpsSnapshot(ctx, lastRun, running, proxyDone)
 	if !started || running {
 		gitOpsKubectlOutput = original
 		return started
 	}
 	go func() {
 		<-proxyDone
 		gitOpsKubectlOutput = original
 		select {
 		case done <- struct{}{}:
 		default:
 		}
 	}()
 	return started
 }
 // TestHookRunGitOpsKubectlOutput exposes the production command runner to
 // split-module tests.
 // Signature: TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error).
 // Why: stderr preservation and empty-stderr failure behavior are part of the
 // operator-facing diagnostics contract.
 func TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) {
 	return runGitOpsKubectlOutput(ctx, name, args...)
 }
 // TestHookGitOpsDefaultString exposes defaultString to split-module tests.
 // Signature: TestHookGitOpsDefaultString(value string, fallback string) string.
 // Why: fallback label behavior is intentionally tiny but important for readable
 // Grafana tables during startup.
 func TestHookGitOpsDefaultString(value string, fallback string) string {
 	return defaultString(value, fallback)
 }
 // TestHookGitOpsFirstNonEmpty exposes firstNonEmpty to split-module tests.
 // Signature: TestHookGitOpsFirstNonEmpty(values ...string) string.
 // Why: revision fallback order should remain deterministic as Flux status
 // payloads change.
 func TestHookGitOpsFirstNonEmpty(values ...string) string {
 	return firstNonEmpty(values...)
 }
 // TestHookGitOpsErrorSummary exposes gitOpsErrorSummary to split-module tests.
 // Signature: TestHookGitOpsErrorSummary(errors map[string]string) string.
 // Why: sorted scrape warnings are easier to inspect in journald during recovery.
 func TestHookGitOpsErrorSummary(errors map[string]string) string {
 	return gitOpsErrorSummary(errors)
 }
 // TestHookFluxReadyMissing exposes the missing Ready-condition branch to tests.
 // Signature: TestHookFluxReadyMissing() (bool, string).
 // Why: absent Ready conditions should be treated as not-ready and labelled
 // explicitly instead of being mistaken for healthy.
 func TestHookFluxReadyMissing() (bool, string) {
 	return fluxReady(nil)
 }
--- a/scripts/publish_quality_metrics.py
+++ b/scripts/publish_quality_metrics.py
@ -172,6 +172,14 @@ def _read_coverage_percent(path: str) -> float:
        return 0.0
 def _resolve_repo_path(repo_root: Path, path: str) -> Path:
    """Resolve quality-gate artifact paths relative to the repository root."""
    candidate = Path(path)
    if candidate.is_absolute():
        return candidate
    return repo_root / candidate
 def _iter_source_files(repo_root: Path):
    for rel_root in SOURCE_SCAN_ROOTS:
        base = repo_root / rel_root
@ -367,7 +375,9 @@ def main(argv: list[str] | None = None) -> int:
    args = parse_args(argv or sys.argv[1:])
    repo_root = Path(__file__).resolve().parents[1]
    build_dir = repo_root / "build"
-    gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
+    gate_rc = _read_exit_code(
        _resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc")))
    )
    current_ok = 1 if gate_rc == 0 else 0
    current_failed = 0 if gate_rc == 0 else 1
@ -420,13 +430,18 @@ def main(argv: list[str] | None = None) -> int:
    elif not already_recorded:
        resolved_ok += current_ok
        resolved_failed += current_failed
-    coverage_percent = _read_coverage_percent(args.coverage_percent_file)
+    coverage_percent = _read_coverage_percent(str(_resolve_repo_path(repo_root, args.coverage_percent_file)))
    source_files_total = _count_source_files(repo_root)
    source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
-    quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
+    quality_output = _resolve_repo_path(
        repo_root,
        os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")),
    )
    tests = _parse_go_test_counts(quality_output)
    test_cases = _parse_go_test_cases(quality_output)
-    docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
+    docs_status = _read_status(
        _resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status")))
    )
    unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
    checks = {
        "tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
--- a/scripts/publish_quality_metrics_test.py
+++ b/scripts/publish_quality_metrics_test.py
@ -61,6 +61,18 @@ class PublishQualityMetricsTest(unittest.TestCase):
        self.server.server_close()
        self.thread.join(timeout=5)
    def test_relative_artifact_paths_are_repo_rooted(self) -> None:
        repo_root = Path("/tmp/ananke-repo")
        self.assertEqual(
            publisher._resolve_repo_path(repo_root, "build/coverage-percent.txt"),
            repo_root / "build" / "coverage-percent.txt",
        )
        self.assertEqual(
            publisher._resolve_repo_path(repo_root, "/tmp/coverage-percent.txt"),
            Path("/tmp/coverage-percent.txt"),
        )
    def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
--- a/scripts/quality_gate.sh
+++ b/scripts/quality_gate.sh
@ -56,12 +56,13 @@ read_quality_counter() {
 write_quality_metrics() {
  local exit_code="$1"
-  local metrics_dir state_dir
+  local metrics_dir state_dir write_metrics
  metrics_dir="$(dirname "${QUALITY_METRICS_FILE}")"
  state_dir="$(dirname "${QUALITY_STATE_FILE}")"
  write_metrics="${QUALITY_METRICS_ENABLED}"
  mkdir -p "${state_dir}" >/dev/null 2>&1 || return 0
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
-    mkdir -p "${metrics_dir}" >/dev/null 2>&1 || return 0
+    mkdir -p "${metrics_dir}" >/dev/null 2>&1 || write_metrics=0
  fi
  local ok failed total last_success now success_percent
@ -84,12 +85,12 @@ write_quality_metrics() {
  QUALITY_SUCCESS_PERCENT="${success_percent}"
  local tmp_metrics="" tmp_state
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
    tmp_metrics="$(mktemp "${metrics_dir}/quality-gate.prom.XXXXXX")"
  fi
  tmp_state="$(mktemp "${state_dir}/quality-gate.state.XXXXXX")"
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
    cat > "${tmp_metrics}" <<EOF
 # HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.
 # TYPE ananke_quality_gate_runs_total counter
@ -115,7 +116,7 @@ last_run=${now}
 EOF
  mv -f "${tmp_state}" "${QUALITY_STATE_FILE}"
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
    mv -f "${tmp_metrics}" "${QUALITY_METRICS_FILE}"
  fi
 }
@ -146,6 +147,8 @@ publish_quality_metrics() {
 quality_gate_finalize() {
  local exit_code="$1"
  set +e
  printf '%s\n' "${exit_code}" > "${BUILD_DIR}/quality-gate.rc" 2>/dev/null || true
  cd "${REPO_DIR}" 2>/dev/null || true
  write_quality_metrics "${exit_code}" || true
  publish_quality_metrics || true
  exit "${exit_code}"
@ -193,4 +196,7 @@ printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
 echo "[quality] per-file coverage gate (95%)"
 cd testing
-ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
+ANANKE_ENFORCE_COVERAGE=1 \
  ANANKE_PER_FILE_COVERAGE_TARGET=95 \
  ANANKE_PER_FILE_COVERAGE_PERCENT_FILE="${COVERAGE_PERCENT_FILE}" \
  go test ./coverage -run TestPerFileCoverageReport -count=1 -v
--- a/testing/coverage/coverage_test.go
+++ b/testing/coverage/coverage_test.go
@ -121,6 +121,7 @@ func TestPerFileCoverageReport(t *testing.T) {
 		"./metrics",
 		"./hygiene",
 		"./orchestrator",
 		"./service",
 		"./state",
 		"./sshutil",
 		"./ups",
@ -167,17 +168,26 @@ func TestPerFileCoverageReport(t *testing.T) {
 	var report bytes.Buffer
 	report.WriteString("per-file coverage\n")
 	under := make([]string, 0)
 	minPercent := 100.0
 	for _, file := range keys {
 		fc := byFile[file]
 		pct := 100.0
 		if fc.total > 0 {
 			pct = (fc.covered / fc.total) * 100.0
 		}
 		if pct < minPercent {
 			minPercent = pct
 		}
 		report.WriteString(fmt.Sprintf("- %s: %.1f%%\n", file, pct))
 		if pct < target {
 			under = append(under, fmt.Sprintf("%s (%.1f%% < %.1f%%)", file, pct, target))
 		}
 	}
 	if outputPath := strings.TrimSpace(os.Getenv("ANANKE_PER_FILE_COVERAGE_PERCENT_FILE")); outputPath != "" {
 		if err := os.WriteFile(outputPath, []byte(fmt.Sprintf("%.1f\n", minPercent)), 0o644); err != nil {
 			t.Fatalf("write per-file coverage percent: %v", err)
 		}
 	}
 	t.Log(report.String())
 	if len(under) > 0 && enforce {
--- a/testing/metrics/exporter_http_contract_test.go
+++ b/testing/metrics/exporter_http_contract_test.go
@ -104,3 +104,64 @@ func TestExporterHelperContracts(t *testing.T) {
 		t.Fatalf("unexpected escaped string: %q", got)
 	}
 }
 // TestExporterEmitsGitOpsMetrics runs one orchestration or CLI step.
 // Signature: TestExporterEmitsGitOpsMetrics(t *testing.T).
 // Why: the overview dashboard depends on Ananke-owned Flux object-state metrics
 // rather than the narrower controller metrics exposed by Flux itself.
 func TestExporterEmitsGitOpsMetrics(t *testing.T) {
 	exporter := metrics.New()
 	exporter.UpdateGitOpsSnapshot(metrics.GitOpsSnapshot{
 		UpdatedAt:     time.Unix(1710000100, 0).UTC(),
 		ScrapeSuccess: true,
 		FluxSources: []metrics.GitOpsFluxSource{{
 			Namespace: "flux-system",
 			Name:      "flux-system",
 			URL:       "ssh://git@example/repo.git",
 			Branch:    "main",
 			Revision:  "main@sha1:abc123",
 			Ready:     true,
 			Reason:    "Succeeded",
 		}},
 		Kustomizations: []metrics.GitOpsKustomization{{
 			Namespace:       "flux-system",
 			Name:            "monitoring",
 			Path:            "./services/monitoring",
 			SourceNamespace: "flux-system",
 			SourceName:      "flux-system",
 			Revision:        "main@sha1:abc123",
 			Ready:           true,
 			Reason:          "ReconciliationSucceeded",
 		}},
 		HelmReleases: []metrics.GitOpsHelmRelease{{
 			Namespace:  "monitoring",
 			Name:       "grafana",
 			Chart:      "grafana",
 			Version:    "8.5.0",
 			AppVersion: "11.4.0",
 			Revision:   "sha256:abc123",
 			Ready:      false,
 			Reason:     "Progressing",
 			Suspended:  true,
 		}},
 	})
 	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
 	rr := httptest.NewRecorder()
 	exporter.Handler("/metrics").ServeHTTP(rr, req)
 	body := rr.Body.String()
 	mustContain := []string{
 		"ananke_gitops_last_scrape_timestamp_seconds 1710000100",
 		"ananke_gitops_scrape_success 1",
 		`ananke_gitops_flux_source_info{namespace="flux-system",name="flux-system",url="ssh://git@example/repo.git",branch="main",revision="main@sha1:abc123",ready="true",reason="Succeeded"} 1`,
 		`ananke_gitops_kustomization_ready{namespace="flux-system",name="monitoring"} 1`,
 		`ananke_gitops_helmrelease_info{namespace="monitoring",name="grafana",chart="grafana",version="8.5.0",app_version="11.4.0",revision="sha256:abc123",ready="false",reason="Progressing"} 1`,
 		`ananke_gitops_helmrelease_suspended{namespace="monitoring",name="grafana"} 1`,
 	}
 	for _, fragment := range mustContain {
 		if !strings.Contains(body, fragment) {
 			t.Fatalf("missing GitOps metric fragment %q in output:\n%s", fragment, body)
 		}
 	}
 }
--- a/testing/service/gitops_snapshot_test.go
+++ b/testing/service/gitops_snapshot_test.go
@ -0,0 +1,316 @@
 package servicequality
 import (
 	"context"
 	"errors"
 	"io"
 	"log"
 	"net/http"
 	"net/http/httptest"
 	"strings"
 	"testing"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 	"scm.bstein.dev/bstein/ananke/internal/metrics"
 	"scm.bstein.dev/bstein/ananke/internal/service"
 )
 // TestParseGitRepositories runs one orchestration or CLI step.
 // Signature: TestParseGitRepositories(t *testing.T).
 // Why: branch, revision, and Ready status are the labels the Grafana GitOps
 // panels rely on for the current Flux source.
 func TestParseGitRepositories(t *testing.T) {
 	raw := []byte(`{
 	  "items": [{
 	    "metadata": {"name": "flux-system", "namespace": "flux-system"},
 	    "spec": {"url": "ssh://git@example/repo.git", "ref": {"branch": "main"}},
 	    "status": {
 	      "artifact": {"revision": "main@sha1:abc123"},
 	      "conditions": [{"type": "Ready", "status": "True", "reason": "Succeeded"}]
 	    }
 	  }]
 	}`)
 	got, err := service.TestHookParseGitRepositories(raw)
 	if err != nil {
 		t.Fatalf("parseGitRepositories failed: %v", err)
 	}
 	if len(got) != 1 || got[0].Branch != "main" || got[0].Revision != "main@sha1:abc123" || !got[0].Ready {
 		t.Fatalf("unexpected GitRepository parse result: %+v", got)
 	}
 }
 // TestParseKustomizations runs one orchestration or CLI step.
 // Signature: TestParseKustomizations(t *testing.T).
 // Why: Kustomization source and suspend labels power the detailed GitOps table.
 func TestParseKustomizations(t *testing.T) {
 	raw := []byte(`{
 	  "items": [{
 	    "metadata": {"name": "monitoring", "namespace": "flux-system"},
 	    "spec": {
 	      "path": "./services/monitoring",
 	      "sourceRef": {"name": "flux-system"},
 	      "suspend": true
 	    },
 	    "status": {
 	      "lastAppliedRevision": "main@sha1:def456",
 	      "conditions": [{"type": "Ready", "status": "False", "reason": "DependencyNotReady"}]
 	    }
 	  }]
 	}`)
 	got, err := service.TestHookParseKustomizations(raw)
 	if err != nil {
 		t.Fatalf("parseKustomizations failed: %v", err)
 	}
 	if len(got) != 1 || got[0].SourceNamespace != "flux-system" || got[0].Ready || !got[0].Suspended {
 		t.Fatalf("unexpected Kustomization parse result: %+v", got)
 	}
 }
 // TestParseHelmReleases runs one orchestration or CLI step.
 // Signature: TestParseHelmReleases(t *testing.T).
 // Why: HelmRelease chart fields have separate status fallbacks from plain Flux
 // Kustomizations and need their own contract coverage.
 func TestParseHelmReleases(t *testing.T) {
 	raw := []byte(`{
 	  "items": [{
 	    "metadata": {"name": "grafana", "namespace": "monitoring"},
 	    "spec": {
 	      "chart": {"spec": {"chart": "grafana", "version": "~8.5.0"}}
 	    },
 	    "status": {
 	      "lastAttemptedRevision": "8.5.8",
 	      "history": [{"chartName": "grafana", "chartVersion": "8.5.8", "appVersion": "11.4.0", "digest": "sha256:abc"}],
 	      "conditions": [{"type": "Ready", "status": "True", "reason": "InstallSucceeded"}]
 	    }
 	  }]
 	}`)
 	got, err := service.TestHookParseHelmReleases(raw)
 	if err != nil {
 		t.Fatalf("parseHelmReleases failed: %v", err)
 	}
 	if len(got) != 1 || got[0].Chart != "grafana" || got[0].Version != "8.5.8" || got[0].Revision != "sha256:abc" || !got[0].Ready {
 		t.Fatalf("unexpected HelmRelease parse result: %+v", got)
 	}
 }
 // TestCollectGitOpsSnapshotPartialFailure runs one orchestration or CLI step.
 // Signature: TestCollectGitOpsSnapshotPartialFailure(t *testing.T).
 // Why: a recovering cluster may serve some Flux resource families before
 // others, and Ananke should publish a partial snapshot instead of failing closed.
 func TestCollectGitOpsSnapshotPartialFailure(t *testing.T) {
 	runner := func(_ context.Context, _ string, args ...string) ([]byte, error) {
 		joined := strings.Join(args, " ")
 		switch {
 		case strings.Contains(joined, "gitrepositories"):
 			return []byte(`{"items":[]}`), nil
 		case strings.Contains(joined, "kustomizations"):
 			return nil, errors.New("api unavailable")
 		case strings.Contains(joined, "helmreleases"):
 			return []byte(`{"items":[]}`), nil
 		default:
 			return nil, errors.New("unexpected command")
 		}
 	}
 	snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{Kubeconfig: "/tmp/kubeconfig"}, runner)
 	if snapshot.ScrapeSuccess {
 		t.Fatalf("expected partial failure")
 	}
 	if snapshot.Errors["kustomization"] == "" {
 		t.Fatalf("expected kustomization error, got %+v", snapshot.Errors)
 	}
 	if summary := service.TestHookGitOpsErrorSummary(snapshot.Errors); !strings.Contains(summary, "kustomization=api unavailable") {
 		t.Fatalf("unexpected error summary: %s", summary)
 	}
 }
 // TestCollectGitOpsSnapshotParseFailures runs one orchestration or CLI step.
 // Signature: TestCollectGitOpsSnapshotParseFailures(t *testing.T).
 // Why: malformed API payloads should be surfaced per resource family so the
 // exporter can distinguish data errors from transport failures.
 func TestCollectGitOpsSnapshotParseFailures(t *testing.T) {
 	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
 		return []byte(`not-json`), nil
 	}
 	snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner)
 	if snapshot.ScrapeSuccess {
 		t.Fatalf("expected parse failures")
 	}
 	for _, key := range []string{"gitrepository", "kustomization", "helmrelease"} {
 		if snapshot.Errors[key] == "" {
 			t.Fatalf("expected %s parse error, got %+v", key, snapshot.Errors)
 		}
 	}
 }
 // TestCollectGitOpsSnapshotSuccess runs one orchestration or CLI step.
 // Signature: TestCollectGitOpsSnapshotSuccess(t *testing.T).
 // Why: successful collection should preserve non-branch Git refs as the source
 // label rather than hiding them as unknown.
 func TestCollectGitOpsSnapshotSuccess(t *testing.T) {
 	runner := func(_ context.Context, _ string, args ...string) ([]byte, error) {
 		joined := strings.Join(args, " ")
 		switch {
 		case strings.Contains(joined, "gitrepositories"):
 			return []byte(`{"items":[{"metadata":{"name":"flux-system","namespace":"flux-system"},"spec":{"ref":{"tag":"v1.0.0"}}}]}`), nil
 		case strings.Contains(joined, "kustomizations"):
 			return []byte(`{"items":[]}`), nil
 		case strings.Contains(joined, "helmreleases"):
 			return []byte(`{"items":[]}`), nil
 		default:
 			return nil, errors.New("unexpected command")
 		}
 	}
 	snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner)
 	if !snapshot.ScrapeSuccess || len(snapshot.FluxSources) != 1 || snapshot.FluxSources[0].Branch != "v1.0.0" {
 		t.Fatalf("unexpected snapshot: %+v", snapshot)
 	}
 }
 // TestMaybeStartGitOpsSnapshot runs one orchestration or CLI step.
 // Signature: TestMaybeStartGitOpsSnapshot(t *testing.T).
 // Why: Ananke must collect GitOps status asynchronously so a slow Kubernetes API
 // cannot delay UPS polling.
 func TestMaybeStartGitOpsSnapshot(t *testing.T) {
 	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
 		return []byte(`{"items":[]}`), nil
 	}
 	exporter := metrics.New()
 	done := make(chan struct{}, 1)
 	lastRun := time.Time{}
 	cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}}
 	withRunner := func() bool {
 		return service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner).ScrapeSuccess
 	}
 	if !withRunner() {
 		t.Fatalf("expected injected runner sanity check to pass")
 	}
 	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, false, done, runner); !running {
 		t.Fatalf("expected scrape to start")
 	}
 	select {
 	case <-done:
 	case <-time.After(2 * time.Second):
 		t.Fatalf("timed out waiting for scrape")
 	}
 	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
 	rr := httptest.NewRecorder()
 	exporter.Handler("/metrics").ServeHTTP(rr, req)
 	if !strings.Contains(rr.Body.String(), "ananke_gitops_scrape_success") {
 		t.Fatalf("expected GitOps scrape metric, got:\n%s", rr.Body.String())
 	}
 	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, true, done); !running {
 		t.Fatalf("running scrape should stay marked running")
 	}
 	cfg.GitOps.Enabled = false
 	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, nil, &lastRun, false, done); running {
 		t.Fatalf("disabled GitOps scrape should not start")
 	}
 }
 // TestRunGitOpsKubectlOutput runs one orchestration or CLI step.
 // Signature: TestRunGitOpsKubectlOutput(t *testing.T).
 // Why: command diagnostics should preserve stderr for operator-visible errors.
 func TestRunGitOpsKubectlOutput(t *testing.T) {
 	out, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "printf ok")
 	if err != nil || string(out) != "ok" {
 		t.Fatalf("unexpected command success result: out=%q err=%v", out, err)
 	}
 	if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "echo bad >&2; exit 7"); err == nil || !strings.Contains(err.Error(), "bad") {
 		t.Fatalf("expected stderr to be preserved in error, got %v", err)
 	}
 	if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "exit 8"); err == nil {
 		t.Fatalf("expected empty-stderr command failure")
 	}
 }
 // TestGitOpsParseErrors runs one orchestration or CLI step.
 // Signature: TestGitOpsParseErrors(t *testing.T).
 // Why: invalid JSON should return parse errors for every Flux resource parser.
 func TestGitOpsParseErrors(t *testing.T) {
 	if _, err := service.TestHookParseGitRepositories([]byte(`not-json`)); err == nil {
 		t.Fatalf("expected GitRepository parse error")
 	}
 	if _, err := service.TestHookParseKustomizations([]byte(`not-json`)); err == nil {
 		t.Fatalf("expected Kustomization parse error")
 	}
 	if _, err := service.TestHookParseHelmReleases([]byte(`not-json`)); err == nil {
 		t.Fatalf("expected HelmRelease parse error")
 	}
 }
 // TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter runs one
 // orchestration or CLI step.
 // Signature: TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T).
 // Why: disabled or already-fresh scrapes should be cheap no-ops in the daemon
 // loop.
 func TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T) {
 	cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 60}}
 	done := make(chan struct{}, 1)
 	lastRun := time.Now()
 	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, nil, nil, &lastRun, false, done); running {
 		t.Fatalf("nil exporter should not start scrape")
 	}
 	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done); running {
 		t.Fatalf("fresh sample should not start another scrape")
 	}
 	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
 		return []byte(`{"items":[]}`), nil
 	}
 	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, true, done, runner); !running {
 		t.Fatalf("already-running scrape should remain marked running")
 	}
 	cfg.GitOps.Enabled = false
 	lastRun = time.Time{}
 	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); running {
 		t.Fatalf("disabled scrape should not start through runner hook")
 	}
 }
 // TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger runs one
 // orchestration or CLI step.
 // Signature: TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T).
 // Why: scrape failure handling should not depend on a logger being present.
 func TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T) {
 	done := make(chan struct{}, 1)
 	lastRun := time.Time{}
 	cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}}
 	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
 		return nil, errors.New("api down")
 	}
 	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); !running {
 		t.Fatalf("expected scrape to start")
 	}
 	select {
 	case <-done:
 	case <-time.After(2 * time.Second):
 		t.Fatalf("timed out waiting for failed scrape")
 	}
 }
 // TestGitOpsHelpers runs one orchestration or CLI step.
 // Signature: TestGitOpsHelpers(t *testing.T).
 // Why: tiny label helpers carry dashboard readability contracts and are worth
 // locking down.
 func TestGitOpsHelpers(t *testing.T) {
 	if got := service.TestHookGitOpsDefaultString("  ", "fallback"); got != "fallback" {
 		t.Fatalf("unexpected defaultString fallback: %q", got)
 	}
 	if got := service.TestHookGitOpsFirstNonEmpty(" ", "second"); got != "second" {
 		t.Fatalf("unexpected firstNonEmpty result: %q", got)
 	}
 	if got := service.TestHookGitOpsFirstNonEmpty(" ", ""); got != "" {
 		t.Fatalf("unexpected empty firstNonEmpty result: %q", got)
 	}
 	if got := service.TestHookGitOpsErrorSummary(nil); got != "none" {
 		t.Fatalf("unexpected empty summary: %q", got)
 	}
 	ready, reason := service.TestHookFluxReadyMissing()
 	if ready || reason != "ReadyMissing" {
 		t.Fatalf("unexpected empty Ready condition parse: ready=%t reason=%s", ready, reason)
 	}
 }