monitoring: export gitops state from ananke

2026-05-15 19:36:58 -03:00 · 2026-05-15 19:36:58 -03:00 · 087728d481
commit 087728d481
parent 3cc980795a
17 changed files with 1117 additions and 11 deletions
--- a/4
+++ b/4
@ -311,6 +311,10 @@ PY
 import re
 from pathlib import Path

+coverage_path = Path("build/coverage-percent.txt")
+if coverage_path.exists():
+    print(coverage_path.read_text(encoding="utf-8", errors="ignore").strip() or "0.0")
+    raise SystemExit(0)
 log_path = Path("build/quality-gate.out")
 text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
 values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
--- a/configs/ananke.example.yaml
+++ b/configs/ananke.example.yaml
@ -197,6 +197,9 @@ metrics:
  enabled: true
  bind_addr: 0.0.0.0:9560
  path: /metrics
+gitops:
+  enabled: true
+  poll_seconds: 60
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
--- a/configs/ananke.tethys.yaml
+++ b/configs/ananke.tethys.yaml
@ -329,6 +329,9 @@ metrics:
  enabled: true
  bind_addr: 0.0.0.0:9560
  path: /metrics
+gitops:
+  enabled: true
+  poll_seconds: 60
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
--- a/configs/ananke.titan-db.yaml
+++ b/configs/ananke.titan-db.yaml
@ -329,6 +329,9 @@ metrics:
  enabled: true
  bind_addr: 0.0.0.0:9560
  path: /metrics
+gitops:
+  enabled: true
+  poll_seconds: 60
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
--- a/internal/config/apply_defaults.go
+++ b/internal/config/apply_defaults.go
@ -270,6 +270,9 @@ func (c *Config) applyDefaults() {
 	if c.Metrics.Path == "" {
 		c.Metrics.Path = "/metrics"
 	}
+	if c.GitOps.PollSeconds <= 0 {
+		c.GitOps.PollSeconds = 60
+	}
 	if c.State.Dir == "" {
 		c.State.Dir = "/var/lib/ananke"
 	}
--- a/internal/config/defaults.go
+++ b/internal/config/defaults.go
@ -156,6 +156,10 @@ func defaults() Config {
 			BindAddr: "0.0.0.0:9560",
 			Path:     "/metrics",
 		},
+		GitOps: GitOps{
+			Enabled:     true,
+			PollSeconds: 60,
+		},
 		State: State{
 			Dir:            "/var/lib/ananke",
 			ReportsDir:     "/var/lib/ananke/reports",
--- a/internal/config/types.go
+++ b/internal/config/types.go
@ -23,6 +23,7 @@ type Config struct {
 	UPS                 UPS               `yaml:"ups"`
 	Coordination        Coordination      `yaml:"coordination"`
 	Metrics             Metrics           `yaml:"metrics"`
+	GitOps              GitOps            `yaml:"gitops"`
 	State               State             `yaml:"state"`
 }

@ -174,6 +175,11 @@ type Metrics struct {
 	Path     string `yaml:"path"`
 }

+type GitOps struct {
+	Enabled     bool `yaml:"enabled"`
+	PollSeconds int  `yaml:"poll_seconds"`
+}
+
 type State struct {
 	Dir            string `yaml:"dir"`
 	ReportsDir     string `yaml:"reports_dir"`
--- a/internal/metrics/exporter.go
+++ b/internal/metrics/exporter.go
@ -27,6 +27,50 @@ type Sample struct {
 	UpdatedAt     time.Time
 }

+type GitOpsSnapshot struct {
+	UpdatedAt      time.Time
+	ScrapeSuccess  bool
+	Errors         map[string]string
+	FluxSources    []GitOpsFluxSource
+	Kustomizations []GitOpsKustomization
+	HelmReleases   []GitOpsHelmRelease
+}
+
+type GitOpsFluxSource struct {
+	Namespace string
+	Name      string
+	URL       string
+	Branch    string
+	Revision  string
+	Ready     bool
+	Reason    string
+	Suspended bool
+}
+
+type GitOpsKustomization struct {
+	Namespace       string
+	Name            string
+	Path            string
+	SourceNamespace string
+	SourceName      string
+	Revision        string
+	Ready           bool
+	Reason          string
+	Suspended       bool
+}
+
+type GitOpsHelmRelease struct {
+	Namespace  string
+	Name       string
+	Chart      string
+	Version    string
+	AppVersion string
+	Revision   string
+	Ready      bool
+	Reason     string
+	Suspended  bool
+}
+
 type Exporter struct {
 	mu                 sync.RWMutex
 	shutdownBudgetSec  int
@ -34,6 +78,7 @@ type Exporter struct {
 	lastShutdownReason string
 	lastShutdownAt     time.Time
 	samples            map[string]Sample
+	gitops             GitOpsSnapshot
 }

 // New runs one orchestration or CLI step.
@ -66,6 +111,22 @@ func (e *Exporter) UpdateSample(s Sample) {
 	e.samples[s.Name] = s
 }

+// UpdateGitOpsSnapshot records the most recent Flux object-state scrape.
+// Signature: (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot).
+// Why: Grafana needs object readiness and branch/revision state, while Flux's
+// controller metrics only expose controller health in this cluster.
+func (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	if snapshot.UpdatedAt.IsZero() {
+		snapshot.UpdatedAt = time.Now().UTC()
+	}
+	if snapshot.Errors == nil {
+		snapshot.Errors = map[string]string{}
+	}
+	e.gitops = snapshot
+}
+
 // MarkShutdown runs one orchestration or CLI step.
 // Signature: (e *Exporter) MarkShutdown(reason string).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -164,11 +225,99 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
 		}
 		b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
 	}
+	appendGitOpsMetrics(&b, e.gitops)
 	appendQualityGateMetrics(&b)

 	_, _ = w.Write([]byte(b.String()))
 }

+// appendGitOpsMetrics writes Flux object-state metrics collected by the
+// long-running daemon loop.
+// Signature: appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot).
+// Why: this keeps the expensive Kubernetes API reads out of the HTTP scrape path
+// while still making current GitOps health cheap for Grafana to query.
+func appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot) {
+	if dst.Len() > 0 {
+		dst.WriteString("\n")
+	}
+	dst.WriteString("# HELP ananke_gitops_last_scrape_timestamp_seconds Unix timestamp of the latest GitOps object-state scrape.\n")
+	dst.WriteString("# TYPE ananke_gitops_last_scrape_timestamp_seconds gauge\n")
+	if snapshot.UpdatedAt.IsZero() {
+		dst.WriteString("ananke_gitops_last_scrape_timestamp_seconds 0\n")
+	} else {
+		dst.WriteString(fmt.Sprintf("ananke_gitops_last_scrape_timestamp_seconds %d\n", snapshot.UpdatedAt.Unix()))
+	}
+	dst.WriteString("# HELP ananke_gitops_scrape_success Whether the latest GitOps object-state scrape completed without errors.\n")
+	dst.WriteString("# TYPE ananke_gitops_scrape_success gauge\n")
+	dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_success %d\n", boolNum(snapshot.ScrapeSuccess)))
+	dst.WriteString("# HELP ananke_gitops_scrape_error Whether a GitOps resource family failed during the latest scrape.\n")
+	dst.WriteString("# TYPE ananke_gitops_scrape_error gauge\n")
+	resources := []string{"gitrepository", "kustomization", "helmrelease"}
+	for _, resource := range resources {
+		_, failed := snapshot.Errors[resource]
+		dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_error{resource=%q} %d\n", resource, boolNum(failed)))
+	}
+
+	dst.WriteString("# HELP ananke_gitops_flux_source_info Current Flux GitRepository source metadata.\n")
+	dst.WriteString("# TYPE ananke_gitops_flux_source_info gauge\n")
+	dst.WriteString("# HELP ananke_gitops_flux_source_ready Whether a Flux GitRepository source is Ready.\n")
+	dst.WriteString("# TYPE ananke_gitops_flux_source_ready gauge\n")
+	dst.WriteString("# HELP ananke_gitops_flux_source_suspended Whether a Flux GitRepository source is suspended.\n")
+	dst.WriteString("# TYPE ananke_gitops_flux_source_suspended gauge\n")
+	sort.Slice(snapshot.FluxSources, func(i, j int) bool {
+		return snapshot.FluxSources[i].Namespace+"/"+snapshot.FluxSources[i].Name < snapshot.FluxSources[j].Namespace+"/"+snapshot.FluxSources[j].Name
+	})
+	for _, source := range snapshot.FluxSources {
+		infoLabels := fmt.Sprintf("{namespace=%q,name=%q,url=%q,branch=%q,revision=%q,ready=%q,reason=%q}",
+			safe(source.Namespace), safe(source.Name), safe(source.URL), safe(source.Branch),
+			safe(source.Revision), readyLabel(source.Ready), safe(defaultLabel(source.Reason, "unknown")))
+		baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(source.Namespace), safe(source.Name))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_info%s 1\n", infoLabels))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_ready%s %d\n", baseLabels, boolNum(source.Ready)))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_suspended%s %d\n", baseLabels, boolNum(source.Suspended)))
+	}
+
+	dst.WriteString("# HELP ananke_gitops_kustomization_info Current Flux Kustomization metadata.\n")
+	dst.WriteString("# TYPE ananke_gitops_kustomization_info gauge\n")
+	dst.WriteString("# HELP ananke_gitops_kustomization_ready Whether a Flux Kustomization is Ready.\n")
+	dst.WriteString("# TYPE ananke_gitops_kustomization_ready gauge\n")
+	dst.WriteString("# HELP ananke_gitops_kustomization_suspended Whether a Flux Kustomization is suspended.\n")
+	dst.WriteString("# TYPE ananke_gitops_kustomization_suspended gauge\n")
+	sort.Slice(snapshot.Kustomizations, func(i, j int) bool {
+		return snapshot.Kustomizations[i].Namespace+"/"+snapshot.Kustomizations[i].Name < snapshot.Kustomizations[j].Namespace+"/"+snapshot.Kustomizations[j].Name
+	})
+	for _, kustomization := range snapshot.Kustomizations {
+		infoLabels := fmt.Sprintf("{namespace=%q,name=%q,path=%q,source_namespace=%q,source_name=%q,revision=%q,ready=%q,reason=%q}",
+			safe(kustomization.Namespace), safe(kustomization.Name), safe(kustomization.Path),
+			safe(kustomization.SourceNamespace), safe(kustomization.SourceName), safe(kustomization.Revision),
+			readyLabel(kustomization.Ready), safe(defaultLabel(kustomization.Reason, "unknown")))
+		baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(kustomization.Namespace), safe(kustomization.Name))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_info%s 1\n", infoLabels))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_ready%s %d\n", baseLabels, boolNum(kustomization.Ready)))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_suspended%s %d\n", baseLabels, boolNum(kustomization.Suspended)))
+	}
+
+	dst.WriteString("# HELP ananke_gitops_helmrelease_info Current Flux HelmRelease metadata.\n")
+	dst.WriteString("# TYPE ananke_gitops_helmrelease_info gauge\n")
+	dst.WriteString("# HELP ananke_gitops_helmrelease_ready Whether a Flux HelmRelease is Ready.\n")
+	dst.WriteString("# TYPE ananke_gitops_helmrelease_ready gauge\n")
+	dst.WriteString("# HELP ananke_gitops_helmrelease_suspended Whether a Flux HelmRelease is suspended.\n")
+	dst.WriteString("# TYPE ananke_gitops_helmrelease_suspended gauge\n")
+	sort.Slice(snapshot.HelmReleases, func(i, j int) bool {
+		return snapshot.HelmReleases[i].Namespace+"/"+snapshot.HelmReleases[i].Name < snapshot.HelmReleases[j].Namespace+"/"+snapshot.HelmReleases[j].Name
+	})
+	for _, release := range snapshot.HelmReleases {
+		infoLabels := fmt.Sprintf("{namespace=%q,name=%q,chart=%q,version=%q,app_version=%q,revision=%q,ready=%q,reason=%q}",
+			safe(release.Namespace), safe(release.Name), safe(release.Chart), safe(release.Version),
+			safe(release.AppVersion), safe(release.Revision), readyLabel(release.Ready),
+			safe(defaultLabel(release.Reason, "unknown")))
+		baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(release.Namespace), safe(release.Name))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_info%s 1\n", infoLabels))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_ready%s %d\n", baseLabels, boolNum(release.Ready)))
+		dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_suspended%s %d\n", baseLabels, boolNum(release.Suspended)))
+	}
+}
+
 // appendQualityGateMetrics runs one orchestration or CLI step.
 // Signature: appendQualityGateMetrics(dst *strings.Builder).
 // Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
@ -228,3 +377,26 @@ func safe(in string) string {
 	out := strings.ReplaceAll(in, "\\", "\\\\")
 	return strings.ReplaceAll(out, "\"", "\\\"")
 }
+
+// readyLabel formats a boolean readiness state as a stable label value.
+// Signature: readyLabel(ready bool) string.
+// Why: info metrics need a human-readable status label without changing the
+// numeric ready gauges used for alerting and aggregation.
+func readyLabel(ready bool) string {
+	if ready {
+		return "true"
+	}
+	return "false"
+}
+
+// defaultLabel returns a safe fallback for empty metric label values.
+// Signature: defaultLabel(value string, fallback string) string.
+// Why: Flux status fields can be absent during startup, but Prometheus labels
+// should remain explicit rather than silently becoming empty strings.
+func defaultLabel(value string, fallback string) string {
+	value = strings.TrimSpace(value)
+	if value == "" {
+		return fallback
+	}
+	return value
+}
--- a/internal/service/daemon.go
+++ b/internal/service/daemon.go
@ -97,6 +97,9 @@ func (d *Daemon) Run(ctx context.Context) error {
 	onBatterySince := map[string]time.Time{}
 	breachCount := map[string]int{}
 	lastAutoHeal := time.Time{}
+	lastGitOpsPoll := time.Time{}
+	gitOpsPollRunning := false
+	gitOpsDone := make(chan struct{}, 1)
 	for _, t := range d.targets {
 		lastGood[t.Name] = time.Now()
 	}
@ -108,6 +111,8 @@ func (d *Daemon) Run(ctx context.Context) error {
 		select {
 		case <-ctx.Done():
 			return ctx.Err()
+		case <-gitOpsDone:
+			gitOpsPollRunning = false
 		case <-t.C:
 			budget := d.orch.EstimatedEmergencyShutdownSeconds()
 			threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
@ -201,6 +206,7 @@ func (d *Daemon) Run(ctx context.Context) error {
 			}

 			d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
+			gitOpsPollRunning = d.maybeStartGitOpsSnapshot(ctx, &lastGitOpsPoll, gitOpsPollRunning, gitOpsDone)
 		}
 	}
 }
--- a/internal/service/gitops_snapshot.go
+++ b/internal/service/gitops_snapshot.go
@ -0,0 +1,355 @@
+package service
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"sort"
+	"strings"
+	"time"
+
+	"scm.bstein.dev/bstein/ananke/internal/config"
+	"scm.bstein.dev/bstein/ananke/internal/metrics"
+)
+
+var gitOpsKubectlOutput = runGitOpsKubectlOutput
+
+type gitOpsResourceList struct {
+	Items []gitOpsResource `json:"items"`
+}
+
+type gitOpsResource struct {
+	Metadata struct {
+		Name      string `json:"name"`
+		Namespace string `json:"namespace"`
+	} `json:"metadata"`
+	Spec struct {
+		Suspend bool `json:"suspend"`
+		Ref     struct {
+			Branch string `json:"branch"`
+			Tag    string `json:"tag"`
+			SemVer string `json:"semver"`
+		} `json:"ref"`
+		URL       string `json:"url"`
+		Path      string `json:"path"`
+		SourceRef struct {
+			Name      string `json:"name"`
+			Namespace string `json:"namespace"`
+		} `json:"sourceRef"`
+		Chart struct {
+			Spec struct {
+				Chart   string `json:"chart"`
+				Version string `json:"version"`
+			} `json:"spec"`
+		} `json:"chart"`
+	} `json:"spec"`
+	Status struct {
+		Artifact struct {
+			Revision string `json:"revision"`
+		} `json:"artifact"`
+		LastAppliedRevision   string `json:"lastAppliedRevision"`
+		LastAttemptedRevision string `json:"lastAttemptedRevision"`
+		Conditions            []struct {
+			Type   string `json:"type"`
+			Status string `json:"status"`
+			Reason string `json:"reason"`
+		} `json:"conditions"`
+		History []struct {
+			ChartName    string `json:"chartName"`
+			ChartVersion string `json:"chartVersion"`
+			AppVersion   string `json:"appVersion"`
+			Digest       string `json:"digest"`
+		} `json:"history"`
+	} `json:"status"`
+}
+
+// maybeStartGitOpsSnapshot starts a bounded background scrape when the GitOps
+// sample interval has elapsed.
+// Signature: (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool.
+// Why: Kubernetes API reads must not block UPS polling, especially during
+// emergency power events when the daemon's first job is still safe shutdown.
+func (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool {
+	if !d.cfg.GitOps.Enabled || d.exporter == nil || running {
+		return running
+	}
+	interval := time.Duration(d.cfg.GitOps.PollSeconds) * time.Second
+	if interval <= 0 {
+		interval = 60 * time.Second
+	}
+	now := time.Now()
+	if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
+		return running
+	}
+	if lastRun != nil {
+		*lastRun = now
+	}
+	cfg := d.cfg
+	exporter := d.exporter
+	logger := d.log
+	go func() {
+		defer func() {
+			select {
+			case done <- struct{}{}:
+			default:
+			}
+		}()
+		scrapeCtx, cancel := context.WithTimeout(ctx, 25*time.Second)
+		defer cancel()
+		snapshot := collectGitOpsSnapshot(scrapeCtx, cfg)
+		exporter.UpdateGitOpsSnapshot(snapshot)
+		if !snapshot.ScrapeSuccess && logger != nil {
+			logger.Printf("warning: gitops metrics scrape partial: %s", gitOpsErrorSummary(snapshot.Errors))
+		}
+	}()
+	return true
+}
+
+// collectGitOpsSnapshot queries Flux custom resources and converts them into
+// Prometheus-safe metric state.
+// Signature: collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot.
+// Why: the Grafana overview needs current branch/readiness/suspend state, which
+// is object status rather than controller operational telemetry.
+func collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot {
+	snapshot := metrics.GitOpsSnapshot{
+		UpdatedAt:     time.Now().UTC(),
+		ScrapeSuccess: true,
+		Errors:        map[string]string{},
+	}
+
+	if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "gitrepositories.source.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
+		snapshot.Errors["gitrepository"] = err.Error()
+	} else if sources, err := parseGitRepositories(raw); err != nil {
+		snapshot.Errors["gitrepository"] = err.Error()
+	} else {
+		snapshot.FluxSources = sources
+	}
+
+	if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "kustomizations.kustomize.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
+		snapshot.Errors["kustomization"] = err.Error()
+	} else if kustomizations, err := parseKustomizations(raw); err != nil {
+		snapshot.Errors["kustomization"] = err.Error()
+	} else {
+		snapshot.Kustomizations = kustomizations
+	}
+
+	if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "helmreleases.helm.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil {
+		snapshot.Errors["helmrelease"] = err.Error()
+	} else if releases, err := parseHelmReleases(raw); err != nil {
+		snapshot.Errors["helmrelease"] = err.Error()
+	} else {
+		snapshot.HelmReleases = releases
+	}
+
+	snapshot.ScrapeSuccess = len(snapshot.Errors) == 0
+	return snapshot
+}
+
+// parseGitRepositories extracts Git source branch/revision/readiness from Flux
+// GitRepository JSON.
+// Signature: parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error).
+// Why: keeping status parsing isolated makes it testable without a live Flux
+// API and protects the daemon loop from brittle JSON traversal code.
+func parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) {
+	var list gitOpsResourceList
+	if err := json.Unmarshal(raw, &list); err != nil {
+		return nil, fmt.Errorf("decode gitrepositories: %w", err)
+	}
+	out := make([]metrics.GitOpsFluxSource, 0, len(list.Items))
+	for _, item := range list.Items {
+		ready, reason := fluxReady(item.Status.Conditions)
+		out = append(out, metrics.GitOpsFluxSource{
+			Namespace: defaultString(item.Metadata.Namespace, "default"),
+			Name:      item.Metadata.Name,
+			URL:       item.Spec.URL,
+			Branch:    sourceBranch(item),
+			Revision:  item.Status.Artifact.Revision,
+			Ready:     ready,
+			Reason:    reason,
+			Suspended: item.Spec.Suspend,
+		})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
+	})
+	return out, nil
+}
+
+// parseKustomizations extracts Kustomization health and source metadata from
+// Flux Kustomization JSON.
+// Signature: parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error).
+// Why: Kustomization status drives both the overview summary and the detailed
+// GitOps table, so the parser keeps the metric contract in one place.
+func parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) {
+	var list gitOpsResourceList
+	if err := json.Unmarshal(raw, &list); err != nil {
+		return nil, fmt.Errorf("decode kustomizations: %w", err)
+	}
+	out := make([]metrics.GitOpsKustomization, 0, len(list.Items))
+	for _, item := range list.Items {
+		ready, reason := fluxReady(item.Status.Conditions)
+		namespace := defaultString(item.Metadata.Namespace, "default")
+		out = append(out, metrics.GitOpsKustomization{
+			Namespace:       namespace,
+			Name:            item.Metadata.Name,
+			Path:            item.Spec.Path,
+			SourceNamespace: defaultString(item.Spec.SourceRef.Namespace, namespace),
+			SourceName:      item.Spec.SourceRef.Name,
+			Revision:        firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision),
+			Ready:           ready,
+			Reason:          reason,
+			Suspended:       item.Spec.Suspend,
+		})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
+	})
+	return out, nil
+}
+
+// parseHelmReleases extracts HelmRelease health and chart metadata from Flux
+// HelmRelease JSON.
+// Signature: parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error).
+// Why: HelmRelease status has different chart fields than Kustomizations, and
+// isolating it keeps dashboard metrics stable as Flux payloads evolve.
+func parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) {
+	var list gitOpsResourceList
+	if err := json.Unmarshal(raw, &list); err != nil {
+		return nil, fmt.Errorf("decode helmreleases: %w", err)
+	}
+	out := make([]metrics.GitOpsHelmRelease, 0, len(list.Items))
+	for _, item := range list.Items {
+		ready, reason := fluxReady(item.Status.Conditions)
+		chart := item.Spec.Chart.Spec.Chart
+		version := item.Spec.Chart.Spec.Version
+		appVersion := ""
+		revision := firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision)
+		if len(item.Status.History) > 0 {
+			latest := item.Status.History[0]
+			chart = firstNonEmpty(latest.ChartName, chart)
+			version = firstNonEmpty(latest.ChartVersion, version)
+			appVersion = latest.AppVersion
+			revision = firstNonEmpty(latest.Digest, revision)
+		}
+		out = append(out, metrics.GitOpsHelmRelease{
+			Namespace:  defaultString(item.Metadata.Namespace, "default"),
+			Name:       item.Metadata.Name,
+			Chart:      chart,
+			Version:    version,
+			AppVersion: appVersion,
+			Revision:   revision,
+			Ready:      ready,
+			Reason:     reason,
+			Suspended:  item.Spec.Suspend,
+		})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name
+	})
+	return out, nil
+}
+
+// gitOpsKubectlJSON runs kubectl with the configured kubeconfig and a short
+// request timeout.
+// Signature: gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error).
+// Why: every GitOps scrape should fail quickly instead of stealing time from
+// UPS safety polling during outage recovery.
+func gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error) {
+	finalArgs := []string{}
+	if strings.TrimSpace(cfg.Kubeconfig) != "" {
+		finalArgs = append(finalArgs, "--kubeconfig", cfg.Kubeconfig)
+	}
+	finalArgs = append(finalArgs, "--request-timeout=8s")
+	finalArgs = append(finalArgs, args...)
+	return gitOpsKubectlOutput(ctx, "kubectl", finalArgs...)
+}
+
+// runGitOpsKubectlOutput is the production command runner for GitOps snapshot
+// collection.
+// Signature: runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error).
+// Why: preserving stderr in errors makes live diagnosis possible when Flux CRD
+// reads fail on a recovering control plane.
+func runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) {
+	cmd := exec.CommandContext(ctx, name, args...)
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		trimmed := strings.TrimSpace(string(out))
+		if trimmed == "" {
+			return nil, err
+		}
+		return nil, fmt.Errorf("%w: %s", err, trimmed)
+	}
+	return out, nil
+}
+
+// fluxReady resolves a Flux Ready condition into a boolean and reason.
+// Signature: fluxReady(conditions []struct { Type string; Status string; Reason string }) (bool, string).
+// Why: Flux resources all use Ready conditions, so one helper keeps status
+// interpretation consistent across GitRepository, Kustomization, and HelmRelease.
+func fluxReady(conditions []struct {
+	Type   string `json:"type"`
+	Status string `json:"status"`
+	Reason string `json:"reason"`
+}) (bool, string) {
+	for _, condition := range conditions {
+		if condition.Type == "Ready" {
+			return strings.EqualFold(condition.Status, "True"), defaultString(condition.Reason, "ReadyUnknown")
+		}
+	}
+	return false, "ReadyMissing"
+}
+
+// sourceBranch returns the most useful source reference label for a
+// GitRepository.
+// Signature: sourceBranch(item gitOpsResource) string.
+// Why: branch is preferred for Atlas, but tags and semver refs should still
+// render clearly if Flux is ever pinned differently.
+func sourceBranch(item gitOpsResource) string {
+	return firstNonEmpty(item.Spec.Ref.Branch, item.Spec.Ref.Tag, item.Spec.Ref.SemVer, "unknown")
+}
+
+// defaultString trims a value and returns a fallback when it is empty.
+// Signature: defaultString(value string, fallback string) string.
+// Why: Flux omits some optional fields; dashboards are easier to read with
+// explicit fallback labels.
+func defaultString(value string, fallback string) string {
+	value = strings.TrimSpace(value)
+	if value == "" {
+		return fallback
+	}
+	return value
+}
+
+// firstNonEmpty returns the first non-empty trimmed value.
+// Signature: firstNonEmpty(values ...string) string.
+// Why: Flux has several revision fields depending on resource kind and
+// reconciliation stage, so callers can express preferred fallback order.
+func firstNonEmpty(values ...string) string {
+	for _, value := range values {
+		value = strings.TrimSpace(value)
+		if value != "" {
+			return value
+		}
+	}
+	return ""
+}
+
+// gitOpsErrorSummary renders resource-family scrape errors for daemon logs.
+// Signature: gitOpsErrorSummary(errors map[string]string) string.
+// Why: a compact sorted summary keeps recurring scrape warnings readable in
+// journald during partial cluster recovery.
+func gitOpsErrorSummary(errors map[string]string) string {
+	if len(errors) == 0 {
+		return "none"
+	}
+	keys := make([]string, 0, len(errors))
+	for key := range errors {
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+	parts := make([]string, 0, len(keys))
+	for _, key := range keys {
+		parts = append(parts, key+"="+errors[key])
+	}
+	return strings.Join(parts, "; ")
+}
--- a/internal/service/testing_hooks_gitops.go
+++ b/internal/service/testing_hooks_gitops.go
@ -0,0 +1,127 @@
+package service
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"scm.bstein.dev/bstein/ananke/internal/config"
+	"scm.bstein.dev/bstein/ananke/internal/metrics"
+)
+
+type TestHookGitOpsRunner func(context.Context, string, ...string) ([]byte, error)
+
+// TestHookParseGitRepositories exposes GitRepository parsing to the top-level
+// testing module.
+// Signature: TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error).
+// Why: Ananke keeps split-module tests outside internal packages, but the
+// parser needs direct contract coverage without a live cluster.
+func TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) {
+	return parseGitRepositories(raw)
+}
+
+// TestHookParseKustomizations exposes Kustomization parsing to the top-level
+// testing module.
+// Signature: TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error).
+// Why: dashboard-facing labels should be verified from representative Flux JSON
+// without relying on the current production cluster shape.
+func TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) {
+	return parseKustomizations(raw)
+}
+
+// TestHookParseHelmReleases exposes HelmRelease parsing to the top-level
+// testing module.
+// Signature: TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error).
+// Why: Helm status fields vary by reconciliation phase, and split-module tests
+// need a stable seam for parser coverage.
+func TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) {
+	return parseHelmReleases(raw)
+}
+
+// TestHookCollectGitOpsSnapshotWithRunner runs snapshot collection with an
+// injected kubectl runner.
+// Signature: TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot.
+// Why: this covers success and partial-failure collection paths without shelling
+// out to kubectl from unit tests.
+func TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot {
+	original := gitOpsKubectlOutput
+	defer func() { gitOpsKubectlOutput = original }()
+	gitOpsKubectlOutput = runner
+	return collectGitOpsSnapshot(ctx, cfg)
+}
+
+// TestHookMaybeStartGitOpsSnapshot starts the daemon's background GitOps scrape
+// from split-module tests.
+// Signature: TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool.
+// Why: the daemon intentionally keeps scraping asynchronous so GitOps telemetry
+// cannot delay UPS polling; the behavior needs direct coverage.
+func TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool {
+	d := &Daemon{cfg: cfg, exporter: exporter, log: logger}
+	return d.maybeStartGitOpsSnapshot(ctx, lastRun, running, done)
+}
+
+// TestHookMaybeStartGitOpsSnapshotWithRunner starts a background GitOps scrape
+// with an injected runner and restores the production runner after completion.
+// Signature: TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool.
+// Why: the scrape is asynchronous, so split-module tests need a seam that keeps
+// fake kubectl behavior installed until the goroutine exits.
+func TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool {
+	original := gitOpsKubectlOutput
+	gitOpsKubectlOutput = runner
+	proxyDone := make(chan struct{}, 1)
+	d := &Daemon{cfg: cfg, exporter: exporter, log: logger}
+	started := d.maybeStartGitOpsSnapshot(ctx, lastRun, running, proxyDone)
+	if !started || running {
+		gitOpsKubectlOutput = original
+		return started
+	}
+	go func() {
+		<-proxyDone
+		gitOpsKubectlOutput = original
+		select {
+		case done <- struct{}{}:
+		default:
+		}
+	}()
+	return started
+}
+
+// TestHookRunGitOpsKubectlOutput exposes the production command runner to
+// split-module tests.
+// Signature: TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error).
+// Why: stderr preservation and empty-stderr failure behavior are part of the
+// operator-facing diagnostics contract.
+func TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) {
+	return runGitOpsKubectlOutput(ctx, name, args...)
+}
+
+// TestHookGitOpsDefaultString exposes defaultString to split-module tests.
+// Signature: TestHookGitOpsDefaultString(value string, fallback string) string.
+// Why: fallback label behavior is intentionally tiny but important for readable
+// Grafana tables during startup.
+func TestHookGitOpsDefaultString(value string, fallback string) string {
+	return defaultString(value, fallback)
+}
+
+// TestHookGitOpsFirstNonEmpty exposes firstNonEmpty to split-module tests.
+// Signature: TestHookGitOpsFirstNonEmpty(values ...string) string.
+// Why: revision fallback order should remain deterministic as Flux status
+// payloads change.
+func TestHookGitOpsFirstNonEmpty(values ...string) string {
+	return firstNonEmpty(values...)
+}
+
+// TestHookGitOpsErrorSummary exposes gitOpsErrorSummary to split-module tests.
+// Signature: TestHookGitOpsErrorSummary(errors map[string]string) string.
+// Why: sorted scrape warnings are easier to inspect in journald during recovery.
+func TestHookGitOpsErrorSummary(errors map[string]string) string {
+	return gitOpsErrorSummary(errors)
+}
+
+// TestHookFluxReadyMissing exposes the missing Ready-condition branch to tests.
+// Signature: TestHookFluxReadyMissing() (bool, string).
+// Why: absent Ready conditions should be treated as not-ready and labelled
+// explicitly instead of being mistaken for healthy.
+func TestHookFluxReadyMissing() (bool, string) {
+	return fluxReady(nil)
+}
--- a/scripts/publish_quality_metrics.py
+++ b/scripts/publish_quality_metrics.py
@ -172,6 +172,14 @@ def _read_coverage_percent(path: str) -> float:
        return 0.0


+def _resolve_repo_path(repo_root: Path, path: str) -> Path:
+    """Resolve quality-gate artifact paths relative to the repository root."""
+    candidate = Path(path)
+    if candidate.is_absolute():
+        return candidate
+    return repo_root / candidate
+
+
 def _iter_source_files(repo_root: Path):
    for rel_root in SOURCE_SCAN_ROOTS:
        base = repo_root / rel_root
@ -367,7 +375,9 @@ def main(argv: list[str] | None = None) -> int:
    args = parse_args(argv or sys.argv[1:])
    repo_root = Path(__file__).resolve().parents[1]
    build_dir = repo_root / "build"
-    gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
+    gate_rc = _read_exit_code(
+        _resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc")))
+    )
    current_ok = 1 if gate_rc == 0 else 0
    current_failed = 0 if gate_rc == 0 else 1

@ -420,13 +430,18 @@ def main(argv: list[str] | None = None) -> int:
    elif not already_recorded:
        resolved_ok += current_ok
        resolved_failed += current_failed
-    coverage_percent = _read_coverage_percent(args.coverage_percent_file)
+    coverage_percent = _read_coverage_percent(str(_resolve_repo_path(repo_root, args.coverage_percent_file)))
    source_files_total = _count_source_files(repo_root)
    source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
-    quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
+    quality_output = _resolve_repo_path(
+        repo_root,
+        os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")),
+    )
    tests = _parse_go_test_counts(quality_output)
    test_cases = _parse_go_test_cases(quality_output)
-    docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
+    docs_status = _read_status(
+        _resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status")))
+    )
    unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
    checks = {
        "tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
--- a/scripts/publish_quality_metrics_test.py
+++ b/scripts/publish_quality_metrics_test.py
@ -61,6 +61,18 @@ class PublishQualityMetricsTest(unittest.TestCase):
        self.server.server_close()
        self.thread.join(timeout=5)

+    def test_relative_artifact_paths_are_repo_rooted(self) -> None:
+        repo_root = Path("/tmp/ananke-repo")
+
+        self.assertEqual(
+            publisher._resolve_repo_path(repo_root, "build/coverage-percent.txt"),
+            repo_root / "build" / "coverage-percent.txt",
+        )
+        self.assertEqual(
+            publisher._resolve_repo_path(repo_root, "/tmp/coverage-percent.txt"),
+            Path("/tmp/coverage-percent.txt"),
+        )
+
    def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
--- a/scripts/quality_gate.sh
+++ b/scripts/quality_gate.sh
@ -56,12 +56,13 @@ read_quality_counter() {
 write_quality_metrics() {
  local exit_code="$1"

-  local metrics_dir state_dir
+  local metrics_dir state_dir write_metrics
  metrics_dir="$(dirname "${QUALITY_METRICS_FILE}")"
  state_dir="$(dirname "${QUALITY_STATE_FILE}")"
+  write_metrics="${QUALITY_METRICS_ENABLED}"
  mkdir -p "${state_dir}" >/dev/null 2>&1 || return 0
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
-    mkdir -p "${metrics_dir}" >/dev/null 2>&1 || return 0
+  if [[ "${write_metrics}" == "1" ]]; then
+    mkdir -p "${metrics_dir}" >/dev/null 2>&1 || write_metrics=0
  fi

  local ok failed total last_success now success_percent
@ -84,12 +85,12 @@ write_quality_metrics() {
  QUALITY_SUCCESS_PERCENT="${success_percent}"

  local tmp_metrics="" tmp_state
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
    tmp_metrics="$(mktemp "${metrics_dir}/quality-gate.prom.XXXXXX")"
  fi
  tmp_state="$(mktemp "${state_dir}/quality-gate.state.XXXXXX")"

-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
    cat > "${tmp_metrics}" <<EOF
 # HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.
 # TYPE ananke_quality_gate_runs_total counter
@ -115,7 +116,7 @@ last_run=${now}
 EOF

  mv -f "${tmp_state}" "${QUALITY_STATE_FILE}"
-  if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then
+  if [[ "${write_metrics}" == "1" ]]; then
    mv -f "${tmp_metrics}" "${QUALITY_METRICS_FILE}"
  fi
 }
@ -146,6 +147,8 @@ publish_quality_metrics() {
 quality_gate_finalize() {
  local exit_code="$1"
  set +e
+  printf '%s\n' "${exit_code}" > "${BUILD_DIR}/quality-gate.rc" 2>/dev/null || true
+  cd "${REPO_DIR}" 2>/dev/null || true
  write_quality_metrics "${exit_code}" || true
  publish_quality_metrics || true
  exit "${exit_code}"
@ -193,4 +196,7 @@ printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"

 echo "[quality] per-file coverage gate (95%)"
 cd testing
-ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
+ANANKE_ENFORCE_COVERAGE=1 \
+  ANANKE_PER_FILE_COVERAGE_TARGET=95 \
+  ANANKE_PER_FILE_COVERAGE_PERCENT_FILE="${COVERAGE_PERCENT_FILE}" \
+  go test ./coverage -run TestPerFileCoverageReport -count=1 -v
--- a/testing/coverage/coverage_test.go
+++ b/testing/coverage/coverage_test.go
@ -121,6 +121,7 @@ func TestPerFileCoverageReport(t *testing.T) {
 		"./metrics",
 		"./hygiene",
 		"./orchestrator",
+		"./service",
 		"./state",
 		"./sshutil",
 		"./ups",
@ -167,17 +168,26 @@ func TestPerFileCoverageReport(t *testing.T) {
 	var report bytes.Buffer
 	report.WriteString("per-file coverage\n")
 	under := make([]string, 0)
+	minPercent := 100.0
 	for _, file := range keys {
 		fc := byFile[file]
 		pct := 100.0
 		if fc.total > 0 {
 			pct = (fc.covered / fc.total) * 100.0
 		}
+		if pct < minPercent {
+			minPercent = pct
+		}
 		report.WriteString(fmt.Sprintf("- %s: %.1f%%\n", file, pct))
 		if pct < target {
 			under = append(under, fmt.Sprintf("%s (%.1f%% < %.1f%%)", file, pct, target))
 		}
 	}
+	if outputPath := strings.TrimSpace(os.Getenv("ANANKE_PER_FILE_COVERAGE_PERCENT_FILE")); outputPath != "" {
+		if err := os.WriteFile(outputPath, []byte(fmt.Sprintf("%.1f\n", minPercent)), 0o644); err != nil {
+			t.Fatalf("write per-file coverage percent: %v", err)
+		}
+	}
 	t.Log(report.String())

 	if len(under) > 0 && enforce {
--- a/testing/metrics/exporter_http_contract_test.go
+++ b/testing/metrics/exporter_http_contract_test.go
@ -104,3 +104,64 @@ func TestExporterHelperContracts(t *testing.T) {
 		t.Fatalf("unexpected escaped string: %q", got)
 	}
 }
+
+// TestExporterEmitsGitOpsMetrics runs one orchestration or CLI step.
+// Signature: TestExporterEmitsGitOpsMetrics(t *testing.T).
+// Why: the overview dashboard depends on Ananke-owned Flux object-state metrics
+// rather than the narrower controller metrics exposed by Flux itself.
+func TestExporterEmitsGitOpsMetrics(t *testing.T) {
+	exporter := metrics.New()
+	exporter.UpdateGitOpsSnapshot(metrics.GitOpsSnapshot{
+		UpdatedAt:     time.Unix(1710000100, 0).UTC(),
+		ScrapeSuccess: true,
+		FluxSources: []metrics.GitOpsFluxSource{{
+			Namespace: "flux-system",
+			Name:      "flux-system",
+			URL:       "ssh://git@example/repo.git",
+			Branch:    "main",
+			Revision:  "main@sha1:abc123",
+			Ready:     true,
+			Reason:    "Succeeded",
+		}},
+		Kustomizations: []metrics.GitOpsKustomization{{
+			Namespace:       "flux-system",
+			Name:            "monitoring",
+			Path:            "./services/monitoring",
+			SourceNamespace: "flux-system",
+			SourceName:      "flux-system",
+			Revision:        "main@sha1:abc123",
+			Ready:           true,
+			Reason:          "ReconciliationSucceeded",
+		}},
+		HelmReleases: []metrics.GitOpsHelmRelease{{
+			Namespace:  "monitoring",
+			Name:       "grafana",
+			Chart:      "grafana",
+			Version:    "8.5.0",
+			AppVersion: "11.4.0",
+			Revision:   "sha256:abc123",
+			Ready:      false,
+			Reason:     "Progressing",
+			Suspended:  true,
+		}},
+	})
+
+	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	rr := httptest.NewRecorder()
+	exporter.Handler("/metrics").ServeHTTP(rr, req)
+	body := rr.Body.String()
+
+	mustContain := []string{
+		"ananke_gitops_last_scrape_timestamp_seconds 1710000100",
+		"ananke_gitops_scrape_success 1",
+		`ananke_gitops_flux_source_info{namespace="flux-system",name="flux-system",url="ssh://git@example/repo.git",branch="main",revision="main@sha1:abc123",ready="true",reason="Succeeded"} 1`,
+		`ananke_gitops_kustomization_ready{namespace="flux-system",name="monitoring"} 1`,
+		`ananke_gitops_helmrelease_info{namespace="monitoring",name="grafana",chart="grafana",version="8.5.0",app_version="11.4.0",revision="sha256:abc123",ready="false",reason="Progressing"} 1`,
+		`ananke_gitops_helmrelease_suspended{namespace="monitoring",name="grafana"} 1`,
+	}
+	for _, fragment := range mustContain {
+		if !strings.Contains(body, fragment) {
+			t.Fatalf("missing GitOps metric fragment %q in output:\n%s", fragment, body)
+		}
+	}
+}
--- a/testing/service/gitops_snapshot_test.go
+++ b/testing/service/gitops_snapshot_test.go
@ -0,0 +1,316 @@
+package servicequality
+
+import (
+	"context"
+	"errors"
+	"io"
+	"log"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"scm.bstein.dev/bstein/ananke/internal/config"
+	"scm.bstein.dev/bstein/ananke/internal/metrics"
+	"scm.bstein.dev/bstein/ananke/internal/service"
+)
+
+// TestParseGitRepositories runs one orchestration or CLI step.
+// Signature: TestParseGitRepositories(t *testing.T).
+// Why: branch, revision, and Ready status are the labels the Grafana GitOps
+// panels rely on for the current Flux source.
+func TestParseGitRepositories(t *testing.T) {
+	raw := []byte(`{
+	  "items": [{
+	    "metadata": {"name": "flux-system", "namespace": "flux-system"},
+	    "spec": {"url": "ssh://git@example/repo.git", "ref": {"branch": "main"}},
+	    "status": {
+	      "artifact": {"revision": "main@sha1:abc123"},
+	      "conditions": [{"type": "Ready", "status": "True", "reason": "Succeeded"}]
+	    }
+	  }]
+	}`)
+	got, err := service.TestHookParseGitRepositories(raw)
+	if err != nil {
+		t.Fatalf("parseGitRepositories failed: %v", err)
+	}
+	if len(got) != 1 || got[0].Branch != "main" || got[0].Revision != "main@sha1:abc123" || !got[0].Ready {
+		t.Fatalf("unexpected GitRepository parse result: %+v", got)
+	}
+}
+
+// TestParseKustomizations runs one orchestration or CLI step.
+// Signature: TestParseKustomizations(t *testing.T).
+// Why: Kustomization source and suspend labels power the detailed GitOps table.
+func TestParseKustomizations(t *testing.T) {
+	raw := []byte(`{
+	  "items": [{
+	    "metadata": {"name": "monitoring", "namespace": "flux-system"},
+	    "spec": {
+	      "path": "./services/monitoring",
+	      "sourceRef": {"name": "flux-system"},
+	      "suspend": true
+	    },
+	    "status": {
+	      "lastAppliedRevision": "main@sha1:def456",
+	      "conditions": [{"type": "Ready", "status": "False", "reason": "DependencyNotReady"}]
+	    }
+	  }]
+	}`)
+	got, err := service.TestHookParseKustomizations(raw)
+	if err != nil {
+		t.Fatalf("parseKustomizations failed: %v", err)
+	}
+	if len(got) != 1 || got[0].SourceNamespace != "flux-system" || got[0].Ready || !got[0].Suspended {
+		t.Fatalf("unexpected Kustomization parse result: %+v", got)
+	}
+}
+
+// TestParseHelmReleases runs one orchestration or CLI step.
+// Signature: TestParseHelmReleases(t *testing.T).
+// Why: HelmRelease chart fields have separate status fallbacks from plain Flux
+// Kustomizations and need their own contract coverage.
+func TestParseHelmReleases(t *testing.T) {
+	raw := []byte(`{
+	  "items": [{
+	    "metadata": {"name": "grafana", "namespace": "monitoring"},
+	    "spec": {
+	      "chart": {"spec": {"chart": "grafana", "version": "~8.5.0"}}
+	    },
+	    "status": {
+	      "lastAttemptedRevision": "8.5.8",
+	      "history": [{"chartName": "grafana", "chartVersion": "8.5.8", "appVersion": "11.4.0", "digest": "sha256:abc"}],
+	      "conditions": [{"type": "Ready", "status": "True", "reason": "InstallSucceeded"}]
+	    }
+	  }]
+	}`)
+	got, err := service.TestHookParseHelmReleases(raw)
+	if err != nil {
+		t.Fatalf("parseHelmReleases failed: %v", err)
+	}
+	if len(got) != 1 || got[0].Chart != "grafana" || got[0].Version != "8.5.8" || got[0].Revision != "sha256:abc" || !got[0].Ready {
+		t.Fatalf("unexpected HelmRelease parse result: %+v", got)
+	}
+}
+
+// TestCollectGitOpsSnapshotPartialFailure runs one orchestration or CLI step.
+// Signature: TestCollectGitOpsSnapshotPartialFailure(t *testing.T).
+// Why: a recovering cluster may serve some Flux resource families before
+// others, and Ananke should publish a partial snapshot instead of failing closed.
+func TestCollectGitOpsSnapshotPartialFailure(t *testing.T) {
+	runner := func(_ context.Context, _ string, args ...string) ([]byte, error) {
+		joined := strings.Join(args, " ")
+		switch {
+		case strings.Contains(joined, "gitrepositories"):
+			return []byte(`{"items":[]}`), nil
+		case strings.Contains(joined, "kustomizations"):
+			return nil, errors.New("api unavailable")
+		case strings.Contains(joined, "helmreleases"):
+			return []byte(`{"items":[]}`), nil
+		default:
+			return nil, errors.New("unexpected command")
+		}
+	}
+
+	snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{Kubeconfig: "/tmp/kubeconfig"}, runner)
+	if snapshot.ScrapeSuccess {
+		t.Fatalf("expected partial failure")
+	}
+	if snapshot.Errors["kustomization"] == "" {
+		t.Fatalf("expected kustomization error, got %+v", snapshot.Errors)
+	}
+	if summary := service.TestHookGitOpsErrorSummary(snapshot.Errors); !strings.Contains(summary, "kustomization=api unavailable") {
+		t.Fatalf("unexpected error summary: %s", summary)
+	}
+}
+
+// TestCollectGitOpsSnapshotParseFailures runs one orchestration or CLI step.
+// Signature: TestCollectGitOpsSnapshotParseFailures(t *testing.T).
+// Why: malformed API payloads should be surfaced per resource family so the
+// exporter can distinguish data errors from transport failures.
+func TestCollectGitOpsSnapshotParseFailures(t *testing.T) {
+	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
+		return []byte(`not-json`), nil
+	}
+
+	snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner)
+	if snapshot.ScrapeSuccess {
+		t.Fatalf("expected parse failures")
+	}
+	for _, key := range []string{"gitrepository", "kustomization", "helmrelease"} {
+		if snapshot.Errors[key] == "" {
+			t.Fatalf("expected %s parse error, got %+v", key, snapshot.Errors)
+		}
+	}
+}
+
+// TestCollectGitOpsSnapshotSuccess runs one orchestration or CLI step.
+// Signature: TestCollectGitOpsSnapshotSuccess(t *testing.T).
+// Why: successful collection should preserve non-branch Git refs as the source
+// label rather than hiding them as unknown.
+func TestCollectGitOpsSnapshotSuccess(t *testing.T) {
+	runner := func(_ context.Context, _ string, args ...string) ([]byte, error) {
+		joined := strings.Join(args, " ")
+		switch {
+		case strings.Contains(joined, "gitrepositories"):
+			return []byte(`{"items":[{"metadata":{"name":"flux-system","namespace":"flux-system"},"spec":{"ref":{"tag":"v1.0.0"}}}]}`), nil
+		case strings.Contains(joined, "kustomizations"):
+			return []byte(`{"items":[]}`), nil
+		case strings.Contains(joined, "helmreleases"):
+			return []byte(`{"items":[]}`), nil
+		default:
+			return nil, errors.New("unexpected command")
+		}
+	}
+
+	snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner)
+	if !snapshot.ScrapeSuccess || len(snapshot.FluxSources) != 1 || snapshot.FluxSources[0].Branch != "v1.0.0" {
+		t.Fatalf("unexpected snapshot: %+v", snapshot)
+	}
+}
+
+// TestMaybeStartGitOpsSnapshot runs one orchestration or CLI step.
+// Signature: TestMaybeStartGitOpsSnapshot(t *testing.T).
+// Why: Ananke must collect GitOps status asynchronously so a slow Kubernetes API
+// cannot delay UPS polling.
+func TestMaybeStartGitOpsSnapshot(t *testing.T) {
+	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
+		return []byte(`{"items":[]}`), nil
+	}
+	exporter := metrics.New()
+	done := make(chan struct{}, 1)
+	lastRun := time.Time{}
+	cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}}
+	withRunner := func() bool {
+		return service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner).ScrapeSuccess
+	}
+	if !withRunner() {
+		t.Fatalf("expected injected runner sanity check to pass")
+	}
+	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, false, done, runner); !running {
+		t.Fatalf("expected scrape to start")
+	}
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatalf("timed out waiting for scrape")
+	}
+
+	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	rr := httptest.NewRecorder()
+	exporter.Handler("/metrics").ServeHTTP(rr, req)
+	if !strings.Contains(rr.Body.String(), "ananke_gitops_scrape_success") {
+		t.Fatalf("expected GitOps scrape metric, got:\n%s", rr.Body.String())
+	}
+	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, true, done); !running {
+		t.Fatalf("running scrape should stay marked running")
+	}
+	cfg.GitOps.Enabled = false
+	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, nil, &lastRun, false, done); running {
+		t.Fatalf("disabled GitOps scrape should not start")
+	}
+}
+
+// TestRunGitOpsKubectlOutput runs one orchestration or CLI step.
+// Signature: TestRunGitOpsKubectlOutput(t *testing.T).
+// Why: command diagnostics should preserve stderr for operator-visible errors.
+func TestRunGitOpsKubectlOutput(t *testing.T) {
+	out, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "printf ok")
+	if err != nil || string(out) != "ok" {
+		t.Fatalf("unexpected command success result: out=%q err=%v", out, err)
+	}
+	if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "echo bad >&2; exit 7"); err == nil || !strings.Contains(err.Error(), "bad") {
+		t.Fatalf("expected stderr to be preserved in error, got %v", err)
+	}
+	if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "exit 8"); err == nil {
+		t.Fatalf("expected empty-stderr command failure")
+	}
+}
+
+// TestGitOpsParseErrors runs one orchestration or CLI step.
+// Signature: TestGitOpsParseErrors(t *testing.T).
+// Why: invalid JSON should return parse errors for every Flux resource parser.
+func TestGitOpsParseErrors(t *testing.T) {
+	if _, err := service.TestHookParseGitRepositories([]byte(`not-json`)); err == nil {
+		t.Fatalf("expected GitRepository parse error")
+	}
+	if _, err := service.TestHookParseKustomizations([]byte(`not-json`)); err == nil {
+		t.Fatalf("expected Kustomization parse error")
+	}
+	if _, err := service.TestHookParseHelmReleases([]byte(`not-json`)); err == nil {
+		t.Fatalf("expected HelmRelease parse error")
+	}
+}
+
+// TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter runs one
+// orchestration or CLI step.
+// Signature: TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T).
+// Why: disabled or already-fresh scrapes should be cheap no-ops in the daemon
+// loop.
+func TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T) {
+	cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 60}}
+	done := make(chan struct{}, 1)
+	lastRun := time.Now()
+	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, nil, nil, &lastRun, false, done); running {
+		t.Fatalf("nil exporter should not start scrape")
+	}
+	if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done); running {
+		t.Fatalf("fresh sample should not start another scrape")
+	}
+	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
+		return []byte(`{"items":[]}`), nil
+	}
+	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, true, done, runner); !running {
+		t.Fatalf("already-running scrape should remain marked running")
+	}
+	cfg.GitOps.Enabled = false
+	lastRun = time.Time{}
+	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); running {
+		t.Fatalf("disabled scrape should not start through runner hook")
+	}
+}
+
+// TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger runs one
+// orchestration or CLI step.
+// Signature: TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T).
+// Why: scrape failure handling should not depend on a logger being present.
+func TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T) {
+	done := make(chan struct{}, 1)
+	lastRun := time.Time{}
+	cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}}
+	runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) {
+		return nil, errors.New("api down")
+	}
+	if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); !running {
+		t.Fatalf("expected scrape to start")
+	}
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatalf("timed out waiting for failed scrape")
+	}
+}
+
+// TestGitOpsHelpers runs one orchestration or CLI step.
+// Signature: TestGitOpsHelpers(t *testing.T).
+// Why: tiny label helpers carry dashboard readability contracts and are worth
+// locking down.
+func TestGitOpsHelpers(t *testing.T) {
+	if got := service.TestHookGitOpsDefaultString("  ", "fallback"); got != "fallback" {
+		t.Fatalf("unexpected defaultString fallback: %q", got)
+	}
+	if got := service.TestHookGitOpsFirstNonEmpty(" ", "second"); got != "second" {
+		t.Fatalf("unexpected firstNonEmpty result: %q", got)
+	}
+	if got := service.TestHookGitOpsFirstNonEmpty(" ", ""); got != "" {
+		t.Fatalf("unexpected empty firstNonEmpty result: %q", got)
+	}
+	if got := service.TestHookGitOpsErrorSummary(nil); got != "none" {
+		t.Fatalf("unexpected empty summary: %q", got)
+	}
+	ready, reason := service.TestHookFluxReadyMissing()
+	if ready || reason != "ReadyMissing" {
+		t.Fatalf("unexpected empty Ready condition parse: ready=%t reason=%s", ready, reason)
+	}
+}