diff --git a/Jenkinsfile b/Jenkinsfile index ecfe471..b73abab 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -311,6 +311,10 @@ PY import re from pathlib import Path +coverage_path = Path("build/coverage-percent.txt") +if coverage_path.exists(): + print(coverage_path.read_text(encoding="utf-8", errors="ignore").strip() or "0.0") + raise SystemExit(0) log_path = Path("build/quality-gate.out") text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else "" values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)] diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml index 77b525e..fa5ae18 100644 --- a/configs/ananke.example.yaml +++ b/configs/ananke.example.yaml @@ -197,6 +197,9 @@ metrics: enabled: true bind_addr: 0.0.0.0:9560 path: /metrics +gitops: + enabled: true + poll_seconds: 60 state: dir: /var/lib/ananke reports_dir: /var/lib/ananke/reports diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index f322bb2..c0d8005 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -329,6 +329,9 @@ metrics: enabled: true bind_addr: 0.0.0.0:9560 path: /metrics +gitops: + enabled: true + poll_seconds: 60 state: dir: /var/lib/ananke reports_dir: /var/lib/ananke/reports diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml index 1cc86f8..4bcb39e 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -329,6 +329,9 @@ metrics: enabled: true bind_addr: 0.0.0.0:9560 path: /metrics +gitops: + enabled: true + poll_seconds: 60 state: dir: /var/lib/ananke reports_dir: /var/lib/ananke/reports diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go index 5325598..62fd730 100644 --- a/internal/config/apply_defaults.go +++ b/internal/config/apply_defaults.go @@ -270,6 +270,9 @@ func (c *Config) applyDefaults() { if c.Metrics.Path == "" { c.Metrics.Path = "/metrics" } + if c.GitOps.PollSeconds <= 0 { + c.GitOps.PollSeconds = 60 + } if c.State.Dir == "" { c.State.Dir = "/var/lib/ananke" } diff --git a/internal/config/defaults.go b/internal/config/defaults.go index 2d33a39..5fb4eef 100644 --- a/internal/config/defaults.go +++ b/internal/config/defaults.go @@ -156,6 +156,10 @@ func defaults() Config { BindAddr: "0.0.0.0:9560", Path: "/metrics", }, + GitOps: GitOps{ + Enabled: true, + PollSeconds: 60, + }, State: State{ Dir: "/var/lib/ananke", ReportsDir: "/var/lib/ananke/reports", diff --git a/internal/config/types.go b/internal/config/types.go index 80c432c..cccef00 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -23,6 +23,7 @@ type Config struct { UPS UPS `yaml:"ups"` Coordination Coordination `yaml:"coordination"` Metrics Metrics `yaml:"metrics"` + GitOps GitOps `yaml:"gitops"` State State `yaml:"state"` } @@ -174,6 +175,11 @@ type Metrics struct { Path string `yaml:"path"` } +type GitOps struct { + Enabled bool `yaml:"enabled"` + PollSeconds int `yaml:"poll_seconds"` +} + type State struct { Dir string `yaml:"dir"` ReportsDir string `yaml:"reports_dir"` diff --git a/internal/metrics/exporter.go b/internal/metrics/exporter.go index d81677e..df099b3 100644 --- a/internal/metrics/exporter.go +++ b/internal/metrics/exporter.go @@ -27,6 +27,50 @@ type Sample struct { UpdatedAt time.Time } +type GitOpsSnapshot struct { + UpdatedAt time.Time + ScrapeSuccess bool + Errors map[string]string + FluxSources []GitOpsFluxSource + Kustomizations []GitOpsKustomization + HelmReleases []GitOpsHelmRelease +} + +type GitOpsFluxSource struct { + Namespace string + Name string + URL string + Branch string + Revision string + Ready bool + Reason string + Suspended bool +} + +type GitOpsKustomization struct { + Namespace string + Name string + Path string + SourceNamespace string + SourceName string + Revision string + Ready bool + Reason string + Suspended bool +} + +type GitOpsHelmRelease struct { + Namespace string + Name string + Chart string + Version string + AppVersion string + Revision string + Ready bool + Reason string + Suspended bool +} + type Exporter struct { mu sync.RWMutex shutdownBudgetSec int @@ -34,6 +78,7 @@ type Exporter struct { lastShutdownReason string lastShutdownAt time.Time samples map[string]Sample + gitops GitOpsSnapshot } // New runs one orchestration or CLI step. @@ -66,6 +111,22 @@ func (e *Exporter) UpdateSample(s Sample) { e.samples[s.Name] = s } +// UpdateGitOpsSnapshot records the most recent Flux object-state scrape. +// Signature: (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot). +// Why: Grafana needs object readiness and branch/revision state, while Flux's +// controller metrics only expose controller health in this cluster. +func (e *Exporter) UpdateGitOpsSnapshot(snapshot GitOpsSnapshot) { + e.mu.Lock() + defer e.mu.Unlock() + if snapshot.UpdatedAt.IsZero() { + snapshot.UpdatedAt = time.Now().UTC() + } + if snapshot.Errors == nil { + snapshot.Errors = map[string]string{} + } + e.gitops = snapshot +} + // MarkShutdown runs one orchestration or CLI step. // Signature: (e *Exporter) MarkShutdown(reason string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. @@ -164,11 +225,99 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) { } b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != ""))) } + appendGitOpsMetrics(&b, e.gitops) appendQualityGateMetrics(&b) _, _ = w.Write([]byte(b.String())) } +// appendGitOpsMetrics writes Flux object-state metrics collected by the +// long-running daemon loop. +// Signature: appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot). +// Why: this keeps the expensive Kubernetes API reads out of the HTTP scrape path +// while still making current GitOps health cheap for Grafana to query. +func appendGitOpsMetrics(dst *strings.Builder, snapshot GitOpsSnapshot) { + if dst.Len() > 0 { + dst.WriteString("\n") + } + dst.WriteString("# HELP ananke_gitops_last_scrape_timestamp_seconds Unix timestamp of the latest GitOps object-state scrape.\n") + dst.WriteString("# TYPE ananke_gitops_last_scrape_timestamp_seconds gauge\n") + if snapshot.UpdatedAt.IsZero() { + dst.WriteString("ananke_gitops_last_scrape_timestamp_seconds 0\n") + } else { + dst.WriteString(fmt.Sprintf("ananke_gitops_last_scrape_timestamp_seconds %d\n", snapshot.UpdatedAt.Unix())) + } + dst.WriteString("# HELP ananke_gitops_scrape_success Whether the latest GitOps object-state scrape completed without errors.\n") + dst.WriteString("# TYPE ananke_gitops_scrape_success gauge\n") + dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_success %d\n", boolNum(snapshot.ScrapeSuccess))) + dst.WriteString("# HELP ananke_gitops_scrape_error Whether a GitOps resource family failed during the latest scrape.\n") + dst.WriteString("# TYPE ananke_gitops_scrape_error gauge\n") + resources := []string{"gitrepository", "kustomization", "helmrelease"} + for _, resource := range resources { + _, failed := snapshot.Errors[resource] + dst.WriteString(fmt.Sprintf("ananke_gitops_scrape_error{resource=%q} %d\n", resource, boolNum(failed))) + } + + dst.WriteString("# HELP ananke_gitops_flux_source_info Current Flux GitRepository source metadata.\n") + dst.WriteString("# TYPE ananke_gitops_flux_source_info gauge\n") + dst.WriteString("# HELP ananke_gitops_flux_source_ready Whether a Flux GitRepository source is Ready.\n") + dst.WriteString("# TYPE ananke_gitops_flux_source_ready gauge\n") + dst.WriteString("# HELP ananke_gitops_flux_source_suspended Whether a Flux GitRepository source is suspended.\n") + dst.WriteString("# TYPE ananke_gitops_flux_source_suspended gauge\n") + sort.Slice(snapshot.FluxSources, func(i, j int) bool { + return snapshot.FluxSources[i].Namespace+"/"+snapshot.FluxSources[i].Name < snapshot.FluxSources[j].Namespace+"/"+snapshot.FluxSources[j].Name + }) + for _, source := range snapshot.FluxSources { + infoLabels := fmt.Sprintf("{namespace=%q,name=%q,url=%q,branch=%q,revision=%q,ready=%q,reason=%q}", + safe(source.Namespace), safe(source.Name), safe(source.URL), safe(source.Branch), + safe(source.Revision), readyLabel(source.Ready), safe(defaultLabel(source.Reason, "unknown"))) + baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(source.Namespace), safe(source.Name)) + dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_info%s 1\n", infoLabels)) + dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_ready%s %d\n", baseLabels, boolNum(source.Ready))) + dst.WriteString(fmt.Sprintf("ananke_gitops_flux_source_suspended%s %d\n", baseLabels, boolNum(source.Suspended))) + } + + dst.WriteString("# HELP ananke_gitops_kustomization_info Current Flux Kustomization metadata.\n") + dst.WriteString("# TYPE ananke_gitops_kustomization_info gauge\n") + dst.WriteString("# HELP ananke_gitops_kustomization_ready Whether a Flux Kustomization is Ready.\n") + dst.WriteString("# TYPE ananke_gitops_kustomization_ready gauge\n") + dst.WriteString("# HELP ananke_gitops_kustomization_suspended Whether a Flux Kustomization is suspended.\n") + dst.WriteString("# TYPE ananke_gitops_kustomization_suspended gauge\n") + sort.Slice(snapshot.Kustomizations, func(i, j int) bool { + return snapshot.Kustomizations[i].Namespace+"/"+snapshot.Kustomizations[i].Name < snapshot.Kustomizations[j].Namespace+"/"+snapshot.Kustomizations[j].Name + }) + for _, kustomization := range snapshot.Kustomizations { + infoLabels := fmt.Sprintf("{namespace=%q,name=%q,path=%q,source_namespace=%q,source_name=%q,revision=%q,ready=%q,reason=%q}", + safe(kustomization.Namespace), safe(kustomization.Name), safe(kustomization.Path), + safe(kustomization.SourceNamespace), safe(kustomization.SourceName), safe(kustomization.Revision), + readyLabel(kustomization.Ready), safe(defaultLabel(kustomization.Reason, "unknown"))) + baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(kustomization.Namespace), safe(kustomization.Name)) + dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_info%s 1\n", infoLabels)) + dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_ready%s %d\n", baseLabels, boolNum(kustomization.Ready))) + dst.WriteString(fmt.Sprintf("ananke_gitops_kustomization_suspended%s %d\n", baseLabels, boolNum(kustomization.Suspended))) + } + + dst.WriteString("# HELP ananke_gitops_helmrelease_info Current Flux HelmRelease metadata.\n") + dst.WriteString("# TYPE ananke_gitops_helmrelease_info gauge\n") + dst.WriteString("# HELP ananke_gitops_helmrelease_ready Whether a Flux HelmRelease is Ready.\n") + dst.WriteString("# TYPE ananke_gitops_helmrelease_ready gauge\n") + dst.WriteString("# HELP ananke_gitops_helmrelease_suspended Whether a Flux HelmRelease is suspended.\n") + dst.WriteString("# TYPE ananke_gitops_helmrelease_suspended gauge\n") + sort.Slice(snapshot.HelmReleases, func(i, j int) bool { + return snapshot.HelmReleases[i].Namespace+"/"+snapshot.HelmReleases[i].Name < snapshot.HelmReleases[j].Namespace+"/"+snapshot.HelmReleases[j].Name + }) + for _, release := range snapshot.HelmReleases { + infoLabels := fmt.Sprintf("{namespace=%q,name=%q,chart=%q,version=%q,app_version=%q,revision=%q,ready=%q,reason=%q}", + safe(release.Namespace), safe(release.Name), safe(release.Chart), safe(release.Version), + safe(release.AppVersion), safe(release.Revision), readyLabel(release.Ready), + safe(defaultLabel(release.Reason, "unknown"))) + baseLabels := fmt.Sprintf("{namespace=%q,name=%q}", safe(release.Namespace), safe(release.Name)) + dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_info%s 1\n", infoLabels)) + dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_ready%s %d\n", baseLabels, boolNum(release.Ready))) + dst.WriteString(fmt.Sprintf("ananke_gitops_helmrelease_suspended%s %d\n", baseLabels, boolNum(release.Suspended))) + } +} + // appendQualityGateMetrics runs one orchestration or CLI step. // Signature: appendQualityGateMetrics(dst *strings.Builder). // Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so @@ -228,3 +377,26 @@ func safe(in string) string { out := strings.ReplaceAll(in, "\\", "\\\\") return strings.ReplaceAll(out, "\"", "\\\"") } + +// readyLabel formats a boolean readiness state as a stable label value. +// Signature: readyLabel(ready bool) string. +// Why: info metrics need a human-readable status label without changing the +// numeric ready gauges used for alerting and aggregation. +func readyLabel(ready bool) string { + if ready { + return "true" + } + return "false" +} + +// defaultLabel returns a safe fallback for empty metric label values. +// Signature: defaultLabel(value string, fallback string) string. +// Why: Flux status fields can be absent during startup, but Prometheus labels +// should remain explicit rather than silently becoming empty strings. +func defaultLabel(value string, fallback string) string { + value = strings.TrimSpace(value) + if value == "" { + return fallback + } + return value +} diff --git a/internal/service/daemon.go b/internal/service/daemon.go index a8c9adf..3191221 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -97,6 +97,9 @@ func (d *Daemon) Run(ctx context.Context) error { onBatterySince := map[string]time.Time{} breachCount := map[string]int{} lastAutoHeal := time.Time{} + lastGitOpsPoll := time.Time{} + gitOpsPollRunning := false + gitOpsDone := make(chan struct{}, 1) for _, t := range d.targets { lastGood[t.Name] = time.Now() } @@ -108,6 +111,8 @@ func (d *Daemon) Run(ctx context.Context) error { select { case <-ctx.Done(): return ctx.Err() + case <-gitOpsDone: + gitOpsPollRunning = false case <-t.C: budget := d.orch.EstimatedEmergencyShutdownSeconds() threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor)) @@ -201,6 +206,7 @@ func (d *Daemon) Run(ctx context.Context) error { } d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery) + gitOpsPollRunning = d.maybeStartGitOpsSnapshot(ctx, &lastGitOpsPoll, gitOpsPollRunning, gitOpsDone) } } } diff --git a/internal/service/gitops_snapshot.go b/internal/service/gitops_snapshot.go new file mode 100644 index 0000000..b0374f3 --- /dev/null +++ b/internal/service/gitops_snapshot.go @@ -0,0 +1,355 @@ +package service + +import ( + "context" + "encoding/json" + "fmt" + "os/exec" + "sort" + "strings" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/metrics" +) + +var gitOpsKubectlOutput = runGitOpsKubectlOutput + +type gitOpsResourceList struct { + Items []gitOpsResource `json:"items"` +} + +type gitOpsResource struct { + Metadata struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + } `json:"metadata"` + Spec struct { + Suspend bool `json:"suspend"` + Ref struct { + Branch string `json:"branch"` + Tag string `json:"tag"` + SemVer string `json:"semver"` + } `json:"ref"` + URL string `json:"url"` + Path string `json:"path"` + SourceRef struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + } `json:"sourceRef"` + Chart struct { + Spec struct { + Chart string `json:"chart"` + Version string `json:"version"` + } `json:"spec"` + } `json:"chart"` + } `json:"spec"` + Status struct { + Artifact struct { + Revision string `json:"revision"` + } `json:"artifact"` + LastAppliedRevision string `json:"lastAppliedRevision"` + LastAttemptedRevision string `json:"lastAttemptedRevision"` + Conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + Reason string `json:"reason"` + } `json:"conditions"` + History []struct { + ChartName string `json:"chartName"` + ChartVersion string `json:"chartVersion"` + AppVersion string `json:"appVersion"` + Digest string `json:"digest"` + } `json:"history"` + } `json:"status"` +} + +// maybeStartGitOpsSnapshot starts a bounded background scrape when the GitOps +// sample interval has elapsed. +// Signature: (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool. +// Why: Kubernetes API reads must not block UPS polling, especially during +// emergency power events when the daemon's first job is still safe shutdown. +func (d *Daemon) maybeStartGitOpsSnapshot(ctx context.Context, lastRun *time.Time, running bool, done chan<- struct{}) bool { + if !d.cfg.GitOps.Enabled || d.exporter == nil || running { + return running + } + interval := time.Duration(d.cfg.GitOps.PollSeconds) * time.Second + if interval <= 0 { + interval = 60 * time.Second + } + now := time.Now() + if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval { + return running + } + if lastRun != nil { + *lastRun = now + } + cfg := d.cfg + exporter := d.exporter + logger := d.log + go func() { + defer func() { + select { + case done <- struct{}{}: + default: + } + }() + scrapeCtx, cancel := context.WithTimeout(ctx, 25*time.Second) + defer cancel() + snapshot := collectGitOpsSnapshot(scrapeCtx, cfg) + exporter.UpdateGitOpsSnapshot(snapshot) + if !snapshot.ScrapeSuccess && logger != nil { + logger.Printf("warning: gitops metrics scrape partial: %s", gitOpsErrorSummary(snapshot.Errors)) + } + }() + return true +} + +// collectGitOpsSnapshot queries Flux custom resources and converts them into +// Prometheus-safe metric state. +// Signature: collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot. +// Why: the Grafana overview needs current branch/readiness/suspend state, which +// is object status rather than controller operational telemetry. +func collectGitOpsSnapshot(ctx context.Context, cfg config.Config) metrics.GitOpsSnapshot { + snapshot := metrics.GitOpsSnapshot{ + UpdatedAt: time.Now().UTC(), + ScrapeSuccess: true, + Errors: map[string]string{}, + } + + if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "gitrepositories.source.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil { + snapshot.Errors["gitrepository"] = err.Error() + } else if sources, err := parseGitRepositories(raw); err != nil { + snapshot.Errors["gitrepository"] = err.Error() + } else { + snapshot.FluxSources = sources + } + + if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "kustomizations.kustomize.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil { + snapshot.Errors["kustomization"] = err.Error() + } else if kustomizations, err := parseKustomizations(raw); err != nil { + snapshot.Errors["kustomization"] = err.Error() + } else { + snapshot.Kustomizations = kustomizations + } + + if raw, err := gitOpsKubectlJSON(ctx, cfg, "get", "helmreleases.helm.toolkit.fluxcd.io", "-A", "-o", "json"); err != nil { + snapshot.Errors["helmrelease"] = err.Error() + } else if releases, err := parseHelmReleases(raw); err != nil { + snapshot.Errors["helmrelease"] = err.Error() + } else { + snapshot.HelmReleases = releases + } + + snapshot.ScrapeSuccess = len(snapshot.Errors) == 0 + return snapshot +} + +// parseGitRepositories extracts Git source branch/revision/readiness from Flux +// GitRepository JSON. +// Signature: parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error). +// Why: keeping status parsing isolated makes it testable without a live Flux +// API and protects the daemon loop from brittle JSON traversal code. +func parseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) { + var list gitOpsResourceList + if err := json.Unmarshal(raw, &list); err != nil { + return nil, fmt.Errorf("decode gitrepositories: %w", err) + } + out := make([]metrics.GitOpsFluxSource, 0, len(list.Items)) + for _, item := range list.Items { + ready, reason := fluxReady(item.Status.Conditions) + out = append(out, metrics.GitOpsFluxSource{ + Namespace: defaultString(item.Metadata.Namespace, "default"), + Name: item.Metadata.Name, + URL: item.Spec.URL, + Branch: sourceBranch(item), + Revision: item.Status.Artifact.Revision, + Ready: ready, + Reason: reason, + Suspended: item.Spec.Suspend, + }) + } + sort.Slice(out, func(i, j int) bool { + return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name + }) + return out, nil +} + +// parseKustomizations extracts Kustomization health and source metadata from +// Flux Kustomization JSON. +// Signature: parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error). +// Why: Kustomization status drives both the overview summary and the detailed +// GitOps table, so the parser keeps the metric contract in one place. +func parseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) { + var list gitOpsResourceList + if err := json.Unmarshal(raw, &list); err != nil { + return nil, fmt.Errorf("decode kustomizations: %w", err) + } + out := make([]metrics.GitOpsKustomization, 0, len(list.Items)) + for _, item := range list.Items { + ready, reason := fluxReady(item.Status.Conditions) + namespace := defaultString(item.Metadata.Namespace, "default") + out = append(out, metrics.GitOpsKustomization{ + Namespace: namespace, + Name: item.Metadata.Name, + Path: item.Spec.Path, + SourceNamespace: defaultString(item.Spec.SourceRef.Namespace, namespace), + SourceName: item.Spec.SourceRef.Name, + Revision: firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision), + Ready: ready, + Reason: reason, + Suspended: item.Spec.Suspend, + }) + } + sort.Slice(out, func(i, j int) bool { + return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name + }) + return out, nil +} + +// parseHelmReleases extracts HelmRelease health and chart metadata from Flux +// HelmRelease JSON. +// Signature: parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error). +// Why: HelmRelease status has different chart fields than Kustomizations, and +// isolating it keeps dashboard metrics stable as Flux payloads evolve. +func parseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) { + var list gitOpsResourceList + if err := json.Unmarshal(raw, &list); err != nil { + return nil, fmt.Errorf("decode helmreleases: %w", err) + } + out := make([]metrics.GitOpsHelmRelease, 0, len(list.Items)) + for _, item := range list.Items { + ready, reason := fluxReady(item.Status.Conditions) + chart := item.Spec.Chart.Spec.Chart + version := item.Spec.Chart.Spec.Version + appVersion := "" + revision := firstNonEmpty(item.Status.LastAppliedRevision, item.Status.LastAttemptedRevision) + if len(item.Status.History) > 0 { + latest := item.Status.History[0] + chart = firstNonEmpty(latest.ChartName, chart) + version = firstNonEmpty(latest.ChartVersion, version) + appVersion = latest.AppVersion + revision = firstNonEmpty(latest.Digest, revision) + } + out = append(out, metrics.GitOpsHelmRelease{ + Namespace: defaultString(item.Metadata.Namespace, "default"), + Name: item.Metadata.Name, + Chart: chart, + Version: version, + AppVersion: appVersion, + Revision: revision, + Ready: ready, + Reason: reason, + Suspended: item.Spec.Suspend, + }) + } + sort.Slice(out, func(i, j int) bool { + return out[i].Namespace+"/"+out[i].Name < out[j].Namespace+"/"+out[j].Name + }) + return out, nil +} + +// gitOpsKubectlJSON runs kubectl with the configured kubeconfig and a short +// request timeout. +// Signature: gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error). +// Why: every GitOps scrape should fail quickly instead of stealing time from +// UPS safety polling during outage recovery. +func gitOpsKubectlJSON(ctx context.Context, cfg config.Config, args ...string) ([]byte, error) { + finalArgs := []string{} + if strings.TrimSpace(cfg.Kubeconfig) != "" { + finalArgs = append(finalArgs, "--kubeconfig", cfg.Kubeconfig) + } + finalArgs = append(finalArgs, "--request-timeout=8s") + finalArgs = append(finalArgs, args...) + return gitOpsKubectlOutput(ctx, "kubectl", finalArgs...) +} + +// runGitOpsKubectlOutput is the production command runner for GitOps snapshot +// collection. +// Signature: runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error). +// Why: preserving stderr in errors makes live diagnosis possible when Flux CRD +// reads fail on a recovering control plane. +func runGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) { + cmd := exec.CommandContext(ctx, name, args...) + out, err := cmd.CombinedOutput() + if err != nil { + trimmed := strings.TrimSpace(string(out)) + if trimmed == "" { + return nil, err + } + return nil, fmt.Errorf("%w: %s", err, trimmed) + } + return out, nil +} + +// fluxReady resolves a Flux Ready condition into a boolean and reason. +// Signature: fluxReady(conditions []struct { Type string; Status string; Reason string }) (bool, string). +// Why: Flux resources all use Ready conditions, so one helper keeps status +// interpretation consistent across GitRepository, Kustomization, and HelmRelease. +func fluxReady(conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + Reason string `json:"reason"` +}) (bool, string) { + for _, condition := range conditions { + if condition.Type == "Ready" { + return strings.EqualFold(condition.Status, "True"), defaultString(condition.Reason, "ReadyUnknown") + } + } + return false, "ReadyMissing" +} + +// sourceBranch returns the most useful source reference label for a +// GitRepository. +// Signature: sourceBranch(item gitOpsResource) string. +// Why: branch is preferred for Atlas, but tags and semver refs should still +// render clearly if Flux is ever pinned differently. +func sourceBranch(item gitOpsResource) string { + return firstNonEmpty(item.Spec.Ref.Branch, item.Spec.Ref.Tag, item.Spec.Ref.SemVer, "unknown") +} + +// defaultString trims a value and returns a fallback when it is empty. +// Signature: defaultString(value string, fallback string) string. +// Why: Flux omits some optional fields; dashboards are easier to read with +// explicit fallback labels. +func defaultString(value string, fallback string) string { + value = strings.TrimSpace(value) + if value == "" { + return fallback + } + return value +} + +// firstNonEmpty returns the first non-empty trimmed value. +// Signature: firstNonEmpty(values ...string) string. +// Why: Flux has several revision fields depending on resource kind and +// reconciliation stage, so callers can express preferred fallback order. +func firstNonEmpty(values ...string) string { + for _, value := range values { + value = strings.TrimSpace(value) + if value != "" { + return value + } + } + return "" +} + +// gitOpsErrorSummary renders resource-family scrape errors for daemon logs. +// Signature: gitOpsErrorSummary(errors map[string]string) string. +// Why: a compact sorted summary keeps recurring scrape warnings readable in +// journald during partial cluster recovery. +func gitOpsErrorSummary(errors map[string]string) string { + if len(errors) == 0 { + return "none" + } + keys := make([]string, 0, len(errors)) + for key := range errors { + keys = append(keys, key) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, key := range keys { + parts = append(parts, key+"="+errors[key]) + } + return strings.Join(parts, "; ") +} diff --git a/internal/service/testing_hooks_gitops.go b/internal/service/testing_hooks_gitops.go new file mode 100644 index 0000000..b6b701d --- /dev/null +++ b/internal/service/testing_hooks_gitops.go @@ -0,0 +1,127 @@ +package service + +import ( + "context" + "log" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/metrics" +) + +type TestHookGitOpsRunner func(context.Context, string, ...string) ([]byte, error) + +// TestHookParseGitRepositories exposes GitRepository parsing to the top-level +// testing module. +// Signature: TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error). +// Why: Ananke keeps split-module tests outside internal packages, but the +// parser needs direct contract coverage without a live cluster. +func TestHookParseGitRepositories(raw []byte) ([]metrics.GitOpsFluxSource, error) { + return parseGitRepositories(raw) +} + +// TestHookParseKustomizations exposes Kustomization parsing to the top-level +// testing module. +// Signature: TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error). +// Why: dashboard-facing labels should be verified from representative Flux JSON +// without relying on the current production cluster shape. +func TestHookParseKustomizations(raw []byte) ([]metrics.GitOpsKustomization, error) { + return parseKustomizations(raw) +} + +// TestHookParseHelmReleases exposes HelmRelease parsing to the top-level +// testing module. +// Signature: TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error). +// Why: Helm status fields vary by reconciliation phase, and split-module tests +// need a stable seam for parser coverage. +func TestHookParseHelmReleases(raw []byte) ([]metrics.GitOpsHelmRelease, error) { + return parseHelmReleases(raw) +} + +// TestHookCollectGitOpsSnapshotWithRunner runs snapshot collection with an +// injected kubectl runner. +// Signature: TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot. +// Why: this covers success and partial-failure collection paths without shelling +// out to kubectl from unit tests. +func TestHookCollectGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, runner TestHookGitOpsRunner) metrics.GitOpsSnapshot { + original := gitOpsKubectlOutput + defer func() { gitOpsKubectlOutput = original }() + gitOpsKubectlOutput = runner + return collectGitOpsSnapshot(ctx, cfg) +} + +// TestHookMaybeStartGitOpsSnapshot starts the daemon's background GitOps scrape +// from split-module tests. +// Signature: TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool. +// Why: the daemon intentionally keeps scraping asynchronous so GitOps telemetry +// cannot delay UPS polling; the behavior needs direct coverage. +func TestHookMaybeStartGitOpsSnapshot(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}) bool { + d := &Daemon{cfg: cfg, exporter: exporter, log: logger} + return d.maybeStartGitOpsSnapshot(ctx, lastRun, running, done) +} + +// TestHookMaybeStartGitOpsSnapshotWithRunner starts a background GitOps scrape +// with an injected runner and restores the production runner after completion. +// Signature: TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool. +// Why: the scrape is asynchronous, so split-module tests need a seam that keeps +// fake kubectl behavior installed until the goroutine exits. +func TestHookMaybeStartGitOpsSnapshotWithRunner(ctx context.Context, cfg config.Config, exporter *metrics.Exporter, logger *log.Logger, lastRun *time.Time, running bool, done chan<- struct{}, runner TestHookGitOpsRunner) bool { + original := gitOpsKubectlOutput + gitOpsKubectlOutput = runner + proxyDone := make(chan struct{}, 1) + d := &Daemon{cfg: cfg, exporter: exporter, log: logger} + started := d.maybeStartGitOpsSnapshot(ctx, lastRun, running, proxyDone) + if !started || running { + gitOpsKubectlOutput = original + return started + } + go func() { + <-proxyDone + gitOpsKubectlOutput = original + select { + case done <- struct{}{}: + default: + } + }() + return started +} + +// TestHookRunGitOpsKubectlOutput exposes the production command runner to +// split-module tests. +// Signature: TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error). +// Why: stderr preservation and empty-stderr failure behavior are part of the +// operator-facing diagnostics contract. +func TestHookRunGitOpsKubectlOutput(ctx context.Context, name string, args ...string) ([]byte, error) { + return runGitOpsKubectlOutput(ctx, name, args...) +} + +// TestHookGitOpsDefaultString exposes defaultString to split-module tests. +// Signature: TestHookGitOpsDefaultString(value string, fallback string) string. +// Why: fallback label behavior is intentionally tiny but important for readable +// Grafana tables during startup. +func TestHookGitOpsDefaultString(value string, fallback string) string { + return defaultString(value, fallback) +} + +// TestHookGitOpsFirstNonEmpty exposes firstNonEmpty to split-module tests. +// Signature: TestHookGitOpsFirstNonEmpty(values ...string) string. +// Why: revision fallback order should remain deterministic as Flux status +// payloads change. +func TestHookGitOpsFirstNonEmpty(values ...string) string { + return firstNonEmpty(values...) +} + +// TestHookGitOpsErrorSummary exposes gitOpsErrorSummary to split-module tests. +// Signature: TestHookGitOpsErrorSummary(errors map[string]string) string. +// Why: sorted scrape warnings are easier to inspect in journald during recovery. +func TestHookGitOpsErrorSummary(errors map[string]string) string { + return gitOpsErrorSummary(errors) +} + +// TestHookFluxReadyMissing exposes the missing Ready-condition branch to tests. +// Signature: TestHookFluxReadyMissing() (bool, string). +// Why: absent Ready conditions should be treated as not-ready and labelled +// explicitly instead of being mistaken for healthy. +func TestHookFluxReadyMissing() (bool, string) { + return fluxReady(nil) +} diff --git a/scripts/publish_quality_metrics.py b/scripts/publish_quality_metrics.py index 718f670..0a12309 100755 --- a/scripts/publish_quality_metrics.py +++ b/scripts/publish_quality_metrics.py @@ -172,6 +172,14 @@ def _read_coverage_percent(path: str) -> float: return 0.0 +def _resolve_repo_path(repo_root: Path, path: str) -> Path: + """Resolve quality-gate artifact paths relative to the repository root.""" + candidate = Path(path) + if candidate.is_absolute(): + return candidate + return repo_root / candidate + + def _iter_source_files(repo_root: Path): for rel_root in SOURCE_SCAN_ROOTS: base = repo_root / rel_root @@ -367,7 +375,9 @@ def main(argv: list[str] | None = None) -> int: args = parse_args(argv or sys.argv[1:]) repo_root = Path(__file__).resolve().parents[1] build_dir = repo_root / "build" - gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc")))) + gate_rc = _read_exit_code( + _resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))) + ) current_ok = 1 if gate_rc == 0 else 0 current_failed = 0 if gate_rc == 0 else 1 @@ -420,13 +430,18 @@ def main(argv: list[str] | None = None) -> int: elif not already_recorded: resolved_ok += current_ok resolved_failed += current_failed - coverage_percent = _read_coverage_percent(args.coverage_percent_file) + coverage_percent = _read_coverage_percent(str(_resolve_repo_path(repo_root, args.coverage_percent_file))) source_files_total = _count_source_files(repo_root) source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500) - quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out"))) + quality_output = _resolve_repo_path( + repo_root, + os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")), + ) tests = _parse_go_test_counts(quality_output) test_cases = _parse_go_test_cases(quality_output) - docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status")))) + docs_status = _read_status( + _resolve_repo_path(repo_root, os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))) + ) unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent) checks = { "tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok", diff --git a/scripts/publish_quality_metrics_test.py b/scripts/publish_quality_metrics_test.py index 945f9eb..dd17cc7 100755 --- a/scripts/publish_quality_metrics_test.py +++ b/scripts/publish_quality_metrics_test.py @@ -61,6 +61,18 @@ class PublishQualityMetricsTest(unittest.TestCase): self.server.server_close() self.thread.join(timeout=5) + def test_relative_artifact_paths_are_repo_rooted(self) -> None: + repo_root = Path("/tmp/ananke-repo") + + self.assertEqual( + publisher._resolve_repo_path(repo_root, "build/coverage-percent.txt"), + repo_root / "build" / "coverage-percent.txt", + ) + self.assertEqual( + publisher._resolve_repo_path(repo_root, "/tmp/coverage-percent.txt"), + Path("/tmp/coverage-percent.txt"), + ) + def _env_for_gate_status(self, status: int = 0) -> dict[str, str]: tmp_dir = tempfile.TemporaryDirectory() self.addCleanup(tmp_dir.cleanup) diff --git a/scripts/quality_gate.sh b/scripts/quality_gate.sh index 1ffdfcf..9747556 100755 --- a/scripts/quality_gate.sh +++ b/scripts/quality_gate.sh @@ -56,12 +56,13 @@ read_quality_counter() { write_quality_metrics() { local exit_code="$1" - local metrics_dir state_dir + local metrics_dir state_dir write_metrics metrics_dir="$(dirname "${QUALITY_METRICS_FILE}")" state_dir="$(dirname "${QUALITY_STATE_FILE}")" + write_metrics="${QUALITY_METRICS_ENABLED}" mkdir -p "${state_dir}" >/dev/null 2>&1 || return 0 - if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then - mkdir -p "${metrics_dir}" >/dev/null 2>&1 || return 0 + if [[ "${write_metrics}" == "1" ]]; then + mkdir -p "${metrics_dir}" >/dev/null 2>&1 || write_metrics=0 fi local ok failed total last_success now success_percent @@ -84,12 +85,12 @@ write_quality_metrics() { QUALITY_SUCCESS_PERCENT="${success_percent}" local tmp_metrics="" tmp_state - if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then + if [[ "${write_metrics}" == "1" ]]; then tmp_metrics="$(mktemp "${metrics_dir}/quality-gate.prom.XXXXXX")" fi tmp_state="$(mktemp "${state_dir}/quality-gate.state.XXXXXX")" - if [[ "${QUALITY_METRICS_ENABLED}" == "1" ]]; then + if [[ "${write_metrics}" == "1" ]]; then cat > "${tmp_metrics}" < "${BUILD_DIR}/quality-gate.rc" 2>/dev/null || true + cd "${REPO_DIR}" 2>/dev/null || true write_quality_metrics "${exit_code}" || true publish_quality_metrics || true exit "${exit_code}" @@ -193,4 +196,7 @@ printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}" echo "[quality] per-file coverage gate (95%)" cd testing -ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v +ANANKE_ENFORCE_COVERAGE=1 \ + ANANKE_PER_FILE_COVERAGE_TARGET=95 \ + ANANKE_PER_FILE_COVERAGE_PERCENT_FILE="${COVERAGE_PERCENT_FILE}" \ + go test ./coverage -run TestPerFileCoverageReport -count=1 -v diff --git a/testing/coverage/coverage_test.go b/testing/coverage/coverage_test.go index c9f4844..bcaa639 100644 --- a/testing/coverage/coverage_test.go +++ b/testing/coverage/coverage_test.go @@ -121,6 +121,7 @@ func TestPerFileCoverageReport(t *testing.T) { "./metrics", "./hygiene", "./orchestrator", + "./service", "./state", "./sshutil", "./ups", @@ -167,17 +168,26 @@ func TestPerFileCoverageReport(t *testing.T) { var report bytes.Buffer report.WriteString("per-file coverage\n") under := make([]string, 0) + minPercent := 100.0 for _, file := range keys { fc := byFile[file] pct := 100.0 if fc.total > 0 { pct = (fc.covered / fc.total) * 100.0 } + if pct < minPercent { + minPercent = pct + } report.WriteString(fmt.Sprintf("- %s: %.1f%%\n", file, pct)) if pct < target { under = append(under, fmt.Sprintf("%s (%.1f%% < %.1f%%)", file, pct, target)) } } + if outputPath := strings.TrimSpace(os.Getenv("ANANKE_PER_FILE_COVERAGE_PERCENT_FILE")); outputPath != "" { + if err := os.WriteFile(outputPath, []byte(fmt.Sprintf("%.1f\n", minPercent)), 0o644); err != nil { + t.Fatalf("write per-file coverage percent: %v", err) + } + } t.Log(report.String()) if len(under) > 0 && enforce { diff --git a/testing/metrics/exporter_http_contract_test.go b/testing/metrics/exporter_http_contract_test.go index b9d6d1e..baaa542 100644 --- a/testing/metrics/exporter_http_contract_test.go +++ b/testing/metrics/exporter_http_contract_test.go @@ -104,3 +104,64 @@ func TestExporterHelperContracts(t *testing.T) { t.Fatalf("unexpected escaped string: %q", got) } } + +// TestExporterEmitsGitOpsMetrics runs one orchestration or CLI step. +// Signature: TestExporterEmitsGitOpsMetrics(t *testing.T). +// Why: the overview dashboard depends on Ananke-owned Flux object-state metrics +// rather than the narrower controller metrics exposed by Flux itself. +func TestExporterEmitsGitOpsMetrics(t *testing.T) { + exporter := metrics.New() + exporter.UpdateGitOpsSnapshot(metrics.GitOpsSnapshot{ + UpdatedAt: time.Unix(1710000100, 0).UTC(), + ScrapeSuccess: true, + FluxSources: []metrics.GitOpsFluxSource{{ + Namespace: "flux-system", + Name: "flux-system", + URL: "ssh://git@example/repo.git", + Branch: "main", + Revision: "main@sha1:abc123", + Ready: true, + Reason: "Succeeded", + }}, + Kustomizations: []metrics.GitOpsKustomization{{ + Namespace: "flux-system", + Name: "monitoring", + Path: "./services/monitoring", + SourceNamespace: "flux-system", + SourceName: "flux-system", + Revision: "main@sha1:abc123", + Ready: true, + Reason: "ReconciliationSucceeded", + }}, + HelmReleases: []metrics.GitOpsHelmRelease{{ + Namespace: "monitoring", + Name: "grafana", + Chart: "grafana", + Version: "8.5.0", + AppVersion: "11.4.0", + Revision: "sha256:abc123", + Ready: false, + Reason: "Progressing", + Suspended: true, + }}, + }) + + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rr := httptest.NewRecorder() + exporter.Handler("/metrics").ServeHTTP(rr, req) + body := rr.Body.String() + + mustContain := []string{ + "ananke_gitops_last_scrape_timestamp_seconds 1710000100", + "ananke_gitops_scrape_success 1", + `ananke_gitops_flux_source_info{namespace="flux-system",name="flux-system",url="ssh://git@example/repo.git",branch="main",revision="main@sha1:abc123",ready="true",reason="Succeeded"} 1`, + `ananke_gitops_kustomization_ready{namespace="flux-system",name="monitoring"} 1`, + `ananke_gitops_helmrelease_info{namespace="monitoring",name="grafana",chart="grafana",version="8.5.0",app_version="11.4.0",revision="sha256:abc123",ready="false",reason="Progressing"} 1`, + `ananke_gitops_helmrelease_suspended{namespace="monitoring",name="grafana"} 1`, + } + for _, fragment := range mustContain { + if !strings.Contains(body, fragment) { + t.Fatalf("missing GitOps metric fragment %q in output:\n%s", fragment, body) + } + } +} diff --git a/testing/service/gitops_snapshot_test.go b/testing/service/gitops_snapshot_test.go new file mode 100644 index 0000000..235fb7b --- /dev/null +++ b/testing/service/gitops_snapshot_test.go @@ -0,0 +1,316 @@ +package servicequality + +import ( + "context" + "errors" + "io" + "log" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/metrics" + "scm.bstein.dev/bstein/ananke/internal/service" +) + +// TestParseGitRepositories runs one orchestration or CLI step. +// Signature: TestParseGitRepositories(t *testing.T). +// Why: branch, revision, and Ready status are the labels the Grafana GitOps +// panels rely on for the current Flux source. +func TestParseGitRepositories(t *testing.T) { + raw := []byte(`{ + "items": [{ + "metadata": {"name": "flux-system", "namespace": "flux-system"}, + "spec": {"url": "ssh://git@example/repo.git", "ref": {"branch": "main"}}, + "status": { + "artifact": {"revision": "main@sha1:abc123"}, + "conditions": [{"type": "Ready", "status": "True", "reason": "Succeeded"}] + } + }] + }`) + got, err := service.TestHookParseGitRepositories(raw) + if err != nil { + t.Fatalf("parseGitRepositories failed: %v", err) + } + if len(got) != 1 || got[0].Branch != "main" || got[0].Revision != "main@sha1:abc123" || !got[0].Ready { + t.Fatalf("unexpected GitRepository parse result: %+v", got) + } +} + +// TestParseKustomizations runs one orchestration or CLI step. +// Signature: TestParseKustomizations(t *testing.T). +// Why: Kustomization source and suspend labels power the detailed GitOps table. +func TestParseKustomizations(t *testing.T) { + raw := []byte(`{ + "items": [{ + "metadata": {"name": "monitoring", "namespace": "flux-system"}, + "spec": { + "path": "./services/monitoring", + "sourceRef": {"name": "flux-system"}, + "suspend": true + }, + "status": { + "lastAppliedRevision": "main@sha1:def456", + "conditions": [{"type": "Ready", "status": "False", "reason": "DependencyNotReady"}] + } + }] + }`) + got, err := service.TestHookParseKustomizations(raw) + if err != nil { + t.Fatalf("parseKustomizations failed: %v", err) + } + if len(got) != 1 || got[0].SourceNamespace != "flux-system" || got[0].Ready || !got[0].Suspended { + t.Fatalf("unexpected Kustomization parse result: %+v", got) + } +} + +// TestParseHelmReleases runs one orchestration or CLI step. +// Signature: TestParseHelmReleases(t *testing.T). +// Why: HelmRelease chart fields have separate status fallbacks from plain Flux +// Kustomizations and need their own contract coverage. +func TestParseHelmReleases(t *testing.T) { + raw := []byte(`{ + "items": [{ + "metadata": {"name": "grafana", "namespace": "monitoring"}, + "spec": { + "chart": {"spec": {"chart": "grafana", "version": "~8.5.0"}} + }, + "status": { + "lastAttemptedRevision": "8.5.8", + "history": [{"chartName": "grafana", "chartVersion": "8.5.8", "appVersion": "11.4.0", "digest": "sha256:abc"}], + "conditions": [{"type": "Ready", "status": "True", "reason": "InstallSucceeded"}] + } + }] + }`) + got, err := service.TestHookParseHelmReleases(raw) + if err != nil { + t.Fatalf("parseHelmReleases failed: %v", err) + } + if len(got) != 1 || got[0].Chart != "grafana" || got[0].Version != "8.5.8" || got[0].Revision != "sha256:abc" || !got[0].Ready { + t.Fatalf("unexpected HelmRelease parse result: %+v", got) + } +} + +// TestCollectGitOpsSnapshotPartialFailure runs one orchestration or CLI step. +// Signature: TestCollectGitOpsSnapshotPartialFailure(t *testing.T). +// Why: a recovering cluster may serve some Flux resource families before +// others, and Ananke should publish a partial snapshot instead of failing closed. +func TestCollectGitOpsSnapshotPartialFailure(t *testing.T) { + runner := func(_ context.Context, _ string, args ...string) ([]byte, error) { + joined := strings.Join(args, " ") + switch { + case strings.Contains(joined, "gitrepositories"): + return []byte(`{"items":[]}`), nil + case strings.Contains(joined, "kustomizations"): + return nil, errors.New("api unavailable") + case strings.Contains(joined, "helmreleases"): + return []byte(`{"items":[]}`), nil + default: + return nil, errors.New("unexpected command") + } + } + + snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{Kubeconfig: "/tmp/kubeconfig"}, runner) + if snapshot.ScrapeSuccess { + t.Fatalf("expected partial failure") + } + if snapshot.Errors["kustomization"] == "" { + t.Fatalf("expected kustomization error, got %+v", snapshot.Errors) + } + if summary := service.TestHookGitOpsErrorSummary(snapshot.Errors); !strings.Contains(summary, "kustomization=api unavailable") { + t.Fatalf("unexpected error summary: %s", summary) + } +} + +// TestCollectGitOpsSnapshotParseFailures runs one orchestration or CLI step. +// Signature: TestCollectGitOpsSnapshotParseFailures(t *testing.T). +// Why: malformed API payloads should be surfaced per resource family so the +// exporter can distinguish data errors from transport failures. +func TestCollectGitOpsSnapshotParseFailures(t *testing.T) { + runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) { + return []byte(`not-json`), nil + } + + snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner) + if snapshot.ScrapeSuccess { + t.Fatalf("expected parse failures") + } + for _, key := range []string{"gitrepository", "kustomization", "helmrelease"} { + if snapshot.Errors[key] == "" { + t.Fatalf("expected %s parse error, got %+v", key, snapshot.Errors) + } + } +} + +// TestCollectGitOpsSnapshotSuccess runs one orchestration or CLI step. +// Signature: TestCollectGitOpsSnapshotSuccess(t *testing.T). +// Why: successful collection should preserve non-branch Git refs as the source +// label rather than hiding them as unknown. +func TestCollectGitOpsSnapshotSuccess(t *testing.T) { + runner := func(_ context.Context, _ string, args ...string) ([]byte, error) { + joined := strings.Join(args, " ") + switch { + case strings.Contains(joined, "gitrepositories"): + return []byte(`{"items":[{"metadata":{"name":"flux-system","namespace":"flux-system"},"spec":{"ref":{"tag":"v1.0.0"}}}]}`), nil + case strings.Contains(joined, "kustomizations"): + return []byte(`{"items":[]}`), nil + case strings.Contains(joined, "helmreleases"): + return []byte(`{"items":[]}`), nil + default: + return nil, errors.New("unexpected command") + } + } + + snapshot := service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner) + if !snapshot.ScrapeSuccess || len(snapshot.FluxSources) != 1 || snapshot.FluxSources[0].Branch != "v1.0.0" { + t.Fatalf("unexpected snapshot: %+v", snapshot) + } +} + +// TestMaybeStartGitOpsSnapshot runs one orchestration or CLI step. +// Signature: TestMaybeStartGitOpsSnapshot(t *testing.T). +// Why: Ananke must collect GitOps status asynchronously so a slow Kubernetes API +// cannot delay UPS polling. +func TestMaybeStartGitOpsSnapshot(t *testing.T) { + runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) { + return []byte(`{"items":[]}`), nil + } + exporter := metrics.New() + done := make(chan struct{}, 1) + lastRun := time.Time{} + cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}} + withRunner := func() bool { + return service.TestHookCollectGitOpsSnapshotWithRunner(context.Background(), config.Config{}, runner).ScrapeSuccess + } + if !withRunner() { + t.Fatalf("expected injected runner sanity check to pass") + } + if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, false, done, runner); !running { + t.Fatalf("expected scrape to start") + } + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatalf("timed out waiting for scrape") + } + + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rr := httptest.NewRecorder() + exporter.Handler("/metrics").ServeHTTP(rr, req) + if !strings.Contains(rr.Body.String(), "ananke_gitops_scrape_success") { + t.Fatalf("expected GitOps scrape metric, got:\n%s", rr.Body.String()) + } + if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, log.New(io.Discard, "", 0), &lastRun, true, done); !running { + t.Fatalf("running scrape should stay marked running") + } + cfg.GitOps.Enabled = false + if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, exporter, nil, &lastRun, false, done); running { + t.Fatalf("disabled GitOps scrape should not start") + } +} + +// TestRunGitOpsKubectlOutput runs one orchestration or CLI step. +// Signature: TestRunGitOpsKubectlOutput(t *testing.T). +// Why: command diagnostics should preserve stderr for operator-visible errors. +func TestRunGitOpsKubectlOutput(t *testing.T) { + out, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "printf ok") + if err != nil || string(out) != "ok" { + t.Fatalf("unexpected command success result: out=%q err=%v", out, err) + } + if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "echo bad >&2; exit 7"); err == nil || !strings.Contains(err.Error(), "bad") { + t.Fatalf("expected stderr to be preserved in error, got %v", err) + } + if _, err := service.TestHookRunGitOpsKubectlOutput(context.Background(), "sh", "-c", "exit 8"); err == nil { + t.Fatalf("expected empty-stderr command failure") + } +} + +// TestGitOpsParseErrors runs one orchestration or CLI step. +// Signature: TestGitOpsParseErrors(t *testing.T). +// Why: invalid JSON should return parse errors for every Flux resource parser. +func TestGitOpsParseErrors(t *testing.T) { + if _, err := service.TestHookParseGitRepositories([]byte(`not-json`)); err == nil { + t.Fatalf("expected GitRepository parse error") + } + if _, err := service.TestHookParseKustomizations([]byte(`not-json`)); err == nil { + t.Fatalf("expected Kustomization parse error") + } + if _, err := service.TestHookParseHelmReleases([]byte(`not-json`)); err == nil { + t.Fatalf("expected HelmRelease parse error") + } +} + +// TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter runs one +// orchestration or CLI step. +// Signature: TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T). +// Why: disabled or already-fresh scrapes should be cheap no-ops in the daemon +// loop. +func TestMaybeStartGitOpsSnapshotSkipsFreshSampleAndNilExporter(t *testing.T) { + cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 60}} + done := make(chan struct{}, 1) + lastRun := time.Now() + if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, nil, nil, &lastRun, false, done); running { + t.Fatalf("nil exporter should not start scrape") + } + if running := service.TestHookMaybeStartGitOpsSnapshot(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done); running { + t.Fatalf("fresh sample should not start another scrape") + } + runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) { + return []byte(`{"items":[]}`), nil + } + if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, true, done, runner); !running { + t.Fatalf("already-running scrape should remain marked running") + } + cfg.GitOps.Enabled = false + lastRun = time.Time{} + if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); running { + t.Fatalf("disabled scrape should not start through runner hook") + } +} + +// TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger runs one +// orchestration or CLI step. +// Signature: TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T). +// Why: scrape failure handling should not depend on a logger being present. +func TestMaybeStartGitOpsSnapshotLogsPartialFailureWithoutLogger(t *testing.T) { + done := make(chan struct{}, 1) + lastRun := time.Time{} + cfg := config.Config{GitOps: config.GitOps{Enabled: true, PollSeconds: 1}} + runner := func(_ context.Context, _ string, _ ...string) ([]byte, error) { + return nil, errors.New("api down") + } + if running := service.TestHookMaybeStartGitOpsSnapshotWithRunner(context.Background(), cfg, metrics.New(), nil, &lastRun, false, done, runner); !running { + t.Fatalf("expected scrape to start") + } + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatalf("timed out waiting for failed scrape") + } +} + +// TestGitOpsHelpers runs one orchestration or CLI step. +// Signature: TestGitOpsHelpers(t *testing.T). +// Why: tiny label helpers carry dashboard readability contracts and are worth +// locking down. +func TestGitOpsHelpers(t *testing.T) { + if got := service.TestHookGitOpsDefaultString(" ", "fallback"); got != "fallback" { + t.Fatalf("unexpected defaultString fallback: %q", got) + } + if got := service.TestHookGitOpsFirstNonEmpty(" ", "second"); got != "second" { + t.Fatalf("unexpected firstNonEmpty result: %q", got) + } + if got := service.TestHookGitOpsFirstNonEmpty(" ", ""); got != "" { + t.Fatalf("unexpected empty firstNonEmpty result: %q", got) + } + if got := service.TestHookGitOpsErrorSummary(nil); got != "none" { + t.Fatalf("unexpected empty summary: %q", got) + } + ready, reason := service.TestHookFluxReadyMissing() + if ready || reason != "ReadyMissing" { + t.Fatalf("unexpected empty Ready condition parse: ready=%t reason=%s", ready, reason) + } +}