From 14a9d67088682c164f5a324041561b5d84002dd0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 8 Apr 2026 01:01:44 -0300 Subject: [PATCH] startup: auto-heal ingress-backed workloads when checks fail --- internal/cluster/orchestrator.go | 164 ++++++++++++++++++++++++++ internal/cluster/orchestrator_test.go | 48 ++++++++ 2 files changed, 212 insertions(+) diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 0d8a6d6..ffb8187 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -2514,6 +2514,7 @@ func (o *Orchestrator) waitForIngressChecklist(ctx context.Context) error { lastLogged := time.Time{} lastRecycleAttempt := time.Time{} lastReplicaHeal := time.Time{} + lastIngressHeal := time.Time{} for { o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal) @@ -2525,6 +2526,7 @@ func (o *Orchestrator) waitForIngressChecklist(ctx context.Context) error { o.log.Printf("ingress checklist passed (%s)", detail) return nil } + o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure) if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second { remaining := time.Until(deadline).Round(time.Second) if remaining < 0 { @@ -2603,6 +2605,166 @@ func (o *Orchestrator) discoverIngressHosts(ctx context.Context) ([]string, erro return outHosts, nil } +func (o *Orchestrator) discoverIngressNamespacesForHost(ctx context.Context, host string) ([]string, error) { + host = strings.ToLower(strings.TrimSpace(host)) + if host == "" { + return nil, nil + } + out, err := o.kubectl(ctx, 25*time.Second, "get", "ingress", "-A", "-o", "json") + if err != nil { + return nil, fmt.Errorf("query ingresses: %w", err) + } + var list ingressList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return nil, fmt.Errorf("decode ingresses: %w", err) + } + namespaces := map[string]struct{}{} + for _, item := range list.Items { + ns := strings.TrimSpace(item.Metadata.Namespace) + if ns == "" { + continue + } + for _, rule := range item.Spec.Rules { + ruleHost := strings.ToLower(strings.TrimSpace(rule.Host)) + if ruleHost == "" { + continue + } + if ruleHost == host { + namespaces[ns] = struct{}{} + break + } + } + } + outNamespaces := make([]string, 0, len(namespaces)) + for ns := range namespaces { + outNamespaces = append(outNamespaces, ns) + } + sort.Strings(outNamespaces) + return outNamespaces, nil +} + +func (o *Orchestrator) maybeAutoHealIngressHostBackends(ctx context.Context, lastAttempt *time.Time, failureDetail string) { + if o.runner.DryRun { + return + } + host := o.checklistFailureHost(failureDetail) + if host == "" { + return + } + now := time.Now() + if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 45*time.Second { + return + } + if lastAttempt != nil { + *lastAttempt = now + } + healed, err := o.healIngressHostBackendReplicas(ctx, host) + if err != nil { + o.log.Printf("warning: ingress host auto-heal failed for %s: %v", host, err) + return + } + if len(healed) == 0 { + return + } + sort.Strings(healed) + detail := fmt.Sprintf("restored ingress backend replicas for %s: %s", host, joinLimited(healed, 8)) + o.log.Printf("%s", detail) + o.noteStartupAutoHeal(detail) +} + +func (o *Orchestrator) checklistFailureHost(failureDetail string) string { + prefix := strings.TrimSpace(failureDetail) + if idx := strings.Index(prefix, ":"); idx > 0 { + prefix = strings.TrimSpace(prefix[:idx]) + } + if isLikelyHostname(prefix) { + return strings.ToLower(prefix) + } + for _, check := range o.cfg.Startup.ServiceChecklist { + name := strings.TrimSpace(check.Name) + if !strings.EqualFold(name, prefix) { + continue + } + host := hostFromURL(check.URL) + if host != "" { + return strings.ToLower(host) + } + } + if host := hostFromURL(prefix); host != "" { + return strings.ToLower(host) + } + return "" +} + +func hostFromURL(raw string) string { + parsed, err := neturl.Parse(strings.TrimSpace(raw)) + if err != nil || parsed == nil { + return "" + } + return strings.TrimSpace(parsed.Hostname()) +} + +func isLikelyHostname(value string) bool { + value = strings.TrimSpace(value) + if value == "" { + return false + } + if strings.Contains(value, " ") || strings.Contains(value, "/") { + return false + } + return strings.Contains(value, ".") +} + +func (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error) { + namespaces, err := o.discoverIngressNamespacesForHost(ctx, host) + if err != nil { + return nil, err + } + if len(namespaces) == 0 { + return nil, nil + } + targetNamespaces := makeStringSet(namespaces) + out, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json") + if err != nil { + return nil, fmt.Errorf("query workloads: %w", err) + } + var list workloadList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return nil, fmt.Errorf("decode workloads: %w", err) + } + healed := []string{} + for _, item := range list.Items { + kind := strings.ToLower(strings.TrimSpace(item.Kind)) + ns := strings.TrimSpace(item.Metadata.Namespace) + name := strings.TrimSpace(item.Metadata.Name) + if kind == "" || ns == "" || name == "" { + continue + } + if kind != "deployment" && kind != "statefulset" { + continue + } + if _, ok := targetNamespaces[ns]; !ok { + continue + } + desired := int32(1) + if item.Spec.Replicas != nil { + desired = *item.Spec.Replicas + } + if desired >= 1 { + continue + } + workload := startupWorkload{Namespace: ns, Kind: kind, Name: name} + if err := o.ensureWorkloadReplicas(ctx, workload, 1); err != nil { + if isNotFoundErr(err) { + continue + } + return healed, fmt.Errorf("scale %s/%s/%s to 1: %w", ns, kind, name, err) + } + healed = append(healed, ns+"/"+kind+"/"+name) + } + return healed, nil +} + func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error { wait := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second if wait <= 0 { @@ -2617,6 +2779,7 @@ func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error { lastLogged := time.Time{} lastRecycleAttempt := time.Time{} lastReplicaHeal := time.Time{} + lastIngressHeal := time.Time{} for { o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal) @@ -2627,6 +2790,7 @@ func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error { o.log.Printf("external service checklist passed (%s)", detail) return nil } + o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure) if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second { remaining := time.Until(deadline).Round(time.Second) if remaining < 0 { diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go index f842f4a..6a88cc7 100644 --- a/internal/cluster/orchestrator_test.go +++ b/internal/cluster/orchestrator_test.go @@ -216,6 +216,54 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) { } } +func TestChecklistFailureHostFromIngressDetail(t *testing.T) { + orch := &Orchestrator{} + got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500") + if got != "cloud.bstein.dev" { + t.Fatalf("expected host cloud.bstein.dev, got %q", got) + } +} + +func TestChecklistFailureHostFromServiceCheckName(t *testing.T) { + orch := &Orchestrator{ + cfg: config.Config{ + Startup: config.Startup{ + ServiceChecklist: []config.ServiceChecklistCheck{ + { + Name: "harbor-registry", + URL: "https://registry.bstein.dev/v2/", + }, + }, + }, + }, + } + got := orch.checklistFailureHost("harbor-registry: unexpected status code=404") + if got != "registry.bstein.dev" { + t.Fatalf("expected host registry.bstein.dev, got %q", got) + } +} + +func TestChecklistFailureHostUnknown(t *testing.T) { + orch := &Orchestrator{ + cfg: config.Config{ + Startup: config.Startup{ + ServiceChecklist: []config.ServiceChecklistCheck{ + { + Name: "grafana-api", + URL: "https://metrics.bstein.dev/api/health", + }, + }, + }, + }, + } + if got := orch.checklistFailureHost("grafana-api: tcp timeout"); got != "metrics.bstein.dev" { + t.Fatalf("expected metrics host from configured URL, got %q", got) + } + if got := orch.checklistFailureHost("some-unmapped-check: fail"); got != "" { + t.Fatalf("expected empty host for unknown check, got %q", got) + } +} + func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) { var pod podResource pod.Status.Phase = "Pending"