package cluster

import (
	"context"
	"encoding/json"
	"fmt"
	neturl "net/url"
	"sort"
	"strings"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
)

// ensureRequiredNodeLabels applies the configured required labels to each
// node via `kubectl label node <name> --overwrite key=value ...`.
// Nodes listed in Startup.IgnoreUnavailableNodes are skipped, node names and
// label keys/values are trimmed, and both node order and key order are sorted
// for deterministic kubectl invocations. A not-found error for a node that is
// not strictly required is downgraded to a warning plus an auto-heal note;
// any other kubectl failure aborts with a wrapped error. No-op in dry-run
// mode or when no required labels are configured.
func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
	if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
		return nil
	}
	ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	// Collect and sort node names so label application order is stable.
	nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
	for node := range o.cfg.Startup.RequiredNodeLabels {
		node = strings.TrimSpace(node)
		if node != "" {
			nodes = append(nodes, node)
		}
	}
	sort.Strings(nodes)
	for _, node := range nodes {
		if _, skip := ignored[node]; skip {
			o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
			continue
		}
		// NOTE(review): lookup uses the trimmed node name; a config entry with
		// surrounding whitespace in its key would miss here — confirm config
		// loading already normalizes node names.
		labels := o.cfg.Startup.RequiredNodeLabels[node]
		if len(labels) == 0 {
			continue
		}
		// Sort label keys for a deterministic argument list.
		keys := make([]string, 0, len(labels))
		for key := range labels {
			key = strings.TrimSpace(key)
			if key != "" {
				keys = append(keys, key)
			}
		}
		sort.Strings(keys)
		args := []string{"label", "node", node, "--overwrite"}
		pairs := make([]string, 0, len(keys))
		for _, key := range keys {
			// NOTE(review): labels[key] is looked up with the TRIMMED key; if a
			// configured key carries surrounding whitespace the lookup misses,
			// value stays "", and the label is silently dropped — verify this
			// is the intended handling.
			value := strings.TrimSpace(labels[key])
			if value == "" {
				continue
			}
			pair := fmt.Sprintf("%s=%s", key, value)
			args = append(args, pair)
			pairs = append(pairs, pair)
		}
		if len(pairs) == 0 {
			continue
		}
		if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
			// An absent node that is not strictly required is tolerated:
			// log, record an auto-heal note, and move on.
			if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
				o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
				o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
				continue
			}
			return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
		}
		o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
	}
	return nil
}

// waitForStartupConvergence runs the configured startup convergence gates in
// a fixed order — ingress checklist, service checklist, critical service
// endpoints, flux health, workload convergence — each guarded by its
// Startup.Require* flag, then always runs the stability soak window.
// Each gate's progress/outcome is recorded via noteStartupCheckState /
// noteStartupCheck; the first failing gate's error is returned unchanged.
// No-op in dry-run mode.
func (o *Orchestrator) waitForStartupConvergence(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}
	if o.cfg.Startup.RequireIngressChecklist {
		o.noteStartupCheckState("ingress-checklist", "running", "waiting for ingress host reachability")
		if err := o.waitForIngressChecklist(ctx); err != nil {
			o.noteStartupCheck("ingress-checklist", false, err.Error())
			return err
		}
		o.noteStartupCheck("ingress-checklist", true, "all ingress hosts reachable")
	}
	if o.cfg.Startup.RequireServiceChecklist {
		o.noteStartupCheckState("service-checklist", "running", "waiting for external service checklist")
		if err := o.waitForServiceChecklist(ctx); err != nil {
			o.noteStartupCheck("service-checklist", false, err.Error())
			return err
		}
		o.noteStartupCheck("service-checklist", true, "all configured service checks passed")
	}
	if o.cfg.Startup.RequireCriticalServiceEndpoints {
		o.noteStartupCheckState("critical-service-endpoints", "running", "waiting for critical service endpoint backends")
		if err := o.waitForCriticalServiceEndpoints(ctx); err != nil {
			o.noteStartupCheck("critical-service-endpoints", false, err.Error())
			return err
		}
		o.noteStartupCheck("critical-service-endpoints", true, "critical service endpoints have active backends")
	}
	if o.cfg.Startup.RequireFluxHealth {
		o.noteStartupCheckState("flux-health", "running", "waiting for flux kustomization readiness")
		if err := o.waitForFluxHealth(ctx); err != nil {
			o.noteStartupCheck("flux-health", false, err.Error())
			return err
		}
		o.noteStartupCheck("flux-health", true, "all flux kustomizations ready")
	}
	if o.cfg.Startup.RequireWorkloadConvergence {
		o.noteStartupCheckState("workload-convergence", "running", "waiting for controller convergence")
		if err := o.waitForWorkloadConvergence(ctx); err != nil {
			o.noteStartupCheck("workload-convergence", false, err.Error())
			return err
		}
		o.noteStartupCheck("workload-convergence", true, "controllers converged")
	}
	// The stability soak window is unconditional (not gated by a Require* flag).
	o.noteStartupCheckState("stability-window", "running", "running startup stability soak window")
	if err := o.waitForStabilityWindow(ctx); err != nil {
		o.noteStartupCheck("stability-window", false, err.Error())
		return err
	}
	o.noteStartupCheck("stability-window", true, "startup soak passed")
	return nil
}

// waitForIngressChecklist polls ingressChecklistReady until it passes, the
// configured wait budget expires, or ctx is cancelled. Defaults: 7 minute
// budget, 5 second poll interval (used when the configured seconds are <= 0).
// On each iteration it first fires the throttled auto-heal hooks
// (stuck-pod recycle, critical workload replicas), and after a failed check
// also the ingress-backend auto-heal. Progress is logged whenever the failure
// detail changes or 30s have passed since the last log line.
func (o *Orchestrator) waitForIngressChecklist(ctx context.Context) error {
	wait := time.Duration(o.cfg.Startup.IngressChecklistWaitSeconds) * time.Second
	if wait <= 0 {
		wait = 7 * time.Minute
	}
	poll := time.Duration(o.cfg.Startup.IngressChecklistPollSeconds) * time.Second
	if poll <= 0 {
		poll = 5 * time.Second
	}
	deadline := time.Now().Add(wait)
	lastFailure := "unknown"
	lastLogged := time.Time{}
	// Per-hook throttle timestamps, advanced by the maybe* helpers themselves.
	lastRecycleAttempt := time.Time{}
	lastReplicaHeal := time.Time{}
	lastIngressHeal := time.Time{}
	for {
		o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
		o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
		prevFailure := lastFailure
		ready, detail := o.ingressChecklistReady(ctx)
		lastFailure = detail
		if ready {
			o.log.Printf("ingress checklist passed (%s)", detail)
			return nil
		}
		// Only attempt backend healing once a concrete failure detail exists.
		o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure)
		// Log on state change, or at most every 30 seconds, to avoid spam.
		if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second {
			remaining := time.Until(deadline).Round(time.Second)
			if remaining < 0 {
				remaining = 0
			}
			o.log.Printf("waiting for ingress checklist (%s remaining): %s", remaining, lastFailure)
			lastLogged = time.Now()
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("startup blocked: ingress checklist not satisfied within %s (%s)", wait, lastFailure)
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(poll):
		}
	}
}

// ingressChecklistReady probes every discovered ingress host over HTTPS and
// reports (ready, detail). Discovery failure returns (false, error text);
// zero hosts counts as ready. Each host is checked via serviceCheckReady with
// a 12s timeout against the configured accepted status list (default:
// 200, 301, 302, 307, 308, 401, 403, 404). The first failing host short-
// circuits with "<host>: <detail>"; success reports the host count.
func (o *Orchestrator) ingressChecklistReady(ctx context.Context) (bool, string) {
	hosts, err := o.discoverIngressHosts(ctx)
	if err != nil {
		return false, err.Error()
	}
	if len(hosts) == 0 {
		return true, "no ingress hosts discovered"
	}
	accepted := o.cfg.Startup.IngressChecklistAccepted
	if len(accepted) == 0 {
		accepted = []int{200, 301, 302, 307, 308, 401, 403, 404}
	}
	for _, host := range hosts {
		check := config.ServiceChecklistCheck{
			Name:             "ingress-" + host,
			URL:              "https://" + host + "/",
			AcceptedStatuses: accepted,
			TimeoutSeconds:   12,
			InsecureSkipTLS:  o.cfg.Startup.IngressChecklistInsecureSkip,
		}
		ok, detail := o.serviceCheckReady(ctx, check)
		if !ok {
			return false, fmt.Sprintf("%s: %s", host, detail)
		}
	}
	return true, fmt.Sprintf("hosts=%d", len(hosts))
}

// discoverIngressHosts lists every concrete (non-wildcard, non-ignored)
// ingress rule host across all namespaces, deduplicated and sorted.
func (o *Orchestrator) discoverIngressHosts(ctx context.Context) ([]string, error) { out, err := o.kubectl(ctx, 25*time.Second, "get", "ingress", "-A", "-o", "json") if err != nil { return nil, fmt.Errorf("query ingresses: %w", err) } var list ingressList if err := json.Unmarshal([]byte(out), &list); err != nil { return nil, fmt.Errorf("decode ingresses: %w", err) } ignored := makeStringSet(o.cfg.Startup.IngressChecklistIgnoreHosts) hosts := map[string]struct{}{} for _, item := range list.Items { for _, rule := range item.Spec.Rules { host := strings.TrimSpace(rule.Host) if host == "" || strings.Contains(host, "*") { continue } if _, skip := ignored[host]; skip { continue } hosts[host] = struct{}{} } } outHosts := make([]string, 0, len(hosts)) for host := range hosts { outHosts = append(outHosts, host) } sort.Strings(outHosts) return outHosts, nil } // discoverIngressNamespacesForHost runs one orchestration or CLI step. // Signature: (o *Orchestrator) discoverIngressNamespacesForHost(ctx context.Context, host string) ([]string, error). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func (o *Orchestrator) discoverIngressNamespacesForHost(ctx context.Context, host string) ([]string, error) { host = strings.ToLower(strings.TrimSpace(host)) if host == "" { return nil, nil } out, err := o.kubectl(ctx, 25*time.Second, "get", "ingress", "-A", "-o", "json") if err != nil { return nil, fmt.Errorf("query ingresses: %w", err) } var list ingressList if err := json.Unmarshal([]byte(out), &list); err != nil { return nil, fmt.Errorf("decode ingresses: %w", err) } namespaces := map[string]struct{}{} for _, item := range list.Items { ns := strings.TrimSpace(item.Metadata.Namespace) if ns == "" { continue } for _, rule := range item.Spec.Rules { ruleHost := strings.ToLower(strings.TrimSpace(rule.Host)) if ruleHost == "" { continue } if ruleHost == host { namespaces[ns] = struct{}{} break } } } outNamespaces := make([]string, 0, len(namespaces)) for ns := range namespaces { outNamespaces = append(outNamespaces, ns) } sort.Strings(outNamespaces) return outNamespaces, nil } // maybeAutoHealIngressHostBackends runs one orchestration or CLI step. // Signature: (o *Orchestrator) maybeAutoHealIngressHostBackends(ctx context.Context, lastAttempt *time.Time, failureDetail string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func (o *Orchestrator) maybeAutoHealIngressHostBackends(ctx context.Context, lastAttempt *time.Time, failureDetail string) { if o.runner.DryRun { return } host := o.checklistFailureHost(failureDetail) if host == "" { return } now := time.Now() if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 45*time.Second { return } if lastAttempt != nil { *lastAttempt = now } healed, err := o.healIngressHostBackendReplicas(ctx, host) if err != nil { o.log.Printf("warning: ingress host auto-heal failed for %s: %v", host, err) return } if len(healed) == 0 { return } sort.Strings(healed) detail := fmt.Sprintf("restored ingress backend replicas for %s: %s", host, joinLimited(healed, 8)) o.log.Printf("%s", detail) o.noteStartupAutoHeal(detail) } // checklistFailureHost runs one orchestration or CLI step. // Signature: (o *Orchestrator) checklistFailureHost(failureDetail string) string. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) checklistFailureHost(failureDetail string) string { prefix := strings.TrimSpace(failureDetail) if idx := strings.Index(prefix, ":"); idx > 0 { prefix = strings.TrimSpace(prefix[:idx]) } if isLikelyHostname(prefix) { return strings.ToLower(prefix) } for _, check := range o.cfg.Startup.ServiceChecklist { name := strings.TrimSpace(check.Name) if !strings.EqualFold(name, prefix) { continue } host := hostFromURL(check.URL) if host != "" { return strings.ToLower(host) } } if host := hostFromURL(prefix); host != "" { return strings.ToLower(host) } return "" } // hostFromURL runs one orchestration or CLI step. // Signature: hostFromURL(raw string) string. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func hostFromURL(raw string) string { parsed, err := neturl.Parse(strings.TrimSpace(raw)) if err != nil || parsed == nil { return "" } return strings.TrimSpace(parsed.Hostname()) }