// orchestrator_ingress.go — ingress/startup checklist orchestration helpers.

package cluster
import (
"context"
"encoding/json"
"fmt"
neturl "net/url"
"sort"
"strings"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// ensureRequiredNodeLabels applies the configured required labels to each
// node via `kubectl label --overwrite`. It is a no-op in dry-run mode or
// when no required labels are configured.
//
// Nodes listed in Startup.IgnoreUnavailableNodes are skipped entirely. A
// node that does not exist is tolerated (with an auto-heal note) when it is
// not strictly required for startup; otherwise the error is returned.
func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
	if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
		return nil
	}
	ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	// Normalize node names while keeping the label maps keyed by the
	// trimmed name. (Previously the trimmed name was used to re-index the
	// original config map, silently dropping nodes whose configured name
	// carried stray whitespace.)
	nodeLabels := make(map[string]map[string]string, len(o.cfg.Startup.RequiredNodeLabels))
	nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
	for node, labels := range o.cfg.Startup.RequiredNodeLabels {
		node = strings.TrimSpace(node)
		if node == "" {
			continue
		}
		nodes = append(nodes, node)
		nodeLabels[node] = labels
	}
	// Sort for deterministic processing order across runs.
	sort.Strings(nodes)
	for _, node := range nodes {
		if _, skip := ignored[node]; skip {
			o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
			continue
		}
		labels := nodeLabels[node]
		if len(labels) == 0 {
			continue
		}
		// Same whitespace hazard for label keys: remember each trimmed
		// key's value instead of re-indexing the original map.
		values := make(map[string]string, len(labels))
		keys := make([]string, 0, len(labels))
		for key, value := range labels {
			key = strings.TrimSpace(key)
			if key == "" {
				continue
			}
			keys = append(keys, key)
			values[key] = value
		}
		sort.Strings(keys)
		args := []string{"label", "node", node, "--overwrite"}
		pairs := make([]string, 0, len(keys))
		for _, key := range keys {
			value := strings.TrimSpace(values[key])
			if value == "" {
				// Empty values are skipped rather than sent as "key=".
				continue
			}
			pair := fmt.Sprintf("%s=%s", key, value)
			args = append(args, pair)
			pairs = append(pairs, pair)
		}
		if len(pairs) == 0 {
			continue
		}
		if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
			if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
				o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
				o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
				continue
			}
			return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
		}
		o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
	}
	return nil
}
// waitForStartupConvergence runs the configured startup gates in a fixed
// order: ingress checklist, service checklist, critical service endpoints,
// flux health, workload convergence, and finally the (always-on) stability
// soak window. Each gate's running/failed/passed state is recorded; the
// first failing gate aborts the sequence. Dry-run mode skips everything.
func (o *Orchestrator) waitForStartupConvergence(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}
	gates := []struct {
		enabled bool
		name    string
		running string
		passed  string
		run     func(context.Context) error
	}{
		{o.cfg.Startup.RequireIngressChecklist, "ingress-checklist", "waiting for ingress host reachability", "all ingress hosts reachable", o.waitForIngressChecklist},
		{o.cfg.Startup.RequireServiceChecklist, "service-checklist", "waiting for external service checklist", "all configured service checks passed", o.waitForServiceChecklist},
		{o.cfg.Startup.RequireCriticalServiceEndpoints, "critical-service-endpoints", "waiting for critical service endpoint backends", "critical service endpoints have active backends", o.waitForCriticalServiceEndpoints},
		{o.cfg.Startup.RequireFluxHealth, "flux-health", "waiting for flux kustomization readiness", "all flux kustomizations ready", o.waitForFluxHealth},
		{o.cfg.Startup.RequireWorkloadConvergence, "workload-convergence", "waiting for controller convergence", "controllers converged", o.waitForWorkloadConvergence},
		{true, "stability-window", "running startup stability soak window", "startup soak passed", o.waitForStabilityWindow},
	}
	for _, gate := range gates {
		if !gate.enabled {
			continue
		}
		o.noteStartupCheckState(gate.name, "running", gate.running)
		if err := gate.run(ctx); err != nil {
			o.noteStartupCheck(gate.name, false, err.Error())
			return err
		}
		o.noteStartupCheck(gate.name, true, gate.passed)
	}
	return nil
}
// waitForIngressChecklist polls until every discovered ingress host answers
// with an accepted HTTP status, or until the wait budget is exhausted.
//
// Defaults when unset/non-positive: 7 minute overall wait and 5 second poll
// interval (Startup.IngressChecklistWaitSeconds /
// IngressChecklistPollSeconds). While waiting it opportunistically triggers
// best-effort auto-heal passes (stuck pod recycling, critical replica
// healing, and per-host ingress backend healing), each throttled via its
// own timestamp, and logs progress whenever the failure detail changes or
// at most every 30 seconds. Returns ctx.Err() if the context is cancelled.
func (o *Orchestrator) waitForIngressChecklist(ctx context.Context) error {
	wait := time.Duration(o.cfg.Startup.IngressChecklistWaitSeconds) * time.Second
	if wait <= 0 {
		wait = 7 * time.Minute
	}
	poll := time.Duration(o.cfg.Startup.IngressChecklistPollSeconds) * time.Second
	if poll <= 0 {
		poll = 5 * time.Second
	}
	deadline := time.Now().Add(wait)
	lastFailure := "unknown"
	lastLogged := time.Time{}
	lastRecycleAttempt := time.Time{}
	lastReplicaHeal := time.Time{}
	lastIngressHeal := time.Time{}
	for {
		// Broad heals run before each readiness probe so a freshly healed
		// backend can pass on the same iteration.
		o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
		o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
		prevFailure := lastFailure
		ready, detail := o.ingressChecklistReady(ctx)
		lastFailure = detail
		if ready {
			o.log.Printf("ingress checklist passed (%s)", detail)
			return nil
		}
		// Host-targeted heal uses the fresh failure detail to pick a host.
		o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure)
		// Log when the failure reason changes, or at most every 30s.
		if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second {
			remaining := time.Until(deadline).Round(time.Second)
			if remaining < 0 {
				remaining = 0
			}
			o.log.Printf("waiting for ingress checklist (%s remaining): %s", remaining, lastFailure)
			lastLogged = time.Now()
		}
		// Deadline is checked after the probe, so a pass right at the
		// boundary still succeeds.
		if time.Now().After(deadline) {
			return fmt.Errorf("startup blocked: ingress checklist not satisfied within %s (%s)", wait, lastFailure)
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(poll):
		}
	}
}
// ingressChecklistReady reports whether every discovered ingress host is
// currently reachable over HTTPS with an accepted status code. The second
// return value is a human-readable detail: the failing host and reason on
// failure, or a host-count summary on success. Discovering zero hosts
// counts as ready.
func (o *Orchestrator) ingressChecklistReady(ctx context.Context) (bool, string) {
	hosts, err := o.discoverIngressHosts(ctx)
	if err != nil {
		return false, err.Error()
	}
	if len(hosts) == 0 {
		return true, "no ingress hosts discovered"
	}
	statuses := o.cfg.Startup.IngressChecklistAccepted
	if len(statuses) == 0 {
		// Anything proving the ingress path is wired up counts, including
		// redirects and auth challenges.
		statuses = []int{200, 301, 302, 307, 308, 401, 403, 404}
	}
	for _, host := range hosts {
		probe := config.ServiceChecklistCheck{
			Name:             "ingress-" + host,
			URL:              "https://" + host + "/",
			AcceptedStatuses: statuses,
			TimeoutSeconds:   12,
			InsecureSkipTLS:  o.cfg.Startup.IngressChecklistInsecureSkip,
		}
		if ok, detail := o.serviceCheckReady(ctx, probe); !ok {
			return false, fmt.Sprintf("%s: %s", host, detail)
		}
	}
	return true, fmt.Sprintf("hosts=%d", len(hosts))
}
// discoverIngressHosts lists every concrete host referenced by any ingress
// rule in the cluster, excluding wildcard hosts and hosts named in
// Startup.IngressChecklistIgnoreHosts. The result is deduplicated and
// sorted. NOTE(review): matching here is case-sensitive, unlike
// discoverIngressNamespacesForHost — confirm ingress hosts are lowercase.
func (o *Orchestrator) discoverIngressHosts(ctx context.Context) ([]string, error) {
	out, err := o.kubectl(ctx, 25*time.Second, "get", "ingress", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query ingresses: %w", err)
	}
	var list ingressList
	if err := json.Unmarshal([]byte(out), &list); err != nil {
		return nil, fmt.Errorf("decode ingresses: %w", err)
	}
	skip := makeStringSet(o.cfg.Startup.IngressChecklistIgnoreHosts)
	seen := map[string]struct{}{}
	for _, ing := range list.Items {
		for _, rule := range ing.Spec.Rules {
			host := strings.TrimSpace(rule.Host)
			if host == "" || strings.Contains(host, "*") {
				continue
			}
			if _, ignored := skip[host]; ignored {
				continue
			}
			seen[host] = struct{}{}
		}
	}
	hosts := make([]string, 0, len(seen))
	for host := range seen {
		hosts = append(hosts, host)
	}
	sort.Strings(hosts)
	return hosts, nil
}
// discoverIngressNamespacesForHost returns the sorted set of namespaces
// containing at least one ingress rule whose host matches the given host
// (compared case-insensitively after trimming). An empty host yields nil
// with no error.
func (o *Orchestrator) discoverIngressNamespacesForHost(ctx context.Context, host string) ([]string, error) {
	target := strings.ToLower(strings.TrimSpace(host))
	if target == "" {
		return nil, nil
	}
	out, err := o.kubectl(ctx, 25*time.Second, "get", "ingress", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query ingresses: %w", err)
	}
	var list ingressList
	if err := json.Unmarshal([]byte(out), &list); err != nil {
		return nil, fmt.Errorf("decode ingresses: %w", err)
	}
	seen := map[string]struct{}{}
	for _, ing := range list.Items {
		ns := strings.TrimSpace(ing.Metadata.Namespace)
		if ns == "" {
			continue
		}
		for _, rule := range ing.Spec.Rules {
			ruleHost := strings.ToLower(strings.TrimSpace(rule.Host))
			if ruleHost == "" {
				continue
			}
			if ruleHost == target {
				seen[ns] = struct{}{}
				break // one match is enough for this ingress
			}
		}
	}
	namespaces := make([]string, 0, len(seen))
	for ns := range seen {
		namespaces = append(namespaces, ns)
	}
	sort.Strings(namespaces)
	return namespaces, nil
}
// maybeAutoHealIngressHostBackends attempts to restore backend replicas for
// the host implicated by the latest checklist failure detail. Attempts are
// throttled to one per 45 seconds via *lastAttempt. It is a no-op in
// dry-run mode or when no host can be extracted from the failure detail;
// heal errors are logged as warnings, successes are logged and recorded as
// a startup auto-heal note.
func (o *Orchestrator) maybeAutoHealIngressHostBackends(ctx context.Context, lastAttempt *time.Time, failureDetail string) {
	if o.runner.DryRun {
		return
	}
	host := o.checklistFailureHost(failureDetail)
	if host == "" {
		return
	}
	now := time.Now()
	if lastAttempt != nil {
		if !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 45*time.Second {
			return // throttled: an attempt ran too recently
		}
		*lastAttempt = now
	}
	healed, err := o.healIngressHostBackendReplicas(ctx, host)
	if err != nil {
		o.log.Printf("warning: ingress host auto-heal failed for %s: %v", host, err)
		return
	}
	if len(healed) == 0 {
		return
	}
	sort.Strings(healed)
	detail := fmt.Sprintf("restored ingress backend replicas for %s: %s", host, joinLimited(healed, 8))
	o.log.Printf("%s", detail)
	o.noteStartupAutoHeal(detail)
}
// checklistFailureHost extracts the hostname implicated by a checklist
// failure detail string. It tries, in order: the text before the first
// colon when it already looks like a hostname, a configured service check
// whose name matches that text (taking the host from its URL), and finally
// parsing the text itself as a URL. Returns "" (lowercased otherwise) when
// no host can be determined.
func (o *Orchestrator) checklistFailureHost(failureDetail string) string {
	candidate := strings.TrimSpace(failureDetail)
	// Failure details are formatted as "host: reason"; keep only the head.
	if before, _, found := strings.Cut(candidate, ":"); found && before != "" {
		candidate = strings.TrimSpace(before)
	}
	if isLikelyHostname(candidate) {
		return strings.ToLower(candidate)
	}
	for _, check := range o.cfg.Startup.ServiceChecklist {
		if !strings.EqualFold(strings.TrimSpace(check.Name), candidate) {
			continue
		}
		if host := hostFromURL(check.URL); host != "" {
			return strings.ToLower(host)
		}
	}
	if host := hostFromURL(candidate); host != "" {
		return strings.ToLower(host)
	}
	return ""
}
// hostFromURL returns the hostname component of raw (whitespace-trimmed on
// both ends), or "" when raw does not parse as a URL or parses without a
// host (e.g. a bare scheme-less string).
func hostFromURL(raw string) string {
	trimmed := strings.TrimSpace(raw)
	parsed, err := neturl.Parse(trimmed)
	if err != nil || parsed == nil {
		return ""
	}
	return strings.TrimSpace(parsed.Hostname())
}