382 lines
12 KiB
Go
382 lines
12 KiB
Go
|
|
package cluster
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"fmt"
|
||
|
|
"os"
|
||
|
|
"os/exec"
|
||
|
|
"strconv"
|
||
|
|
"strings"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/sshutil"
|
||
|
|
)
|
||
|
|
|
||
|
|
// waitForPostStartProbes runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error.
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error {
|
||
|
|
if o.runner.DryRun {
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
wait := time.Duration(o.cfg.Startup.PostStartProbeWaitSeconds) * time.Second
|
||
|
|
if wait <= 0 {
|
||
|
|
wait = 240 * time.Second
|
||
|
|
}
|
||
|
|
poll := time.Duration(o.cfg.Startup.PostStartProbePollSeconds) * time.Second
|
||
|
|
if poll <= 0 {
|
||
|
|
poll = 5 * time.Second
|
||
|
|
}
|
||
|
|
deadline := time.Now().Add(wait)
|
||
|
|
lastFailure := "unknown"
|
||
|
|
lastLogged := time.Time{}
|
||
|
|
for {
|
||
|
|
ok, failure := o.postStartProbesReady(ctx)
|
||
|
|
if ok {
|
||
|
|
o.log.Printf("post-start probes passed")
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
if failure != lastFailure || time.Since(lastLogged) >= 30*time.Second {
|
||
|
|
remaining := time.Until(deadline).Round(time.Second)
|
||
|
|
if remaining < 0 {
|
||
|
|
remaining = 0
|
||
|
|
}
|
||
|
|
o.log.Printf("waiting for post-start probes (%s remaining): %s", remaining, failure)
|
||
|
|
lastLogged = time.Now()
|
||
|
|
}
|
||
|
|
lastFailure = failure
|
||
|
|
if time.Now().After(deadline) {
|
||
|
|
return fmt.Errorf("startup blocked: post-start probes did not pass within %s (%s)", wait, lastFailure)
|
||
|
|
}
|
||
|
|
select {
|
||
|
|
case <-ctx.Done():
|
||
|
|
return ctx.Err()
|
||
|
|
case <-time.After(poll):
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// postStartProbesReady runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string).
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string) {
|
||
|
|
probes := make([]string, 0, len(o.cfg.Startup.PostStartProbes))
|
||
|
|
for _, p := range o.cfg.Startup.PostStartProbes {
|
||
|
|
p = strings.TrimSpace(p)
|
||
|
|
if p != "" {
|
||
|
|
probes = append(probes, p)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if len(probes) == 0 {
|
||
|
|
return true, "no probes configured"
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, probe := range probes {
|
||
|
|
code, err := o.httpProbe(ctx, probe)
|
||
|
|
if err != nil {
|
||
|
|
return false, fmt.Sprintf("%s: %v", probe, err)
|
||
|
|
}
|
||
|
|
if !probeStatusAccepted(probe, code) {
|
||
|
|
return false, fmt.Sprintf("%s: unexpected status code=%d", probe, code)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return true, "all probes successful"
|
||
|
|
}
|
||
|
|
|
||
|
|
// probeStatusAccepted reports whether an HTTP status code counts as a
// successful probe. Any 2xx or 3xx is success; 401 and 403 also count,
// because auth fronts often return unauthorized/forbidden while still
// proving the service is up. The probe URL argument is currently unused.
func probeStatusAccepted(_ string, code int) bool {
	switch {
	case code >= 200 && code < 400:
		return true
	case code == 401, code == 403:
		return true
	default:
		return false
	}
}
|
||
|
|
|
||
|
|
// httpProbe runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error).
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error) {
|
||
|
|
out, err := o.run(
|
||
|
|
ctx,
|
||
|
|
20*time.Second,
|
||
|
|
"curl",
|
||
|
|
"--silent",
|
||
|
|
"--show-error",
|
||
|
|
"--location",
|
||
|
|
"--max-time",
|
||
|
|
"12",
|
||
|
|
"--output",
|
||
|
|
"/dev/null",
|
||
|
|
"--write-out",
|
||
|
|
"%{http_code}",
|
||
|
|
probeURL,
|
||
|
|
)
|
||
|
|
if err != nil {
|
||
|
|
return 0, err
|
||
|
|
}
|
||
|
|
code, convErr := strconv.Atoi(strings.TrimSpace(out))
|
||
|
|
if convErr != nil {
|
||
|
|
return 0, fmt.Errorf("parse http status %q: %w", strings.TrimSpace(out), convErr)
|
||
|
|
}
|
||
|
|
return code, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// resumeFluxAndReconcile runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error.
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error {
|
||
|
|
if err := o.patchFluxSuspendAll(ctx, false); err != nil {
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
now := time.Now().UTC().Format(time.RFC3339)
|
||
|
|
if _, err := o.kubectl(
|
||
|
|
ctx,
|
||
|
|
25*time.Second,
|
||
|
|
"-n", "flux-system",
|
||
|
|
"annotate",
|
||
|
|
"kustomizations.kustomize.toolkit.fluxcd.io",
|
||
|
|
"--all",
|
||
|
|
"reconcile.fluxcd.io/requestedAt="+now,
|
||
|
|
"--overwrite",
|
||
|
|
); err != nil {
|
||
|
|
o.log.Printf("warning: annotate kustomizations for reconcile failed: %v", err)
|
||
|
|
}
|
||
|
|
if _, err := o.kubectl(
|
||
|
|
ctx,
|
||
|
|
25*time.Second,
|
||
|
|
"annotate",
|
||
|
|
"--all-namespaces",
|
||
|
|
"helmreleases.helm.toolkit.fluxcd.io",
|
||
|
|
"--all",
|
||
|
|
"reconcile.fluxcd.io/requestedAt="+now,
|
||
|
|
"--overwrite",
|
||
|
|
); err != nil {
|
||
|
|
o.log.Printf("warning: annotate helmreleases for reconcile failed: %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
if o.runner.CommandExists("flux") {
|
||
|
|
sourceCmd := []string{"reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"}
|
||
|
|
if _, err := o.run(ctx, 75*time.Second, "flux", sourceCmd...); err != nil {
|
||
|
|
o.log.Printf("warning: flux command failed (%s): %v", strings.Join(sourceCmd, " "), err)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// kubectl runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) kubectl(ctx context.Context, timeout time.Duration, args ...string) (string, error).
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) kubectl(ctx context.Context, timeout time.Duration, args ...string) (string, error) {
|
||
|
|
return o.run(ctx, timeout, "kubectl", args...)
|
||
|
|
}
|
||
|
|
|
||
|
|
// ssh runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) ssh(ctx context.Context, node string, command string) (string, error).
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) ssh(ctx context.Context, node string, command string) (string, error) {
|
||
|
|
return o.sshWithTimeout(ctx, node, command, 45*time.Second)
|
||
|
|
}
|
||
|
|
|
||
|
|
// sshWithTimeout runs command on node over SSH within timeout.
//
// Connection strategy: it resolves the node's host and user from per-node
// config overrides, builds a base ssh argument list (batch mode, connect
// timeout, optional -F config / -i identity / -p port), then tries up to two
// paths in order — via the configured jump host first (when set), then a
// direct connection. A failure that looks host-key related triggers one
// known_hosts repair and a single retry of the same path.
//
// Returns the command output of the first successful attempt, or the output
// and error of the last attempt when all paths fail.
func (o *Orchestrator) sshWithTimeout(ctx context.Context, node string, command string, timeout time.Duration) (string, error) {
	// Per-node host override: the logical node name may map to a different
	// reachable address.
	host := node
	if mapped, ok := o.cfg.SSHNodeHosts[node]; ok && strings.TrimSpace(mapped) != "" {
		host = strings.TrimSpace(mapped)
	}
	// Per-node user override falls back to the cluster-wide SSH user.
	sshUser := o.cfg.SSHUser
	if override, ok := o.cfg.SSHNodeUsers[node]; ok && strings.TrimSpace(override) != "" {
		sshUser = strings.TrimSpace(override)
	}
	// Build the user@host target; bare host when no user is configured.
	target := host
	if sshUser != "" {
		target = sshUser + "@" + host
	}
	sshConfigFile := o.resolveSSHConfigFile()
	sshIdentity := o.resolveSSHIdentityFile()
	// BatchMode avoids interactive prompts; accept-new tolerates first-time
	// hosts without accepting changed keys.
	baseArgs := []string{
		"-o", "BatchMode=yes",
		"-o", "ConnectTimeout=8",
		"-o", "StrictHostKeyChecking=accept-new",
	}
	if sshConfigFile != "" {
		baseArgs = append(baseArgs, "-F", sshConfigFile)
	}
	if sshIdentity != "" {
		baseArgs = append(baseArgs, "-i", sshIdentity)
	}
	if o.cfg.SSHPort > 0 {
		baseArgs = append(baseArgs, "-p", strconv.Itoa(o.cfg.SSHPort))
	}
	// attempts/attemptNames are parallel slices: ordered connection paths
	// and their labels for logging.
	attempts := make([][]string, 0, 2)
	attemptNames := make([]string, 0, 2)
	knownHostsFiles := sshutil.KnownHostsFiles(sshConfigFile, sshIdentity)
	// Hosts whose known_hosts entries may need repair: the node's logical
	// name and resolved host, plus the jump host (and its mapping) below.
	repairHosts := []string{node, host}
	if o.cfg.SSHJumpHost != "" {
		jump := o.cfg.SSHJumpHost
		repairHosts = append(repairHosts, jump)
		if mapped, ok := o.cfg.SSHNodeHosts[jump]; ok && strings.TrimSpace(mapped) != "" {
			repairHosts = append(repairHosts, strings.TrimSpace(mapped))
		}
		if o.cfg.SSHJumpUser != "" {
			jump = o.cfg.SSHJumpUser + "@" + jump
		}
		// -J takes host:port; only append the port when none is present.
		if o.cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
			jump = fmt.Sprintf("%s:%d", jump, o.cfg.SSHPort)
		}
		// Copy baseArgs so the direct attempt below is not affected.
		withJump := append([]string{}, baseArgs...)
		withJump = append(withJump, "-J", jump, target, command)
		attempts = append(attempts, withJump)
		attemptNames = append(attemptNames, "jump")
	}
	direct := append([]string{}, baseArgs...)
	direct = append(direct, target, command)
	attempts = append(attempts, direct)
	attemptNames = append(attemptNames, "direct")

	var lastOut string
	var lastErr error
	for i, args := range attempts {
		out, err := o.run(ctx, timeout, "ssh", args...)
		if err == nil {
			// A later path succeeding means the earlier one failed; note it.
			if i > 0 {
				o.log.Printf("warning: ssh %s path failed for %s, using %s path", attemptNames[i-1], node, attemptNames[i])
			}
			return out, nil
		}
		// Host-key failures are recoverable: repair known_hosts entries and
		// retry this same path once before moving on.
		if sshutil.ShouldAttemptKnownHostsRepair(out, err) {
			o.log.Printf("warning: ssh failure on %s via %s path may be host-key related; repairing known_hosts and retrying once", node, attemptNames[i])
			sshutil.RepairKnownHosts(ctx, o.log, knownHostsFiles, repairHosts, o.cfg.SSHPort)
			retryOut, retryErr := o.run(ctx, timeout, "ssh", args...)
			if retryErr == nil {
				return retryOut, nil
			}
			// Carry the retry's result forward as this attempt's outcome.
			out = retryOut
			err = retryErr
		}
		lastOut = out
		lastErr = err
		if i < len(attempts)-1 {
			o.log.Printf("warning: ssh %s path failed for %s: %v; trying %s path", attemptNames[i], node, err, attemptNames[i+1])
		}
	}
	return lastOut, lastErr
}
|
||
|
|
|
||
|
|
// run runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) run(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error).
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) run(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
if o.runOverride != nil {
|
||
|
|
return o.runOverride(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||
|
|
defer cancel()
|
||
|
|
return o.runner.Run(runCtx, name, args...)
|
||
|
|
}
|
||
|
|
|
||
|
|
// runSensitive runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) runSensitive(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error).
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) runSensitive(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
if o.runSensitiveOverride != nil {
|
||
|
|
return o.runSensitiveOverride(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||
|
|
defer cancel()
|
||
|
|
|
||
|
|
cmd := exec.CommandContext(runCtx, name, args...)
|
||
|
|
cmd.Env = os.Environ()
|
||
|
|
if o.runner.Kubeconfig != "" {
|
||
|
|
cmd.Env = append(cmd.Env, "KUBECONFIG="+o.runner.Kubeconfig)
|
||
|
|
}
|
||
|
|
out, err := cmd.CombinedOutput()
|
||
|
|
trimmed := strings.TrimSpace(string(out))
|
||
|
|
if err != nil {
|
||
|
|
if trimmed == "" {
|
||
|
|
return "", fmt.Errorf("%s failed: %w", name, err)
|
||
|
|
}
|
||
|
|
return trimmed, fmt.Errorf("%s failed: %w", name, err)
|
||
|
|
}
|
||
|
|
return trimmed, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// lines splits in on newlines, trims whitespace from each line, and drops
// blank lines. It returns nil when the input is empty or all whitespace.
func lines(in string) []string {
	body := strings.TrimSpace(in)
	if body == "" {
		return nil
	}
	var out []string
	for _, raw := range strings.Split(body, "\n") {
		if cleaned := strings.TrimSpace(raw); cleaned != "" {
			out = append(out, cleaned)
		}
	}
	return out
}
|
||
|
|
|
||
|
|
// sshManaged runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) sshManaged(node string) bool.
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) sshManaged(node string) bool {
|
||
|
|
if len(o.cfg.SSHManagedNodes) == 0 {
|
||
|
|
return true
|
||
|
|
}
|
||
|
|
for _, allowed := range o.cfg.SSHManagedNodes {
|
||
|
|
if strings.TrimSpace(allowed) == node {
|
||
|
|
return true
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return false
|
||
|
|
}
|
||
|
|
|
||
|
|
// resolveSSHConfigFile runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) resolveSSHConfigFile() string.
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) resolveSSHConfigFile() string {
|
||
|
|
if strings.TrimSpace(o.cfg.SSHConfigFile) != "" {
|
||
|
|
return strings.TrimSpace(o.cfg.SSHConfigFile)
|
||
|
|
}
|
||
|
|
candidates := []string{
|
||
|
|
"/home/atlas/.ssh/config",
|
||
|
|
"/home/tethys/.ssh/config",
|
||
|
|
}
|
||
|
|
for _, p := range candidates {
|
||
|
|
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
|
||
|
|
return p
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return ""
|
||
|
|
}
|
||
|
|
|
||
|
|
// resolveSSHIdentityFile runs one orchestration or CLI step.
|
||
|
|
// Signature: (o *Orchestrator) resolveSSHIdentityFile() string.
|
||
|
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||
|
|
func (o *Orchestrator) resolveSSHIdentityFile() string {
|
||
|
|
if strings.TrimSpace(o.cfg.SSHIdentityFile) != "" {
|
||
|
|
return strings.TrimSpace(o.cfg.SSHIdentityFile)
|
||
|
|
}
|
||
|
|
candidates := []string{
|
||
|
|
"/home/atlas/.ssh/id_ed25519",
|
||
|
|
"/home/tethys/.ssh/id_ed25519",
|
||
|
|
}
|
||
|
|
for _, p := range candidates {
|
||
|
|
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
|
||
|
|
return p
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return ""
|
||
|
|
}
|