package cluster

import (
	"context"
	"fmt"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/state"
)

// intentAge reports how long ago the intent record was last updated. A zero
// UpdatedAt yields zero, which callers treat as "age unknown".
func intentAge(in state.Intent) time.Duration {
	if in.UpdatedAt.IsZero() {
		return 0
	}
	return time.Since(in.UpdatedAt)
}

// intentFresh reports whether the intent was updated within maxAge. Intents
// without a timestamp are treated as fresh so they are never discarded as
// stale.
func intentFresh(in state.Intent, maxAge time.Duration) bool {
	if in.UpdatedAt.IsZero() {
		return true
	}
	return intentAge(in) <= maxAge
}
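
// Illustrative only (values are hypothetical, not from this codebase): with
// a 15-minute window, an intent updated 10 minutes ago is fresh and one
// updated 20 minutes ago is stale:
//
//	intentFresh(state.Intent{UpdatedAt: time.Now().Add(-10 * time.Minute)}, 15*time.Minute) // true
//	intentFresh(state.Intent{UpdatedAt: time.Now().Add(-20 * time.Minute)}, 15*time.Minute) // false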

// startupGuardAge returns how long a peer's startup or shutdown intent is
// honored before it is considered stale, defaulting to 900 seconds (15
// minutes) when StartupGuardMaxAgeSec is unset or non-positive.
func (o *Orchestrator) startupGuardAge() time.Duration {
	seconds := o.cfg.Coordination.StartupGuardMaxAgeSec
	if seconds <= 0 {
		seconds = 900
	}
	return time.Duration(seconds) * time.Second
}

// startupShutdownCooldown returns how long startup must wait after a peer
// reports a completed shutdown, defaulting to 45 seconds when
// ShutdownCooldownSeconds is unset or non-positive.
func (o *Orchestrator) startupShutdownCooldown() time.Duration {
	seconds := o.cfg.Startup.ShutdownCooldownSeconds
	if seconds <= 0 {
		seconds = 45
	}
	return time.Duration(seconds) * time.Second
}

// coordinationPeers returns the deduplicated, order-preserving list of peer
// hosts to consult during startup coordination: every configured peer host
// plus the forward-shutdown host, with blank entries skipped.
func (o *Orchestrator) coordinationPeers() []string {
	seen := map[string]struct{}{}
	out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
	add := func(node string) {
		node = strings.TrimSpace(node)
		if node == "" {
			return
		}
		if _, ok := seen[node]; ok {
			return
		}
		seen[node] = struct{}{}
		out = append(out, node)
	}
	for _, node := range o.cfg.Coordination.PeerHosts {
		add(node)
	}
	if strings.TrimSpace(o.cfg.Coordination.ForwardShutdownHost) != "" {
		add(o.cfg.Coordination.ForwardShutdownHost)
	}
	return out
}
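
// For illustration (hypothetical config values): with PeerHosts
// ["nodeA", " nodeB ", "nodeA"] and ForwardShutdownHost "nodeC", the result
// is ["nodeA", "nodeB", "nodeC"]: entries are trimmed, duplicates dropped,
// and first-seen order preserved.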

// guardPeerStartupIntents blocks local startup while any coordination peer
// holds a fresh shutdown or startup intent. Stale intents are logged and,
// where safe, auto-cleared; peers that cannot be queried are skipped with a
// warning rather than blocking startup.
func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
	peers := o.coordinationPeers()
	if len(peers) == 0 {
		return nil
	}
	guardAge := o.startupGuardAge()
	localRole := strings.ToLower(strings.TrimSpace(o.cfg.Coordination.Role))
	for _, peer := range peers {
		peerStatus, err := o.readRemotePeerStatus(ctx, peer)
		if err != nil {
			o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err)
			continue
		}
		intent := peerStatus.Intent
		switch intent.State {
		case "", state.IntentNormal:
			continue
		case state.IntentShuttingDown:
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer)
		case state.IntentStartupInProgress:
			if !peerStatus.BootstrapActive {
				o.log.Printf("warning: peer %s reports startup_in_progress but bootstrap service is inactive (reason=%q age=%s); auto-clearing stale peer intent",
					peer, intent.Reason, intentAge(intent).Round(time.Second))
				o.bestEffort(fmt.Sprintf("clear stale peer startup intent on %s", peer), func() error {
					return o.clearRemotePeerIntent(ctx, peer, "auto-clear stale peer startup intent")
				})
				continue
			}
			if localRole == "coordinator" && strings.EqualFold(strings.TrimSpace(intent.Reason), "manual-startup") {
				o.log.Printf("warning: peer %s has manual startup in progress (age=%s); allowing coordinator startup to continue",
					peer, intentAge(intent).Round(time.Second))
				continue
			}
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s startup intent appears stale; auto-clearing and allowing startup", peer)
			o.bestEffort(fmt.Sprintf("clear stale peer startup intent on %s", peer), func() error {
				return o.clearRemotePeerIntent(ctx, peer, "auto-clear stale peer startup intent")
			})
		case state.IntentShutdownComplete:
			if intentFresh(intent, o.startupShutdownCooldown()) {
				return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
			}
		default:
			o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State)
		}
	}
	return nil
}
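
// Decision summary (editorial aid derived from the switch above):
//
//	"" / normal          -> proceed
//	shutting_down        -> block while fresh; warn and proceed once stale
//	startup_in_progress  -> proceed if bootstrap is inactive (auto-clear) or a
//	                        coordinator sees a manual-startup peer; block while
//	                        fresh; otherwise auto-clear and proceed
//	shutdown_complete    -> block until the shutdown cooldown has elapsed
//	anything else        -> warn and ignore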

// readRemotePeerStatus queries a peer over SSH for its bootstrap service
// state and current intent. It refuses nodes outside ssh_managed_nodes and
// uses sentinel markers so the is-active probe and the intent output can be
// read from a single round trip.
func (o *Orchestrator) readRemotePeerStatus(ctx context.Context, node string) (remotePeerStatus, error) {
	if !o.sshManaged(node) {
		return remotePeerStatus{}, fmt.Errorf("%s is not in ssh_managed_nodes", node)
	}
	out, err := o.ssh(ctx, node, "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml")
	if err != nil {
		return remotePeerStatus{}, err
	}
	status := remotePeerStatus{
		BootstrapActive: strings.Contains(out, "__ANANKE_BOOTSTRAP_ACTIVE__"),
	}
	in, err := state.ParseIntentOutput(out)
	if err != nil {
		return remotePeerStatus{}, fmt.Errorf("parse remote intent output: %w", err)
	}
	status.Intent = in
	return status, nil
}

// clearRemotePeerIntent resets a peer's intent to normal over SSH, recording
// the given reason with source=startup. The reason is shell-quoted before
// interpolation into the remote command.
func (o *Orchestrator) clearRemotePeerIntent(ctx context.Context, node string, reason string) error {
	cmd := fmt.Sprintf(
		"sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml --set normal --reason %s --source startup --execute",
		shellQuote(reason),
	)
	_, err := o.ssh(ctx, node, cmd)
	return err
}

// shellQuote wraps v in single quotes for safe use in a POSIX shell command,
// escaping embedded single quotes with the standard '"'"' idiom.
func shellQuote(v string) string {
	return "'" + strings.ReplaceAll(v, "'", `'"'"'`) + "'"
}
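
// For example, shellQuote("can't") returns `'can'"'"'t'`: the shell
// concatenates 'can', a double-quoted single quote, and 't' back into the
// original string.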

// verifyEtcdSnapshot checks that an etcd snapshot on the node is non-empty,
// at least 1 MiB, listed by `k3s etcd-snapshot ls`, and has a well-formed
// sha256, then logs the verified size and hash prefix. Dry-run mode skips
// all checks.
func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error {
	if o.runner.DryRun {
		return nil
	}
	path := strings.TrimSpace(snapshotPath)
	if path == "" {
		return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty")
	}
	quoted := shellQuote(path)
	sizeOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err)
	}
	size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64)
	if convErr != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr)
	}
	const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor.
	if size < minSnapshotBytes {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size)
	}
	lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err)
	}
	if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node)
	}
	sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err)
	}
	hash := strings.TrimSpace(sumOut)
	if len(hash) != 64 {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash)
	}
	o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12])
	return nil
}
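
// Illustrative failure modes (not exhaustive): an empty file fails the
// `test -s` gate, a 4 KiB file fails the 1 MiB floor, and a file absent from
// `k3s etcd-snapshot ls` fails the listing check even when it exists on disk.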

// runSudoK3S runs a k3s subcommand on the node via sudo -n, trying each
// known binary location (/usr/local/bin/k3s, /usr/bin/k3s, then bare k3s on
// PATH) and returning the first successful output.
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
	k3sPaths := []string{
		"/usr/local/bin/k3s",
		"/usr/bin/k3s",
		"k3s",
	}
	var lastErr error
	for _, path := range k3sPaths {
		parts := []string{"sudo", "-n", path}
		parts = append(parts, args...)
		command := strings.Join(parts, " ")
		out, err := o.ssh(ctx, node, command)
		if err == nil {
			return out, nil
		}
		lastErr = err
	}
	if lastErr == nil {
		lastErr = fmt.Errorf("no k3s executable candidates configured")
	}
	return "", lastErr
}

// controlPlaneUsesExternalDatastore reports whether the node's k3s service
// is configured with --datastore-endpoint (an external datastore) rather
// than the embedded etcd. The sudo call uses -n, matching every other remote
// sudo in this file, so it fails fast instead of hanging on a password prompt.
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
	out, err := o.ssh(ctx, node, "sudo -n systemctl cat k3s")
	if err != nil {
		return false, fmt.Errorf("inspect k3s service on %s for datastore mode: %w", node, err)
	}
	return strings.Contains(out, "--datastore-endpoint="), nil
}
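
// For example, a unit file whose ExecStart includes a flag such as
// "--datastore-endpoint=postgres://..." (endpoint illustrative) is treated
// as external; absence of the flag implies the embedded etcd datastore.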

// waitForAPI polls `kubectl version` until the Kubernetes API answers, the
// attempt budget is exhausted, or the context is cancelled. Dry-run mode
// returns immediately.
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
	if o.runner.DryRun {
		return nil
	}
	for i := 0; i < attempts; i++ {
		_, err := o.kubectl(ctx, 5*time.Second, "version", "--request-timeout=5s")
		if err == nil {
			return nil
		}
		// Honor context cancellation during the backoff instead of a bare
		// time.Sleep, so shutdown is not delayed by up to a full interval.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(sleep):
		}
	}
	return fmt.Errorf("kubernetes API did not become reachable within timeout")
}
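
// A typical call (attempt budget and interval are illustrative, not
// prescribed by this file):
//
//	if err := o.waitForAPI(ctx, 60, 5*time.Second); err != nil {
//		return fmt.Errorf("control plane not ready: %w", err)
//	}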