// File: ananke/internal/cluster/orchestrator_coordination.go
// Peer coordination, startup-intent guarding, and etcd snapshot
// verification helpers for the cluster orchestrator.

package cluster
import (
"context"
"fmt"
"path/filepath"
"strconv"
"strings"
"time"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// intentAge reports how long ago the intent record was last updated.
// An intent with a zero UpdatedAt yields a zero duration.
func intentAge(in state.Intent) time.Duration {
	updated := in.UpdatedAt
	if updated.IsZero() {
		return 0
	}
	return time.Since(updated)
}
// intentFresh reports whether the intent was updated within maxAge.
// An intent without a timestamp is treated as current.
func intentFresh(in state.Intent, maxAge time.Duration) bool {
	if in.UpdatedAt.IsZero() {
		return true
	}
	return maxAge >= intentAge(in)
}
// startupGuardAge returns how old a peer intent may be before the startup
// guard considers it stale. Falls back to 15 minutes when the configured
// value is missing or non-positive.
func (o *Orchestrator) startupGuardAge() time.Duration {
	const defaultGuardSeconds = 900 // 15 minutes
	secs := o.cfg.Coordination.StartupGuardMaxAgeSec
	if secs <= 0 {
		secs = defaultGuardSeconds
	}
	return time.Duration(secs) * time.Second
}
// startupShutdownCooldown returns the minimum delay required between a
// peer's completed shutdown and a local startup. Falls back to 45 seconds
// when the configured value is missing or non-positive.
func (o *Orchestrator) startupShutdownCooldown() time.Duration {
	const defaultCooldownSeconds = 45
	secs := o.cfg.Startup.ShutdownCooldownSeconds
	if secs <= 0 {
		secs = defaultCooldownSeconds
	}
	return time.Duration(secs) * time.Second
}
// coordinationPeers returns the deduplicated, trimmed list of peer hosts to
// coordinate with: every configured PeerHosts entry plus the optional
// ForwardShutdownHost. Order of first appearance is preserved; blanks are
// dropped.
func (o *Orchestrator) coordinationPeers() []string {
	capacity := len(o.cfg.Coordination.PeerHosts) + 1
	seen := make(map[string]struct{}, capacity)
	out := make([]string, 0, capacity)
	add := func(node string) {
		node = strings.TrimSpace(node)
		if node == "" {
			return
		}
		if _, dup := seen[node]; dup {
			return
		}
		seen[node] = struct{}{}
		out = append(out, node)
	}
	for _, node := range o.cfg.Coordination.PeerHosts {
		add(node)
	}
	// add already trims and skips empty values, so no pre-check is needed.
	add(o.cfg.Coordination.ForwardShutdownHost)
	return out
}
// guardPeerStartupIntents checks every coordination peer's published intent
// before local startup proceeds. A fresh conflicting intent (active shutdown,
// startup in progress, or a shutdown completed within the cooldown window)
// blocks startup with an error; stale intents are logged and, for startups,
// auto-cleared remotely on a best-effort basis so a crashed peer cannot
// wedge the cluster forever. Returns nil when startup may proceed.
func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
	peers := o.coordinationPeers()
	if len(peers) == 0 {
		return nil
	}
	guardAge := o.startupGuardAge()
	localRole := strings.ToLower(strings.TrimSpace(o.cfg.Coordination.Role))
	for _, peer := range peers {
		peerStatus, err := o.readRemotePeerStatus(ctx, peer)
		if err != nil {
			// Best-effort guard: an unreachable peer must not block startup.
			o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err)
			continue
		}
		intent := peerStatus.Intent
		switch intent.State {
		case "", state.IntentNormal:
			// No intent recorded, or peer is in its normal state: no conflict.
			continue
		case state.IntentShuttingDown:
			// A fresh shutdown intent blocks startup; a stale one is assumed abandoned.
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer)
		case state.IntentStartupInProgress:
			if !peerStatus.BootstrapActive {
				// Intent says "starting" but the bootstrap unit is not running:
				// treat the record as stale and clear it remotely (best effort).
				o.log.Printf("warning: peer %s reports startup_in_progress but bootstrap service is inactive (reason=%q age=%s); auto-clearing stale peer intent",
					peer, intent.Reason, intentAge(intent).Round(time.Second))
				o.bestEffort(fmt.Sprintf("clear stale peer startup intent on %s", peer), func() error {
					return o.clearRemotePeerIntent(ctx, peer, "auto-clear stale peer startup intent")
				})
				continue
			}
			// A coordinator is allowed to proceed alongside an operator-driven
			// ("manual-startup") peer startup.
			if localRole == "coordinator" && strings.EqualFold(strings.TrimSpace(intent.Reason), "manual-startup") {
				o.log.Printf("warning: peer %s has manual startup in progress (age=%s); allowing coordinator startup to continue",
					peer, intentAge(intent).Round(time.Second))
				continue
			}
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			// Stale startup intent: clear it remotely (best effort) and proceed.
			o.log.Printf("warning: peer %s startup intent appears stale; auto-clearing and allowing startup", peer)
			o.bestEffort(fmt.Sprintf("clear stale peer startup intent on %s", peer), func() error {
				return o.clearRemotePeerIntent(ctx, peer, "auto-clear stale peer startup intent")
			})
		case state.IntentShutdownComplete:
			// Enforce a cooldown window after a peer's completed shutdown.
			if intentFresh(intent, o.startupShutdownCooldown()) {
				return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
			}
		default:
			// Unknown states are tolerated so newer peers cannot brick older ones.
			o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State)
		}
	}
	return nil
}
// readRemotePeerStatus fetches a peer's coordination status over SSH: whether
// its ananke-bootstrap service is active (signaled via marker strings) and
// its current intent as printed by the remote ananke CLI. The node must be
// listed in ssh_managed_nodes.
func (o *Orchestrator) readRemotePeerStatus(ctx context.Context, node string) (remotePeerStatus, error) {
	if !o.sshManaged(node) {
		return remotePeerStatus{}, fmt.Errorf("%s is not in ssh_managed_nodes", node)
	}
	out, err := o.ssh(ctx, node, "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml")
	if err != nil {
		return remotePeerStatus{}, err
	}
	intent, parseErr := state.ParseIntentOutput(out)
	if parseErr != nil {
		return remotePeerStatus{}, fmt.Errorf("parse remote intent output: %w", parseErr)
	}
	return remotePeerStatus{
		BootstrapActive: strings.Contains(out, "__ANANKE_BOOTSTRAP_ACTIVE__"),
		Intent:          intent,
	}, nil
}
// clearRemotePeerIntent resets a peer's intent to normal over SSH, recording
// the given reason. The reason is shell-quoted before being embedded in the
// remote command line.
func (o *Orchestrator) clearRemotePeerIntent(ctx context.Context, node string, reason string) error {
	command := "sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml --set normal --reason " +
		shellQuote(reason) +
		" --source startup --execute"
	_, err := o.ssh(ctx, node, command)
	return err
}
// shellQuote wraps v in single quotes for safe interpolation into a POSIX
// shell command line, escaping embedded single quotes with the standard
// '"'"' idiom.
func shellQuote(v string) string {
	var b strings.Builder
	b.Grow(len(v) + 2)
	b.WriteByte('\'')
	for _, r := range v {
		if r == '\'' {
			b.WriteString(`'"'"'`)
		} else {
			b.WriteRune(r)
		}
	}
	b.WriteByte('\'')
	return b.String()
}
// verifyEtcdSnapshot sanity-checks an etcd snapshot on node over SSH: the
// file must be non-empty and at least 1 MiB, appear in `k3s etcd-snapshot ls`
// output, and hash to a well-formed sha256. Skipped entirely in dry-run mode.
// NOTE(review): the shellQuote'd path is embedded inside another
// single-quoted `sh -lc '...'` string, so the nested quoting only survives
// for paths without spaces or shell metacharacters — confirm snapshot paths
// are always plain before relying on this.
func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error {
	if o.runner.DryRun {
		return nil
	}
	path := strings.TrimSpace(snapshotPath)
	if path == "" {
		return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty")
	}
	quoted := shellQuote(path)
	// `test -s` rejects a missing/empty file; stat prints the size in bytes.
	sizeOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err)
	}
	size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64)
	if convErr != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr)
	}
	const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor.
	if size < minSnapshotBytes {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size)
	}
	// The snapshot must also be registered with k3s itself; accept a match on
	// either the full path or just the basename, since ls output format varies.
	lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err)
	}
	if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node)
	}
	// Only the first field (the digest) of sha256sum output is kept.
	sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err)
	}
	hash := strings.TrimSpace(sumOut)
	// A sha256 digest is exactly 64 hex characters.
	if len(hash) != 64 {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash)
	}
	o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12])
	return nil
}
// runSudoK3S executes `sudo -n k3s <args...>` on node over SSH, trying each
// known k3s binary location in order and returning the first successful
// output. If every candidate fails, the last error is returned.
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
	candidates := []string{"/usr/local/bin/k3s", "/usr/bin/k3s", "k3s"}
	var lastErr error
	for _, bin := range candidates {
		command := strings.Join(append([]string{"sudo", "-n", bin}, args...), " ")
		out, err := o.ssh(ctx, node, command)
		if err == nil {
			return out, nil
		}
		lastErr = err
	}
	// Unreachable while candidates is non-empty; kept as a defensive fallback.
	if lastErr == nil {
		lastErr = fmt.Errorf("no k3s executable candidates configured")
	}
	return "", lastErr
}
// controlPlaneUsesExternalDatastore reports whether the k3s server on node
// is configured with an external datastore (--datastore-endpoint=) rather
// than the embedded etcd, based on the unit definition from
// `systemctl cat k3s`.
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
	// Use non-interactive sudo (-n), consistent with every other remote
	// command in this file, so a password prompt can never hang the session.
	out, err := o.ssh(ctx, node, "sudo -n systemctl cat k3s")
	if err != nil {
		return false, fmt.Errorf("inspect k3s service on %s for datastore mode: %w", node, err)
	}
	return strings.Contains(out, "--datastore-endpoint="), nil
}
// waitForAPI polls `kubectl version` until the Kubernetes API answers,
// making up to attempts tries with a pause of sleep between them. It returns
// nil on first success, ctx.Err() if the context is canceled while waiting,
// and a timeout error once all attempts are exhausted. Dry-run mode skips
// the check entirely.
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
	if o.runner.DryRun {
		return nil
	}
	for i := 0; i < attempts; i++ {
		if _, err := o.kubectl(ctx, 5*time.Second, "version", "--request-timeout=5s"); err == nil {
			return nil
		}
		if i == attempts-1 {
			break // no point sleeping after the final attempt
		}
		// Honor cancellation during the back-off instead of blocking in Sleep.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(sleep):
		}
	}
	return fmt.Errorf("kubernetes API did not become reachable within timeout")
}