diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go
index a64a0eb..db56089 100644
--- a/internal/cluster/orchestrator.go
+++ b/internal/cluster/orchestrator.go
@@ -322,8 +322,7 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
 		})
 	}
 
-	restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
-	if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
+	if _, err := o.runSudoK3S(ctx, controlPlane, "server", "--cluster-reset", "--cluster-reset-restore-path", snapshotPath); err != nil {
 		return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
 	}
 	o.log.Printf("etcd restore command completed on %s", controlPlane)
@@ -881,7 +880,7 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
 		return fmt.Errorf("cannot run etcd snapshot on %s: node not in ssh_managed_nodes", node)
 	}
 	name := "pre-shutdown-" + time.Now().UTC().Format("20060102-150405")
-	_, err := o.ssh(ctx, node, "sudo k3s etcd-snapshot save --name "+name)
+	_, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "save", "--name", name)
 	return err
 }
 
@@ -889,18 +888,115 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
 	if !o.sshManaged(node) {
 		return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
 	}
-	cmd := `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
-	out, err := o.ssh(ctx, node, cmd)
+	out, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
 	if err != nil {
 		return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
 	}
-	snapshot := strings.TrimSpace(out)
+	snapshot := parseSnapshotPathFromEtcdSnapshotList(out)
 	if snapshot == "" {
 		return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
 	}
 	return snapshot, nil
 }
 
+// parseSnapshotPathFromEtcdSnapshotList extracts the path of the most recent
+// snapshot from the output of `k3s etcd-snapshot ls`.
+//
+// A row contributes a candidate when one of its whitespace-separated fields
+// (after stripping quotes, commas and a file:// scheme) lies under the local
+// snapshot directory. When rows also carry an RFC 3339 timestamp field, the
+// row with the newest timestamp wins; the listing order is not assumed to be
+// newest-first. If no row has a parseable timestamp, the first candidate in
+// listing order is returned. It returns "" when no candidate is found.
+func parseSnapshotPathFromEtcdSnapshotList(out string) string {
+	const snapshotDir = "/var/lib/rancher/k3s/server/db/snapshots/"
+	var (
+		firstPath     string
+		latestPath    string
+		latestCreated time.Time
+	)
+	for _, line := range lines(out) {
+		trimmed := strings.TrimSpace(line)
+		if trimmed == "" {
+			continue
+		}
+		// Skip the table header row.
+		lower := strings.ToLower(trimmed)
+		if strings.HasPrefix(lower, "name") && strings.Contains(lower, "location") {
+			continue
+		}
+		var (
+			path    string
+			created time.Time
+		)
+		for _, field := range strings.Fields(trimmed) {
+			candidate := strings.Trim(field, "\",")
+			if ts, err := time.Parse(time.RFC3339, candidate); err == nil {
+				created = ts
+				continue
+			}
+			candidate = strings.TrimPrefix(candidate, "file://")
+			if path == "" && strings.HasPrefix(candidate, snapshotDir) {
+				path = candidate
+			}
+		}
+		if path == "" {
+			continue
+		}
+		if firstPath == "" {
+			firstPath = path
+		}
+		if !created.IsZero() && (latestPath == "" || created.After(latestCreated)) {
+			latestPath, latestCreated = path, created
+		}
+	}
+	if latestPath != "" {
+		return latestPath
+	}
+	return firstPath
+}
+
+// runSudoK3S runs `sudo -n <k3s> <args...>` on node over SSH and returns the
+// command output.
+//
+// The k3s binary is looked up at the well-known absolute install paths first
+// and falls back to a bare `k3s` resolved by sudo. Lookup uses a side-effect
+// free `command -v` probe so that the real command is executed at most once:
+// retrying a failed `server --cluster-reset` against the next candidate would
+// re-run a destructive operation and hide the original error.
+//
+// Every argument is single-quoted for the remote shell, so values such as
+// snapshot paths cannot be word-split or interpreted by it.
+func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
+	quote := func(s string) string {
+		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
+	}
+	quoted := make([]string, 0, len(args))
+	for _, arg := range args {
+		quoted = append(quoted, quote(arg))
+	}
+
+	// NOTE(review): assumes o.ssh reports a non-zero remote exit status as an
+	// error, which the previous per-candidate fallback relied on as well.
+	binary := "k3s"
+	for _, path := range []string{"/usr/local/bin/k3s", "/usr/bin/k3s"} {
+		if err := ctx.Err(); err != nil {
+			return "", err
+		}
+		if _, err := o.ssh(ctx, node, "command -v "+quote(path)+" >/dev/null 2>&1"); err == nil {
+			binary = path
+			break
+		}
+	}
+
+	parts := append([]string{"sudo", "-n", quote(binary)}, quoted...)
+	out, err := o.ssh(ctx, node, strings.Join(parts, " "))
+	if err != nil {
+		return "", err
+	}
+	return out, nil
+}
+
 func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
 	out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
 	if err != nil {
@@ -1156,7 +1252,6 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
 	sem := make(chan struct{}, parallelism)
 	var wg sync.WaitGroup
 	errCh := make(chan error, len(nodes))
-	cmd := `sudo sh -lc 'id atlas >/dev/null 2>&1 || useradd -m -s /bin/bash atlas || true; install -d -m 0755 /etc/sudoers.d; printf "%s\n" "atlas ALL=(ALL) NOPASSWD: /usr/bin/systemctl, /usr/sbin/poweroff, /sbin/poweroff, /usr/local/bin/hecate" > /etc/sudoers.d/90-hecate-atlas; chmod 0440 /etc/sudoers.d/90-hecate-atlas; if command -v visudo >/dev/null 2>&1; then visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null; fi'`
 	for _, node := range nodes {
 		node := strings.TrimSpace(node)
 		if node == "" ||
!o.sshManaged(node) {
@@ -1167,8 +1208,8 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
 			defer wg.Done()
 			sem <- struct{}{}
 			defer func() { <-sem }()
-			if _, err := o.ssh(ctx, node, cmd); err != nil {
-				errCh <- fmt.Errorf("%s: %w", node, err)
+			if _, err := o.ssh(ctx, node, "sudo -n /usr/bin/systemctl --version"); err != nil {
+				errCh <- fmt.Errorf("%s: missing sudo access to /usr/bin/systemctl (--version): %w", node, err)
 			}
 		}()
 	}
@@ -1184,7 +1225,7 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
 			break
 		}
 	}
-	return fmt.Errorf("access reconcile had %d errors (first: %s)", len(errCh), strings.Join(samples, " | "))
+	return fmt.Errorf("access validation had %d errors (first: %s)", len(errCh), strings.Join(samples, " | "))
 }
 
 func (o *Orchestrator) fluxSourceReady(ctx context.Context) (bool, error) {
diff --git a/scripts/install.sh b/scripts/install.sh
index da9f8aa..572f6e4 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -164,6 +164,52 @@ ensure_hecate_kubeconfig() {
   echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
 }
 
+# ensure_hecate_ssh_identity makes sure the SSH identity configured as
+# ssh_identity_file (default /home/atlas/.ssh/id_ed25519) exists, generating
+# an unencrypted ed25519 key pair when it is missing or empty. The key owner
+# is derived from the path: /home/<user>/... belongs to <user>, anything else
+# to root. Bootstrap is skipped with a warning when that user does not exist.
+ensure_hecate_ssh_identity() {
+  local key_path key_dir key_user key_group key_comment
+  key_path="$(migration_yaml_lookup "ssh_identity_file")"
+  if [[ -z "${key_path}" ]]; then
+    key_path="/home/atlas/.ssh/id_ed25519"
+  fi
+  key_dir="$(dirname "${key_path}")"
+  key_comment="hecate-$(hostname)-forward"
+
+  key_user="root"
+  if [[ "${key_path}" == /home/*/* ]]; then
+    key_user="${key_path#/home/}"
+    key_user="${key_user%%/*}"
+  fi
+
+  if ! id "${key_user}" >/dev/null 2>&1; then
+    echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
+    return 0
+  fi
+  # The primary group is not necessarily named after the user.
+  key_group="$(id -gn "${key_user}")"
+
+  install -d -m 0700 -o "${key_user}" -g "${key_group}" "${key_dir}"
+  if [[ ! -s "${key_path}" ]]; then
+    echo "[install] generating missing SSH identity at ${key_path}"
+    # Clear an empty leftover first: ssh-keygen prompts before overwriting an
+    # existing file, which would stall or fail a non-interactive install.
+    rm -f "${key_path}"
+    if [[ "${key_user}" == "root" ]]; then
+      ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
+    else
+      runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
+    fi
+  fi
+  chown "${key_user}:${key_group}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
+  chmod 0600 "${key_path}" || true
+  if [[ -e "${key_path}.pub" ]]; then
+    chmod 0644 "${key_path}.pub" || true
+  fi
+}
+
 migrate_hecate_config() {
   if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
     return 0
@@ -562,6 +608,7 @@ else
   echo "[install] keeping existing config at ${CONF_DIR}/hecate.yaml"
 fi
 migrate_hecate_config
+ensure_hecate_ssh_identity
 ensure_hecate_kubeconfig
 
 echo "[install] installing systemd units"