hecate: harden startup ssh access checks and k3s command paths

This commit is contained in:
Brad Stein 2026-04-05 10:03:15 -03:00
parent ae5220ff9d
commit d2526edf0e
2 changed files with 86 additions and 10 deletions

View File

@ -322,8 +322,7 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
})
}
restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
if _, err := o.runSudoK3S(ctx, controlPlane, "server", "--cluster-reset", "--cluster-reset-restore-path", snapshotPath); err != nil {
return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
}
o.log.Printf("etcd restore command completed on %s", controlPlane)
@ -881,7 +880,7 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
return fmt.Errorf("cannot run etcd snapshot on %s: node not in ssh_managed_nodes", node)
}
name := "pre-shutdown-" + time.Now().UTC().Format("20060102-150405")
_, err := o.ssh(ctx, node, "sudo k3s etcd-snapshot save --name "+name)
_, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "save", "--name", name)
return err
}
@ -889,18 +888,61 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
if !o.sshManaged(node) {
return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
}
cmd := `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
out, err := o.ssh(ctx, node, cmd)
out, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
if err != nil {
return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
}
snapshot := strings.TrimSpace(out)
snapshot := parseSnapshotPathFromEtcdSnapshotList(out)
if snapshot == "" {
return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
}
return snapshot, nil
}
// parseSnapshotPathFromEtcdSnapshotList extracts the path of the most recent
// local snapshot from the text printed by `k3s etcd-snapshot ls`.
//
// Each non-blank line that is not the "Name ... Location ..." header is split
// into whitespace-separated fields. A field is a snapshot path when, after
// stripping surrounding double quotes/commas and an optional "file://" scheme,
// it starts with the k3s snapshot directory. If the same line also carries an
// RFC 3339 timestamp field (the "Created" column of the tabular output), that
// timestamp is used to rank the snapshot.
//
// The path with the newest timestamp is returned. The listing order is not
// relied upon, because the caller wants the latest snapshot and the listing
// is not guaranteed to be newest-first. When no line carries a parseable
// timestamp the first path found is returned. An empty string means the
// output contained no snapshot path at all.
func parseSnapshotPathFromEtcdSnapshotList(out string) string {
	const snapshotDir = "/var/lib/rancher/k3s/server/db/snapshots/"

	var (
		firstPath  string    // fallback when no row can be dated
		newestPath string    // best dated candidate so far
		newestAt   time.Time // creation time of newestPath
	)
	for _, line := range lines(out) {
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}
		// Skip the table header ("Name  Location  Size  Created").
		lower := strings.ToLower(trimmed)
		if strings.HasPrefix(lower, "name") && strings.Contains(lower, "location") {
			continue
		}

		var (
			path    string
			created time.Time
		)
		for _, field := range strings.Fields(trimmed) {
			candidate := strings.Trim(field, "\",")
			if path == "" {
				if p := strings.TrimPrefix(candidate, "file://"); strings.HasPrefix(p, snapshotDir) {
					path = p
					continue
				}
			}
			// Keep the last timestamp on the line: in the tabular output the
			// creation time is the final column.
			if ts, err := time.Parse(time.RFC3339, candidate); err == nil {
				created = ts
			}
		}
		if path == "" {
			continue
		}
		if firstPath == "" {
			firstPath = path
		}
		if !created.IsZero() && (newestPath == "" || created.After(newestAt)) {
			newestPath, newestAt = path, created
		}
	}
	if newestPath != "" {
		return newestPath
	}
	return firstPath
}
// runSudoK3S runs `sudo -n <k3s> <args...>` on node over SSH and returns the
// command output.
//
// The k3s binary is tried at /usr/local/bin/k3s, then /usr/bin/k3s, then as a
// bare `k3s` resolved by the remote PATH; the output of the first attempt that
// succeeds is returned. If every attempt fails, the error of the last attempt
// is returned.
//
// Each argument is quoted for a POSIX shell before being joined into the
// command line, so values such as snapshot paths containing spaces or shell
// metacharacters reach k3s as a single, literal argument.
//
// Once ctx is done no further candidates are tried and the error from the
// attempt that observed the cancellation is returned.
//
// NOTE(review): any failure, not only "binary not found", moves on to the
// next candidate, so a command that genuinely fails may be executed up to
// three times. Confirm this is acceptable for non-idempotent subcommands.
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
	k3sPaths := []string{
		"/usr/local/bin/k3s",
		"/usr/bin/k3s",
		"k3s",
	}

	quoted := make([]string, 0, len(args))
	for _, arg := range args {
		quoted = append(quoted, quoteK3SShellArg(arg))
	}

	var lastErr error
	for _, path := range k3sPaths {
		parts := make([]string, 0, 3+len(quoted))
		parts = append(parts, "sudo", "-n", path)
		parts = append(parts, quoted...)
		out, err := o.ssh(ctx, node, strings.Join(parts, " "))
		if err == nil {
			return out, nil
		}
		lastErr = err
		// A cancelled or expired context cannot succeed on another path.
		if ctx.Err() != nil {
			break
		}
	}
	if lastErr == nil {
		lastErr = fmt.Errorf("no k3s executable candidates configured")
	}
	return "", lastErr
}

// quoteK3SShellArg returns arg in a form a POSIX shell reads back as exactly
// one word. Arguments made only of letters, digits and "_@%+=:,./-" are
// returned unchanged; anything else (including the empty string) is wrapped in
// single quotes with embedded single quotes escaped as '\''.
func quoteK3SShellArg(arg string) string {
	if arg == "" {
		return "''"
	}
	safe := true
	for i := 0; i < len(arg) && safe; i++ {
		c := arg[i]
		switch {
		case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
		case strings.IndexByte("_@%+=:,./-", c) >= 0:
		default:
			safe = false
		}
	}
	if safe {
		return arg
	}
	return "'" + strings.ReplaceAll(arg, "'", `'\''`) + "'"
}
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
if err != nil {
@ -1156,7 +1198,6 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
sem := make(chan struct{}, parallelism)
var wg sync.WaitGroup
errCh := make(chan error, len(nodes))
cmd := `sudo sh -lc 'id atlas >/dev/null 2>&1 || useradd -m -s /bin/bash atlas || true; install -d -m 0755 /etc/sudoers.d; printf "%s\n" "atlas ALL=(ALL) NOPASSWD: /usr/bin/systemctl, /usr/sbin/poweroff, /sbin/poweroff, /usr/local/bin/hecate" > /etc/sudoers.d/90-hecate-atlas; chmod 0440 /etc/sudoers.d/90-hecate-atlas; if command -v visudo >/dev/null 2>&1; then visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null; fi'`
for _, node := range nodes {
node := strings.TrimSpace(node)
if node == "" || !o.sshManaged(node) {
@ -1167,8 +1208,8 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
defer wg.Done()
sem <- struct{}{}
defer func() { <-sem }()
if _, err := o.ssh(ctx, node, cmd); err != nil {
errCh <- fmt.Errorf("%s: %w", node, err)
if _, err := o.ssh(ctx, node, "sudo -n /usr/bin/systemctl --version"); err != nil {
errCh <- fmt.Errorf("%s: missing sudo access to /usr/bin/systemctl (--version): %w", node, err)
}
}()
}
@ -1184,7 +1225,7 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
break
}
}
return fmt.Errorf("access reconcile had %d errors (first: %s)", len(errCh), strings.Join(samples, " | "))
return fmt.Errorf("access validation had %d errors (first: %s)", len(errCh), strings.Join(samples, " | "))
}
func (o *Orchestrator) fluxSourceReady(ctx context.Context) (bool, error) {

View File

@ -164,6 +164,40 @@ ensure_hecate_kubeconfig() {
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
# ensure_hecate_ssh_identity
#
# Makes sure the SSH identity file named by the "ssh_identity_file" config key
# exists (default: /home/atlas/.ssh/id_ed25519), generating a passphrase-less
# ed25519 key pair when the private key is missing or empty, and then
# normalises ownership and permissions of the key directory and key files.
#
# The owner is derived from the path: /home/<user>/... belongs to <user>,
# anything else to root. If that user does not exist a warning is printed and
# the function returns 0 without touching anything.
ensure_hecate_ssh_identity() {
  local key_path key_dir key_user key_group key_comment
  key_path="$(migration_yaml_lookup "ssh_identity_file")"
  if [[ -z "${key_path}" ]]; then
    key_path="/home/atlas/.ssh/id_ed25519"
  fi
  key_dir="$(dirname "${key_path}")"
  key_comment="hecate-$(hostname)-forward"

  key_user="root"
  if [[ "${key_path}" == /home/*/* ]]; then
    # Take the first path component after /home/ as the owning user.
    key_user="${key_path#/home/}"
    key_user="${key_user%%/*}"
  fi

  if ! id "${key_user}" >/dev/null 2>&1; then
    echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
    return 0
  fi

  # The user's primary group is not necessarily named after the user, so ask
  # for it instead of assuming; fall back to the user name if the lookup fails.
  key_group="$(id -gn "${key_user}" 2>/dev/null || true)"
  if [[ -z "${key_group}" ]]; then
    key_group="${key_user}"
  fi

  install -d -m 0700 -o "${key_user}" -g "${key_group}" "${key_dir}"

  if [[ ! -s "${key_path}" ]]; then
    echo "[install] generating missing SSH identity at ${key_path}"
    if [[ "${key_user}" == "root" ]]; then
      ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
    else
      # Generate as the owning user so the files are created with that owner.
      runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
    fi
  fi

  # Best effort: a pre-existing key may lack a .pub file, so failures here are
  # not fatal.
  chown "${key_user}:${key_group}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
  chmod 0600 "${key_path}" || true
  chmod 0644 "${key_path}.pub" || true
}
migrate_hecate_config() {
if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
return 0
@ -562,6 +596,7 @@ else
echo "[install] keeping existing config at ${CONF_DIR}/hecate.yaml"
fi
migrate_hecate_config
ensure_hecate_ssh_identity
ensure_hecate_kubeconfig
echo "[install] installing systemd units"