hecate: harden startup ssh access checks and k3s command paths
This commit is contained in:
parent
ae5220ff9d
commit
d2526edf0e
@ -322,8 +322,7 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
|
if _, err := o.runSudoK3S(ctx, controlPlane, "server", "--cluster-reset", "--cluster-reset-restore-path", snapshotPath); err != nil {
|
||||||
if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
|
|
||||||
return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
|
return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
|
||||||
}
|
}
|
||||||
o.log.Printf("etcd restore command completed on %s", controlPlane)
|
o.log.Printf("etcd restore command completed on %s", controlPlane)
|
||||||
@ -881,7 +880,7 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
|
|||||||
return fmt.Errorf("cannot run etcd snapshot on %s: node not in ssh_managed_nodes", node)
|
return fmt.Errorf("cannot run etcd snapshot on %s: node not in ssh_managed_nodes", node)
|
||||||
}
|
}
|
||||||
name := "pre-shutdown-" + time.Now().UTC().Format("20060102-150405")
|
name := "pre-shutdown-" + time.Now().UTC().Format("20060102-150405")
|
||||||
_, err := o.ssh(ctx, node, "sudo k3s etcd-snapshot save --name "+name)
|
_, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "save", "--name", name)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -889,18 +888,61 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
|
|||||||
if !o.sshManaged(node) {
|
if !o.sshManaged(node) {
|
||||||
return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
|
return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
|
||||||
}
|
}
|
||||||
cmd := `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
|
out, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
|
||||||
out, err := o.ssh(ctx, node, cmd)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
|
return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
|
||||||
}
|
}
|
||||||
snapshot := strings.TrimSpace(out)
|
snapshot := parseSnapshotPathFromEtcdSnapshotList(out)
|
||||||
if snapshot == "" {
|
if snapshot == "" {
|
||||||
return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
|
return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
|
||||||
}
|
}
|
||||||
return snapshot, nil
|
return snapshot, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseSnapshotPathFromEtcdSnapshotList extracts the path of the most recent
// local snapshot from the text printed by `k3s etcd-snapshot ls`.
//
// Each non-empty line is split on whitespace. A field counts as a snapshot
// path when, after stripping surrounding quotes/commas and an optional
// "file://" scheme, it lives under the k3s snapshot directory. A field that
// parses as an RFC 3339 timestamp is taken as that row's creation time.
//
// The row with the latest timestamp wins. The listing order is not relied on,
// because it is not guaranteed to be newest-first, so returning the first row
// could hand a restore the oldest snapshot. When no row carries a parseable
// timestamp, the first local path is returned. An empty string means no local
// snapshot was listed.
func parseSnapshotPathFromEtcdSnapshotList(out string) string {
	const snapshotDir = "/var/lib/rancher/k3s/server/db/snapshots/"

	var (
		firstPath   string
		newestPath  string
		newestStamp time.Time
	)
	for _, line := range strings.Split(out, "\n") {
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}
		// Skip the table header ("Name Location Size Created").
		lower := strings.ToLower(trimmed)
		if strings.HasPrefix(lower, "name") && strings.Contains(lower, "location") {
			continue
		}

		var (
			path    string
			created time.Time
			dated   bool
		)
		for _, field := range strings.Fields(trimmed) {
			candidate := strings.Trim(field, "\",")
			if local := strings.TrimPrefix(candidate, "file://"); path == "" && strings.HasPrefix(local, snapshotDir) {
				path = local
				continue
			}
			if dated {
				continue
			}
			if stamp, err := time.Parse(time.RFC3339, candidate); err == nil {
				created, dated = stamp, true
			}
		}
		if path == "" {
			// Not a local snapshot row (for example an s3:// location or a log line).
			continue
		}
		if firstPath == "" {
			firstPath = path
		}
		// Strict After keeps the earliest-listed row when timestamps tie.
		if dated && (newestPath == "" || created.After(newestStamp)) {
			newestPath, newestStamp = path, created
		}
	}
	if newestPath != "" {
		return newestPath
	}
	return firstPath
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
|
||||||
|
k3sPaths := []string{
|
||||||
|
"/usr/local/bin/k3s",
|
||||||
|
"/usr/bin/k3s",
|
||||||
|
"k3s",
|
||||||
|
}
|
||||||
|
var lastErr error
|
||||||
|
for _, path := range k3sPaths {
|
||||||
|
parts := []string{"sudo", "-n", path}
|
||||||
|
parts = append(parts, args...)
|
||||||
|
command := strings.Join(parts, " ")
|
||||||
|
out, err := o.ssh(ctx, node, command)
|
||||||
|
if err == nil {
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
lastErr = err
|
||||||
|
}
|
||||||
|
if lastErr == nil {
|
||||||
|
lastErr = fmt.Errorf("no k3s executable candidates configured")
|
||||||
|
}
|
||||||
|
return "", lastErr
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
|
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
|
||||||
out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
|
out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -1156,7 +1198,6 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
|
|||||||
sem := make(chan struct{}, parallelism)
|
sem := make(chan struct{}, parallelism)
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
errCh := make(chan error, len(nodes))
|
errCh := make(chan error, len(nodes))
|
||||||
cmd := `sudo sh -lc 'id atlas >/dev/null 2>&1 || useradd -m -s /bin/bash atlas || true; install -d -m 0755 /etc/sudoers.d; printf "%s\n" "atlas ALL=(ALL) NOPASSWD: /usr/bin/systemctl, /usr/sbin/poweroff, /sbin/poweroff, /usr/local/bin/hecate" > /etc/sudoers.d/90-hecate-atlas; chmod 0440 /etc/sudoers.d/90-hecate-atlas; if command -v visudo >/dev/null 2>&1; then visudo -cf /etc/sudoers.d/90-hecate-atlas >/dev/null; fi'`
|
|
||||||
for _, node := range nodes {
|
for _, node := range nodes {
|
||||||
node := strings.TrimSpace(node)
|
node := strings.TrimSpace(node)
|
||||||
if node == "" || !o.sshManaged(node) {
|
if node == "" || !o.sshManaged(node) {
|
||||||
@ -1167,8 +1208,8 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
|
|||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
sem <- struct{}{}
|
sem <- struct{}{}
|
||||||
defer func() { <-sem }()
|
defer func() { <-sem }()
|
||||||
if _, err := o.ssh(ctx, node, cmd); err != nil {
|
if _, err := o.ssh(ctx, node, "sudo -n /usr/bin/systemctl --version"); err != nil {
|
||||||
errCh <- fmt.Errorf("%s: %w", node, err)
|
errCh <- fmt.Errorf("%s: missing sudo access to /usr/bin/systemctl (--version): %w", node, err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
@ -1184,7 +1225,7 @@ func (o *Orchestrator) reconcileNodeAccess(ctx context.Context, nodes []string)
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return fmt.Errorf("access reconcile had %d errors (first: %s)", len(errCh), strings.Join(samples, " | "))
|
return fmt.Errorf("access validation had %d errors (first: %s)", len(errCh), strings.Join(samples, " | "))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) fluxSourceReady(ctx context.Context) (bool, error) {
|
func (o *Orchestrator) fluxSourceReady(ctx context.Context) (bool, error) {
|
||||||
|
|||||||
@ -164,6 +164,40 @@ ensure_hecate_kubeconfig() {
|
|||||||
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
|
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Ensure the SSH identity file exists with the expected owner and permissions.
#
# The key path is read via `migration_yaml_lookup "ssh_identity_file"` and
# defaults to /home/atlas/.ssh/id_ed25519 when that lookup prints nothing.
# The owning user is inferred from the path: the first component below /home/
# for paths of the form /home/<user>/..., otherwise root. If that user does not
# exist, a warning is printed and nothing is changed.
#
# A missing or empty key file is generated as an ed25519 key with an empty
# passphrase. Ownership and modes (0600 private, 0644 public) are then applied
# on a best-effort basis.
ensure_hecate_ssh_identity() {
  local key_path key_dir key_user key_group key_comment
  key_path="$(migration_yaml_lookup "ssh_identity_file")"
  if [[ -z "${key_path}" ]]; then
    key_path="/home/atlas/.ssh/id_ed25519"
  fi
  key_dir="$(dirname "${key_path}")"
  key_comment="hecate-$(hostname)-forward"

  key_user="root"
  if [[ "${key_path}" == /home/*/* ]]; then
    key_user="${key_path#/home/}"
    key_user="${key_user%%/*}"
  fi

  if ! id "${key_user}" >/dev/null 2>&1; then
    echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
    return 0
  fi

  # Use the user's real primary group: a group named after the user is not
  # guaranteed to exist, and install/chown would fail on it.
  key_group="$(id -gn "${key_user}" 2>/dev/null || true)"
  if [[ -z "${key_group}" ]]; then
    key_group="${key_user}"
  fi

  install -d -m 0700 -o "${key_user}" -g "${key_group}" "${key_dir}"
  if [[ ! -s "${key_path}" ]]; then
    echo "[install] generating missing SSH identity at ${key_path}"
    if [[ "${key_user}" == "root" ]]; then
      ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
    else
      # Generate as the owning user so the files are created with their ownership.
      runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
    fi
  fi
  # Best-effort fix-ups: a pre-existing key may lack its .pub companion.
  chown "${key_user}:${key_group}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
  chmod 0600 "${key_path}" || true
  chmod 0644 "${key_path}.pub" || true
}
|
||||||
|
|
||||||
migrate_hecate_config() {
|
migrate_hecate_config() {
|
||||||
if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
|
if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
|
||||||
return 0
|
return 0
|
||||||
@ -562,6 +596,7 @@ else
|
|||||||
echo "[install] keeping existing config at ${CONF_DIR}/hecate.yaml"
|
echo "[install] keeping existing config at ${CONF_DIR}/hecate.yaml"
|
||||||
fi
|
fi
|
||||||
migrate_hecate_config
|
migrate_hecate_config
|
||||||
|
ensure_hecate_ssh_identity
|
||||||
ensure_hecate_kubeconfig
|
ensure_hecate_kubeconfig
|
||||||
|
|
||||||
echo "[install] installing systemd units"
|
echo "[install] installing systemd units"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user