From 7ce729d81080a320d96b5ee14212b379563ff419 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 4 Apr 2026 12:56:58 -0300 Subject: [PATCH] hecate(ssh): add config/key fallback and scoped node orchestration --- README.md | 2 +- configs/hecate.example.yaml | 1 + configs/hecate.tethys.yaml | 3 ++- configs/hecate.titan-db.yaml | 3 ++- internal/cluster/orchestrator.go | 41 ++++++++++++++++++++++++++++++-- internal/config/config.go | 1 + internal/service/daemon.go | 40 +++++++++++++++++++++++++++++-- 7 files changed, 84 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8e69350..cf5ae61 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ sudo systemctl start hecate-bootstrap.service Optional SSH jump/bastion: - Set `ssh_jump_host` (and optional `ssh_jump_user`) to route node SSH through a jump host like `titan-jh`; Hecate now falls back to direct SSH automatically if jump routing is unavailable. -- Set `ssh_port`, `ssh_identity_file`, and `ssh_node_hosts` so root-run systemd actions can actually reach node SSH daemons during cold-start recovery. +- Set `ssh_port`, `ssh_config_file`, `ssh_identity_file`, and `ssh_node_hosts` so root-run systemd actions can actually reach node SSH daemons during cold-start recovery. - Use `ssh_node_users` for per-node username overrides (for example `titan-24: tethys`). - Use `ssh_managed_nodes` to limit host-level SSH start/stop actions to nodes Hecate can actually authenticate to. diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index 391c61c..df9d904 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -2,6 +2,7 @@ kubeconfig: /etc/hecate/kubeconfig ssh_user: atlas ssh_port: 2277 +ssh_config_file: "" ssh_identity_file: /home/atlas/.ssh/id_ed25519 ssh_node_hosts: {} ssh_node_users: {} diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml index c4ad44e..c1c8a74 100644 --- a/configs/hecate.tethys.yaml +++ b/configs/hecate.tethys.yaml @@ -2,9 +2,10 @@ kubeconfig: /etc/hecate/kubeconfig ssh_user: atlas ssh_port: 2277 +ssh_config_file: /home/tethys/.ssh/config ssh_identity_file: /home/tethys/.ssh/id_ed25519 ssh_node_hosts: - titan-db: 192.168.22.7 + titan-db: 192.168.22.10 titan-0a: 192.168.22.11 titan-0b: 192.168.22.12 titan-0c: 192.168.22.13 diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml index c9050f6..5f472c4 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/hecate.titan-db.yaml @@ -2,9 +2,10 @@ kubeconfig: /etc/hecate/kubeconfig ssh_user: atlas ssh_port: 2277 +ssh_config_file: /home/atlas/.ssh/config ssh_identity_file: /home/atlas/.ssh/id_ed25519 ssh_node_hosts: - titan-db: 192.168.22.7 + titan-db: 192.168.22.10 titan-0a: 192.168.22.11 titan-0b: 192.168.22.12 titan-0c: 192.168.22.13 diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 74effd1..4e9a9fb 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -572,13 +572,18 @@ func (o *Orchestrator) ssh(ctx context.Context, node string, command string) (st if sshUser != "" { target = sshUser + "@" + host } + sshConfigFile := o.resolveSSHConfigFile() + sshIdentity := o.resolveSSHIdentityFile() baseArgs := []string{ "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", "-o", "StrictHostKeyChecking=accept-new", } - if o.cfg.SSHIdentityFile != "" { - baseArgs = append(baseArgs, "-i", o.cfg.SSHIdentityFile) + if sshConfigFile != "" { + baseArgs = append(baseArgs, "-F", sshConfigFile) + } + if sshIdentity != "" { + baseArgs = append(baseArgs, "-i", sshIdentity) } if o.cfg.SSHPort > 0 { baseArgs = append(baseArgs, "-p", strconv.Itoa(o.cfg.SSHPort)) @@ -676,6 +681,38 @@ func (o *Orchestrator) sshManaged(node string) bool { return false } +func (o *Orchestrator) resolveSSHConfigFile() string { + if strings.TrimSpace(o.cfg.SSHConfigFile) != "" { + return strings.TrimSpace(o.cfg.SSHConfigFile) + } + candidates := []string{ + "/home/atlas/.ssh/config", + "/home/tethys/.ssh/config", + } + for _, p := range candidates { + if stat, err := os.Stat(p); err == nil && !stat.IsDir() { + return p + } + } + return "" +} + +func (o *Orchestrator) resolveSSHIdentityFile() string { + if strings.TrimSpace(o.cfg.SSHIdentityFile) != "" { + return strings.TrimSpace(o.cfg.SSHIdentityFile) + } + candidates := []string{ + "/home/atlas/.ssh/id_ed25519", + "/home/tethys/.ssh/id_ed25519", + } + for _, p := range candidates { + if stat, err := os.Stat(p); err == nil && !stat.IsDir() { + return p + } + } + return "" +} + func (o *Orchestrator) bestEffort(name string, fn func() error) { if err := fn(); err != nil { o.log.Printf("warning: %s: %v", name, err) diff --git a/internal/config/config.go b/internal/config/config.go index 09d0d65..f1fac5a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,7 @@ type Config struct { Kubeconfig string `yaml:"kubeconfig"` SSHUser string `yaml:"ssh_user"` SSHPort int `yaml:"ssh_port"` + SSHConfigFile string `yaml:"ssh_config_file"` SSHIdentityFile string `yaml:"ssh_identity_file"` SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"` SSHNodeUsers map[string]string `yaml:"ssh_node_users"` diff --git a/internal/service/daemon.go b/internal/service/daemon.go index 5f836fa..1f98a41 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -6,6 +6,7 @@ import ( "log" "math" "net/http" + "os" "os/exec" "strconv" "strings" @@ -219,8 +220,11 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error { "-o", "ConnectTimeout=8", "-o", "StrictHostKeyChecking=accept-new", } - if d.cfg.SSHIdentityFile != "" { - args = append(args, "-i", d.cfg.SSHIdentityFile) + if cfgFile := d.resolveSSHConfigFile(); cfgFile != "" { + args = append(args, "-F", cfgFile) + } + if idFile := d.resolveSSHIdentityFile(); idFile != "" { + args = append(args, "-i", idFile) } if d.cfg.SSHPort > 0 { args = append(args, "-p", strconv.Itoa(d.cfg.SSHPort)) @@ -248,6 +252,38 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error { return nil } +func (d *Daemon) resolveSSHConfigFile() string { + if strings.TrimSpace(d.cfg.SSHConfigFile) != "" { + return strings.TrimSpace(d.cfg.SSHConfigFile) + } + candidates := []string{ + "/home/atlas/.ssh/config", + "/home/tethys/.ssh/config", + } + for _, p := range candidates { + if stat, err := os.Stat(p); err == nil && !stat.IsDir() { + return p + } + } + return "" +} + +func (d *Daemon) resolveSSHIdentityFile() string { + if strings.TrimSpace(d.cfg.SSHIdentityFile) != "" { + return strings.TrimSpace(d.cfg.SSHIdentityFile) + } + candidates := []string{ + "/home/atlas/.ssh/id_ed25519", + "/home/tethys/.ssh/id_ed25519", + } + for _, p := range candidates { + if stat, err := os.Stat(p); err == nil && !stat.IsDir() { + return p + } + } + return "" +} + func (d *Daemon) targetList() string { names := make([]string, 0, len(d.targets)) for _, t := range d.targets {