From b5f27a79e07a478aa65f42bfbdd342ca999fbd0b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 4 Apr 2026 22:40:39 -0300 Subject: [PATCH] hecate: retry ssh with known_hosts repair on silent 255 --- cmd/hecate/main.go | 2 +- internal/cluster/orchestrator.go | 4 ++-- internal/service/daemon.go | 2 +- internal/sshutil/sshutil.go | 15 +++++++++++++++ internal/sshutil/sshutil_test.go | 6 ++++++ 5 files changed, 25 insertions(+), 4 deletions(-) diff --git a/cmd/hecate/main.go b/cmd/hecate/main.go index 4ca586a..4455413 100644 --- a/cmd/hecate/main.go +++ b/cmd/hecate/main.go @@ -553,7 +553,7 @@ func runSSHWithRecovery(ctx context.Context, logger *log.Logger, cfg config.Conf if err == nil { return out, nil } - if !sshutil.IsHostKeyError(out, err) { + if !sshutil.ShouldAttemptKnownHostsRepair(out, err) { return out, err } diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 76cb422..3de87f8 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -1296,8 +1296,8 @@ func (o *Orchestrator) ssh(ctx context.Context, node string, command string) (st } return out, nil } - if sshutil.IsHostKeyError(out, err) { - o.log.Printf("warning: ssh host-key mismatch detected for %s via %s path; repairing known_hosts and retrying once", node, attemptNames[i]) + if sshutil.ShouldAttemptKnownHostsRepair(out, err) { + o.log.Printf("warning: ssh failure on %s via %s path may be host-key related; repairing known_hosts and retrying once", node, attemptNames[i]) sshutil.RepairKnownHosts(ctx, o.log, knownHostsFiles, repairHosts, o.cfg.SSHPort) retryOut, retryErr := o.run(ctx, 45*time.Second, "ssh", args...) 
if retryErr == nil { diff --git a/internal/service/daemon.go b/internal/service/daemon.go index 9e7ce8d..0d88034 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -256,7 +256,7 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error { } out, err := try() - if err != nil && sshutil.IsHostKeyError(out, err) { + if err != nil && sshutil.ShouldAttemptKnownHostsRepair(out, err) { repairHosts := []string{d.cfg.Coordination.ForwardShutdownHost, host} if d.cfg.SSHJumpHost != "" { repairHosts = append(repairHosts, d.cfg.SSHJumpHost) diff --git a/internal/sshutil/sshutil.go b/internal/sshutil/sshutil.go index 9347a72..d97eb19 100644 --- a/internal/sshutil/sshutil.go +++ b/internal/sshutil/sshutil.go @@ -34,6 +34,21 @@ func IsHostKeyError(output string, err error) bool { return false } +func ShouldAttemptKnownHostsRepair(output string, err error) bool { + if IsHostKeyError(output, err) { + return true + } + if err == nil { + return false + } + // Heuristic: some SSH invocations (especially under strict non-interactive configs) exit 255 with + // no output, hiding the host-key mismatch text. NOTE(review): any other silent exit 255 matches too. 
+ if strings.Contains(strings.ToLower(err.Error()), "exit status 255") && strings.TrimSpace(output) == "" { + return true + } + return false +} + func KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string { seen := map[string]struct{}{} add := func(path string) { diff --git a/internal/sshutil/sshutil_test.go b/internal/sshutil/sshutil_test.go index 68e9078..cb852c1 100644 --- a/internal/sshutil/sshutil_test.go +++ b/internal/sshutil/sshutil_test.go @@ -20,6 +20,12 @@ func TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T) { } } +func TestShouldAttemptKnownHostsRepairOnSilent255(t *testing.T) { + if !ShouldAttemptKnownHostsRepair("", errors.New("ssh ...: exit status 255")) { + t.Fatalf("expected silent exit status 255 to trigger known_hosts repair") + } +} + func TestKnownHostsFilesIncludesDerivedPaths(t *testing.T) { configFile := "/home/atlas/.ssh/config" identityFile := "/home/tethys/.ssh/id_ed25519"