From 437a6b62cd88f8bbaa519da3cac1589879db01b9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 5 Apr 2026 11:30:54 -0300 Subject: [PATCH] startup: add off-site break-glass unseal fallback --- README.md | 5 ++++ configs/hecate.example.yaml | 2 ++ configs/hecate.tethys.yaml | 2 ++ configs/hecate.titan-db.yaml | 2 ++ internal/cluster/orchestrator.go | 28 ++++++++++++++++++- internal/config/config.go | 48 ++++++++++++++++++-------------- internal/config/config_test.go | 3 ++ scripts/install.sh | 7 +++++ 8 files changed, 75 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 5a274ce..df1cc20 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Key startup guards: - Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`). - Startup refreshes and can use a cached bootstrap manifest set under `/var/lib/hecate/bootstrap-cache` when local fallback paths fail. - Vault unseal now falls back to a local cached key file (`startup.vault_unseal_key_file`) if `vault-init` cannot be read yet. +- Optional off-site break-glass retrieval can be configured with `startup.vault_unseal_breakglass_command` (for example, an SSH `cat` command to a remote key escrow host). ## Manual install on titan-db @@ -93,6 +94,10 @@ Recommended: See `configs/hecate.example.yaml`. +Break-glass unseal fallback knobs: +- `startup.vault_unseal_breakglass_command`: optional shell command that prints the unseal key to stdout. +- `startup.vault_unseal_breakglass_timeout_seconds`: timeout for the command (default `15`). + UPS auto-shutdown trigger uses: - runtime threshold = `runtime_safety_factor * estimated_shutdown_budget` - default safety factor `1.25` diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index b7c7275..7a8cd5b 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -70,6 +70,8 @@ startup: - https://scm.bstein.dev/user/login - https://metrics.bstein.dev/login vault_unseal_key_file: /var/lib/hecate/vault-unseal.key + vault_unseal_breakglass_command: "" + vault_unseal_breakglass_timeout_seconds: 15 shutdown: default_budget_seconds: 1380 history_min_samples: 3 diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml index 28ec32a..fda4ad2 100644 --- a/configs/hecate.tethys.yaml +++ b/configs/hecate.tethys.yaml @@ -136,6 +136,8 @@ startup: - https://scm.bstein.dev/user/login - https://metrics.bstein.dev/login vault_unseal_key_file: /var/lib/hecate/vault-unseal.key + vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'" + vault_unseal_breakglass_timeout_seconds: 15 shutdown: default_budget_seconds: 1380 history_min_samples: 3 diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml index 88c205c..ff61fca 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/hecate.titan-db.yaml @@ -136,6 +136,8 @@ startup: - https://scm.bstein.dev/user/login - https://metrics.bstein.dev/login vault_unseal_key_file: /var/lib/hecate/vault-unseal.key + vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'" + vault_unseal_breakglass_timeout_seconds: 15 shutdown: default_budget_seconds: 1380 history_min_samples: 3 diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index db56089..a2bbd91 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -2086,7 +2086,33 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) { o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile) return fallbackKey, nil } - return "", fmt.Errorf("%v; fallback %v", err, fileErr) + breakglassKey, breakglassErr := o.readVaultUnsealKeyBreakglass(ctx) + if breakglassErr == nil { + o.log.Printf("warning: using break-glass vault unseal key command fallback") + o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(breakglassKey) }) + return breakglassKey, nil + } + return "", fmt.Errorf("%v; fallback %v; break-glass %v", err, fileErr, breakglassErr) +} + +func (o *Orchestrator) readVaultUnsealKeyBreakglass(ctx context.Context) (string, error) { + cmd := strings.TrimSpace(o.cfg.Startup.VaultUnsealBreakglassCommand) + if cmd == "" { + return "", fmt.Errorf("break-glass command not configured") + } + timeout := time.Duration(o.cfg.Startup.VaultUnsealBreakglassTimeout) * time.Second + if timeout <= 0 { + timeout = 15 * time.Second + } + out, err := o.runSensitive(ctx, timeout, "sh", "-lc", cmd) + if err != nil { + return "", fmt.Errorf("run break-glass command: %w", err) + } + key := strings.TrimSpace(out) + if key == "" { + return "", fmt.Errorf("break-glass command returned empty output") + } + return key, nil } func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error { diff --git a/internal/config/config.go b/internal/config/config.go index 634c566..f99cc8a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -34,26 +34,28 @@ type Config struct { } type Startup struct { - APIWaitSeconds int `yaml:"api_wait_seconds"` - APIPollSeconds int `yaml:"api_poll_seconds"` - RequireTimeSync bool `yaml:"require_time_sync"` - TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` - TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` - TimeSyncMode string `yaml:"time_sync_mode"` - TimeSyncQuorum int `yaml:"time_sync_quorum"` - ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` - AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` - EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` - RequireStorageReady bool `yaml:"require_storage_ready"` - StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` - StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` - StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` - StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` - RequirePostStartProbes bool `yaml:"require_post_start_probes"` - PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` - PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` - PostStartProbes []string `yaml:"post_start_probes"` - VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` + APIWaitSeconds int `yaml:"api_wait_seconds"` + APIPollSeconds int `yaml:"api_poll_seconds"` + RequireTimeSync bool `yaml:"require_time_sync"` + TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` + TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` + TimeSyncMode string `yaml:"time_sync_mode"` + TimeSyncQuorum int `yaml:"time_sync_quorum"` + ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` + AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` + EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + RequireStorageReady bool `yaml:"require_storage_ready"` + StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` + StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` + StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` + StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` + RequirePostStartProbes bool `yaml:"require_post_start_probes"` + PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` + PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` + PostStartProbes []string `yaml:"post_start_probes"` + VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` + VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` + VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` } type Shutdown struct { @@ -323,7 +325,8 @@ func defaults() Config { "https://scm.bstein.dev/user/login", "https://metrics.bstein.dev/login", }, - VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key", + VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key", + VaultUnsealBreakglassTimeout: 15, }, Shutdown: Shutdown{ DefaultBudgetSeconds: 1380, @@ -435,6 +438,9 @@ func (c *Config) applyDefaults() { if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key" } + if c.Startup.VaultUnsealBreakglassTimeout <= 0 { + c.Startup.VaultUnsealBreakglassTimeout = 15 + } if c.SSHPort <= 0 { c.SSHPort = 2277 } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index c6339ca..383e034 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -98,6 +98,9 @@ state: if cfg.Startup.VaultUnsealKeyFile == "" { t.Fatalf("expected startup vault unseal key file default to be set") } + if cfg.Startup.VaultUnsealBreakglassTimeout <= 0 { + t.Fatalf("expected startup break-glass timeout default > 0, got %d", cfg.Startup.VaultUnsealBreakglassTimeout) + } } func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) { diff --git a/scripts/install.sh b/scripts/install.sh index 572f6e4..111b3f5 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -268,6 +268,13 @@ migrate_hecate_config() { changed=1 fi fi + if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then + if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then + sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/hecate\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/hecate.yaml" + echo "[install] added startup break-glass fallback defaults" + changed=1 + fi + fi local role role="$(read_hecate_role)"