startup: add off-site break-glass unseal fallback

This commit is contained in:
Brad Stein 2026-04-05 11:30:54 -03:00
parent d2526edf0e
commit 437a6b62cd
8 changed files with 75 additions and 22 deletions

View File

@ -31,6 +31,7 @@ Key startup guards:
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`). - Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
- Startup refreshes and can use a cached bootstrap manifest set under `/var/lib/hecate/bootstrap-cache` when local fallback paths fail. - Startup refreshes and can use a cached bootstrap manifest set under `/var/lib/hecate/bootstrap-cache` when local fallback paths fail.
- Vault unseal now falls back to a local cached key file (`startup.vault_unseal_key_file`) if `vault-init` cannot be read yet. - Vault unseal now falls back to a local cached key file (`startup.vault_unseal_key_file`) if `vault-init` cannot be read yet.
- Optional off-site break-glass retrieval can be configured with `startup.vault_unseal_breakglass_command` (for example, an SSH `cat` command to a remote key escrow host).
## Manual install on titan-db ## Manual install on titan-db
@ -93,6 +94,10 @@ Recommended:
See `configs/hecate.example.yaml`. See `configs/hecate.example.yaml`.
Break-glass unseal fallback knobs:
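- `startup.vault_unseal_breakglass_command`: optional shell command that prints the unseal key to stdout. It is run via `sh -lc`, surrounding whitespace is trimmed from its output, and a successfully retrieved key is cached locally on a best-effort basis.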
- `startup.vault_unseal_breakglass_timeout_seconds`: timeout for the command, in seconds (default `15`; zero or negative values fall back to the default).
UPS auto-shutdown trigger uses: UPS auto-shutdown trigger uses:
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget` - runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
- default safety factor `1.25` - default safety factor `1.25`

View File

@ -70,6 +70,8 @@ startup:
- https://scm.bstein.dev/user/login - https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login - https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15
shutdown: shutdown:
default_budget_seconds: 1380 default_budget_seconds: 1380
history_min_samples: 3 history_min_samples: 3

View File

@ -136,6 +136,8 @@ startup:
- https://scm.bstein.dev/user/login - https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login - https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
shutdown: shutdown:
default_budget_seconds: 1380 default_budget_seconds: 1380
history_min_samples: 3 history_min_samples: 3

View File

@ -136,6 +136,8 @@ startup:
- https://scm.bstein.dev/user/login - https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login - https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
shutdown: shutdown:
default_budget_seconds: 1380 default_budget_seconds: 1380
history_min_samples: 3 history_min_samples: 3

View File

@ -2086,7 +2086,33 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) {
o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile) o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile)
return fallbackKey, nil return fallbackKey, nil
} }
return "", fmt.Errorf("%v; fallback %v", err, fileErr) breakglassKey, breakglassErr := o.readVaultUnsealKeyBreakglass(ctx)
if breakglassErr == nil {
o.log.Printf("warning: using break-glass vault unseal key command fallback")
o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(breakglassKey) })
return breakglassKey, nil
}
return "", fmt.Errorf("%v; fallback %v; break-glass %v", err, fileErr, breakglassErr)
}
func (o *Orchestrator) readVaultUnsealKeyBreakglass(ctx context.Context) (string, error) {
cmd := strings.TrimSpace(o.cfg.Startup.VaultUnsealBreakglassCommand)
if cmd == "" {
return "", fmt.Errorf("break-glass command not configured")
}
timeout := time.Duration(o.cfg.Startup.VaultUnsealBreakglassTimeout) * time.Second
if timeout <= 0 {
timeout = 15 * time.Second
}
out, err := o.runSensitive(ctx, timeout, "sh", "-lc", cmd)
if err != nil {
return "", fmt.Errorf("run break-glass command: %w", err)
}
key := strings.TrimSpace(out)
if key == "" {
return "", fmt.Errorf("break-glass command returned empty output")
}
return key, nil
} }
func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error { func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error {

View File

@ -34,26 +34,28 @@ type Config struct {
} }
type Startup struct { type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"` APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"` APIPollSeconds int `yaml:"api_poll_seconds"`
RequireTimeSync bool `yaml:"require_time_sync"` RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"` TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"` TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"` RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"` RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"` PostStartProbes []string `yaml:"post_start_probes"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
} }
type Shutdown struct { type Shutdown struct {
@ -323,7 +325,8 @@ func defaults() Config {
"https://scm.bstein.dev/user/login", "https://scm.bstein.dev/user/login",
"https://metrics.bstein.dev/login", "https://metrics.bstein.dev/login",
}, },
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key", VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
}, },
Shutdown: Shutdown{ Shutdown: Shutdown{
DefaultBudgetSeconds: 1380, DefaultBudgetSeconds: 1380,
@ -435,6 +438,9 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key" c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
} }
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
c.Startup.VaultUnsealBreakglassTimeout = 15
}
if c.SSHPort <= 0 { if c.SSHPort <= 0 {
c.SSHPort = 2277 c.SSHPort = 2277
} }

View File

@ -98,6 +98,9 @@ state:
if cfg.Startup.VaultUnsealKeyFile == "" { if cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup vault unseal key file default to be set") t.Fatalf("expected startup vault unseal key file default to be set")
} }
if cfg.Startup.VaultUnsealBreakglassTimeout <= 0 {
t.Fatalf("expected startup break-glass timeout default > 0, got %d", cfg.Startup.VaultUnsealBreakglassTimeout)
}
} }
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) { func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {

View File

@ -268,6 +268,13 @@ migrate_hecate_config() {
changed=1 changed=1
fi fi
fi fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/hecate\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
local role local role
role="$(read_hecate_role)" role="$(read_hecate_role)"