startup: add off-site break-glass unseal fallback

This commit is contained in:
Brad Stein 2026-04-05 11:30:54 -03:00
parent d2526edf0e
commit 437a6b62cd
8 changed files with 75 additions and 22 deletions

View File

@ -31,6 +31,7 @@ Key startup guards:
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
- Startup refreshes and can use a cached bootstrap manifest set under `/var/lib/hecate/bootstrap-cache` when local fallback paths fail.
- Vault unseal now falls back to a local cached key file (`startup.vault_unseal_key_file`) if `vault-init` cannot be read yet.
- Optional off-site break-glass retrieval can be configured with `startup.vault_unseal_breakglass_command` (for example, an SSH `cat` command to a remote key escrow host).
## Manual install on titan-db
@ -93,6 +94,10 @@ Recommended:
See `configs/hecate.example.yaml`.
Break-glass unseal fallback knobs:
- `startup.vault_unseal_breakglass_command`: optional shell command (run via `sh -lc`) that prints the unseal key to stdout; surrounding whitespace is trimmed, and empty output is treated as a failure.
- `startup.vault_unseal_breakglass_timeout_seconds`: timeout for the command, in seconds (default `15`; values `<= 0` fall back to the default).
UPS auto-shutdown trigger uses:
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
- default safety factor `1.25`

View File

@ -70,6 +70,8 @@ startup:
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15
shutdown:
default_budget_seconds: 1380
history_min_samples: 3

View File

@ -136,6 +136,8 @@ startup:
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
shutdown:
default_budget_seconds: 1380
history_min_samples: 3

View File

@ -136,6 +136,8 @@ startup:
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
shutdown:
default_budget_seconds: 1380
history_min_samples: 3

View File

@ -2086,7 +2086,33 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) {
o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile)
return fallbackKey, nil
}
return "", fmt.Errorf("%v; fallback %v", err, fileErr)
breakglassKey, breakglassErr := o.readVaultUnsealKeyBreakglass(ctx)
if breakglassErr == nil {
o.log.Printf("warning: using break-glass vault unseal key command fallback")
o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(breakglassKey) })
return breakglassKey, nil
}
return "", fmt.Errorf("%v; fallback %v; break-glass %v", err, fileErr, breakglassErr)
}
// readVaultUnsealKeyBreakglass obtains the vault unseal key by running the
// operator-configured break-glass command
// (startup.vault_unseal_breakglass_command).
//
// The command string is handed to `sh -lc` through o.runSensitive together
// with a timeout derived from the configured number of seconds; when that
// duration is not positive, 15 seconds is used instead. The returned key is
// the command output with surrounding whitespace removed.
//
// An error is returned when no command is configured, when running the
// command fails (the underlying error is wrapped), or when the output is
// blank after trimming.
func (o *Orchestrator) readVaultUnsealKeyBreakglass(ctx context.Context) (string, error) {
	script := strings.TrimSpace(o.cfg.Startup.VaultUnsealBreakglassCommand)
	if len(script) == 0 {
		return "", fmt.Errorf("break-glass command not configured")
	}

	// Start from the built-in default and override it only with a usable value.
	limit := 15 * time.Second
	if configured := time.Duration(o.cfg.Startup.VaultUnsealBreakglassTimeout) * time.Second; configured > 0 {
		limit = configured
	}

	raw, runErr := o.runSensitive(ctx, limit, "sh", "-lc", script)
	if runErr != nil {
		return "", fmt.Errorf("run break-glass command: %w", runErr)
	}

	if trimmed := strings.TrimSpace(raw); trimmed != "" {
		return trimmed, nil
	}
	return "", fmt.Errorf("break-glass command returned empty output")
}
func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error {

View File

@ -54,6 +54,8 @@ type Startup struct {
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
type Shutdown struct {
@ -324,6 +326,7 @@ func defaults() Config {
"https://metrics.bstein.dev/login",
},
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,
@ -435,6 +438,9 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
}
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
c.Startup.VaultUnsealBreakglassTimeout = 15
}
if c.SSHPort <= 0 {
c.SSHPort = 2277
}

View File

@ -98,6 +98,9 @@ state:
if cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup vault unseal key file default to be set")
}
if cfg.Startup.VaultUnsealBreakglassTimeout <= 0 {
t.Fatalf("expected startup break-glass timeout default > 0, got %d", cfg.Startup.VaultUnsealBreakglassTimeout)
}
}
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {

View File

@ -268,6 +268,13 @@ migrate_hecate_config() {
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/hecate\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
local role
role="$(read_hecate_role)"