hecate(startup): auto-unseal vault during layered recovery
This commit is contained in:
parent
6b88fb305f
commit
58a7947223
@ -2,9 +2,12 @@ package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
@ -118,7 +121,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
}
|
||||
|
||||
if !opts.SkipLocalBootstrap && needsLocalBootstrap {
|
||||
if ready, err := o.waitForFluxSourceReady(ctx, 2*time.Minute); err != nil {
|
||||
if ready, err := o.waitForFluxSourceReady(ctx, 5*time.Minute); err != nil {
|
||||
o.log.Printf("warning: flux source readiness wait failed before local bootstrap: %v", err)
|
||||
} else if ready {
|
||||
o.log.Printf("flux source became ready after targeted recovery; skipping local bootstrap")
|
||||
@ -483,6 +486,26 @@ func (o *Orchestrator) run(ctx context.Context, timeout time.Duration, name stri
|
||||
return o.runner.Run(runCtx, name, args...)
|
||||
}
|
||||
|
||||
func (o *Orchestrator) runSensitive(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(runCtx, name, args...)
|
||||
cmd.Env = os.Environ()
|
||||
if o.runner.Kubeconfig != "" {
|
||||
cmd.Env = append(cmd.Env, "KUBECONFIG="+o.runner.Kubeconfig)
|
||||
}
|
||||
out, err := cmd.CombinedOutput()
|
||||
trimmed := strings.TrimSpace(string(out))
|
||||
if err != nil {
|
||||
if trimmed == "" {
|
||||
return "", fmt.Errorf("%s failed: %w", name, err)
|
||||
}
|
||||
return trimmed, fmt.Errorf("%s failed: %w", name, err)
|
||||
}
|
||||
return trimmed, nil
|
||||
}
|
||||
|
||||
func lines(in string) []string {
|
||||
in = strings.TrimSpace(in)
|
||||
if in == "" {
|
||||
@ -558,6 +581,10 @@ func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWork
|
||||
}
|
||||
|
||||
func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) error {
|
||||
if w.Namespace == "vault" && w.Kind == "statefulset" && w.Name == "vault" {
|
||||
return o.waitVaultReady(ctx, w)
|
||||
}
|
||||
|
||||
timeout := "240s"
|
||||
if w.Kind == "statefulset" {
|
||||
timeout = "360s"
|
||||
@ -578,6 +605,158 @@ func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
}
|
||||
|
||||
deadline := time.Now().Add(7 * time.Minute)
|
||||
var lastErr error
|
||||
for time.Now().Before(deadline) {
|
||||
ready, err := o.workloadReady(ctx, w)
|
||||
if err == nil && ready {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
|
||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
||||
lastErr = err
|
||||
o.log.Printf("warning: vault readiness still blocked: %v", err)
|
||||
}
|
||||
|
||||
ready, err = o.workloadReady(ctx, w)
|
||||
if err == nil && ready {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(5 * time.Second):
|
||||
}
|
||||
}
|
||||
|
||||
if lastErr != nil {
|
||||
return fmt.Errorf("wait ready %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, lastErr)
|
||||
}
|
||||
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
|
||||
}
|
||||
|
||||
func (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
}
|
||||
|
||||
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault pod phase check failed: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(phase) != "Running" {
|
||||
return fmt.Errorf("vault-0 pod phase is %q", strings.TrimSpace(phase))
|
||||
}
|
||||
|
||||
sealed, err := o.vaultSealed(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !sealed {
|
||||
return nil
|
||||
}
|
||||
|
||||
unsealKey, err := o.vaultUnsealKey(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for attempt := 1; attempt <= 5; attempt++ {
|
||||
o.log.Printf("vault is sealed; attempting auto-unseal (%d/5)", attempt)
|
||||
if _, err := o.runSensitive(
|
||||
ctx,
|
||||
30*time.Second,
|
||||
"kubectl",
|
||||
"-n", "vault",
|
||||
"exec", "vault-0", "--",
|
||||
"vault", "operator", "unseal", unsealKey,
|
||||
); err != nil {
|
||||
return fmt.Errorf("vault unseal attempt %d failed: %w", attempt, err)
|
||||
}
|
||||
|
||||
sealed, err = o.vaultSealed(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !sealed {
|
||||
o.log.Printf("vault auto-unseal succeeded")
|
||||
return nil
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
|
||||
return fmt.Errorf("vault remained sealed after 5 auto-unseal attempts")
|
||||
}
|
||||
|
||||
func (o *Orchestrator) vaultSealed(ctx context.Context) (bool, error) {
|
||||
out, err := o.kubectl(
|
||||
ctx,
|
||||
25*time.Second,
|
||||
"-n", "vault",
|
||||
"exec", "vault-0", "--",
|
||||
"sh", "-lc",
|
||||
"VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json 2>/dev/null || true",
|
||||
)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("vault status check failed: %w", err)
|
||||
}
|
||||
sealed, err := parseVaultSealed(out)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("parse vault status: %w", err)
|
||||
}
|
||||
return sealed, nil
|
||||
}
|
||||
|
||||
func parseVaultSealed(raw string) (bool, error) {
|
||||
trimmed := strings.TrimSpace(raw)
|
||||
if trimmed == "" {
|
||||
return false, fmt.Errorf("empty vault status output")
|
||||
}
|
||||
|
||||
type vaultStatus struct {
|
||||
Sealed bool `json:"sealed"`
|
||||
}
|
||||
var st vaultStatus
|
||||
if err := json.Unmarshal([]byte(trimmed), &st); err != nil {
|
||||
return false, err
|
||||
}
|
||||
return st.Sealed, nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) {
|
||||
out, err := o.kubectl(
|
||||
ctx,
|
||||
15*time.Second,
|
||||
"-n", "vault",
|
||||
"get", "secret", "vault-init",
|
||||
"-o", "jsonpath={.data.unseal_key_b64}",
|
||||
)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read vault-init secret: %w", err)
|
||||
}
|
||||
decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("decode vault-init unseal_key_b64: %w", err)
|
||||
}
|
||||
key := strings.TrimSpace(string(decoded))
|
||||
if key == "" {
|
||||
return "", fmt.Errorf("vault-init unseal key is empty")
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) workloadReady(ctx context.Context, w startupWorkload) (bool, error) {
|
||||
out, err := o.kubectl(
|
||||
ctx,
|
||||
|
||||
27
internal/cluster/orchestrator_test.go
Normal file
27
internal/cluster/orchestrator_test.go
Normal file
@ -0,0 +1,27 @@
|
||||
package cluster
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseVaultSealed(t *testing.T) {
|
||||
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
|
||||
if err != nil {
|
||||
t.Fatalf("parse sealed=true: %v", err)
|
||||
}
|
||||
if !sealed {
|
||||
t.Fatalf("expected sealed=true")
|
||||
}
|
||||
|
||||
sealed, err = parseVaultSealed(`{"initialized":true,"sealed":false}`)
|
||||
if err != nil {
|
||||
t.Fatalf("parse sealed=false: %v", err)
|
||||
}
|
||||
if sealed {
|
||||
t.Fatalf("expected sealed=false")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseVaultSealedRejectsEmpty(t *testing.T) {
|
||||
if _, err := parseVaultSealed(" "); err == nil {
|
||||
t.Fatalf("expected parse error for empty status payload")
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user