diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 825c2e5..6787912 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -2,9 +2,12 @@ package cluster import ( "context" + "encoding/base64" + "encoding/json" "fmt" "log" "os" + "os/exec" "path/filepath" "sort" "strconv" @@ -118,7 +121,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er } if !opts.SkipLocalBootstrap && needsLocalBootstrap { - if ready, err := o.waitForFluxSourceReady(ctx, 2*time.Minute); err != nil { + if ready, err := o.waitForFluxSourceReady(ctx, 5*time.Minute); err != nil { o.log.Printf("warning: flux source readiness wait failed before local bootstrap: %v", err) } else if ready { o.log.Printf("flux source became ready after targeted recovery; skipping local bootstrap") @@ -483,6 +486,26 @@ func (o *Orchestrator) run(ctx context.Context, timeout time.Duration, name stri return o.runner.Run(runCtx, name, args...) } +func (o *Orchestrator) runSensitive(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + runCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + cmd := exec.CommandContext(runCtx, name, args...) + cmd.Env = os.Environ() + if o.runner.Kubeconfig != "" { + cmd.Env = append(cmd.Env, "KUBECONFIG="+o.runner.Kubeconfig) + } + out, err := cmd.CombinedOutput() + trimmed := strings.TrimSpace(string(out)) + if err != nil { + if trimmed == "" { + return "", fmt.Errorf("%s failed: %w", name, err) + } + return trimmed, fmt.Errorf("%s failed: %w", name, err) + } + return trimmed, nil +} + func lines(in string) []string { in = strings.TrimSpace(in) if in == "" { @@ -558,6 +581,10 @@ func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWork } func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) error { + if w.Namespace == "vault" && w.Kind == "statefulset" && w.Name == "vault" { + return o.waitVaultReady(ctx, w) + } + timeout := "240s" if w.Kind == "statefulset" { timeout = "360s" @@ -578,6 +605,158 @@ func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) return nil } +func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) error { + if o.runner.DryRun { + return nil + } + + deadline := time.Now().Add(7 * time.Minute) + var lastErr error + for time.Now().Before(deadline) { + ready, err := o.workloadReady(ctx, w) + if err == nil && ready { + return nil + } + if err != nil { + lastErr = err + } + + if err := o.ensureVaultUnsealed(ctx); err != nil { + lastErr = err + o.log.Printf("warning: vault readiness still blocked: %v", err) + } + + ready, err = o.workloadReady(ctx, w) + if err == nil && ready { + return nil + } + if err != nil { + lastErr = err + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(5 * time.Second): + } + } + + if lastErr != nil { + return fmt.Errorf("wait ready %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, lastErr) + } + return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name) +} + +func (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error { + if o.runner.DryRun { + return nil + } + + phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}") + if err != nil { + return fmt.Errorf("vault pod phase check failed: %w", err) + } + if strings.TrimSpace(phase) != "Running" { + return fmt.Errorf("vault-0 pod phase is %q", strings.TrimSpace(phase)) + } + + sealed, err := o.vaultSealed(ctx) + if err != nil { + return err + } + if !sealed { + return nil + } + + unsealKey, err := o.vaultUnsealKey(ctx) + if err != nil { + return err + } + + for attempt := 1; attempt <= 5; attempt++ { + o.log.Printf("vault is sealed; attempting auto-unseal (%d/5)", attempt) + if _, err := o.runSensitive( + ctx, + 30*time.Second, + "kubectl", + "-n", "vault", + "exec", "vault-0", "--", + "vault", "operator", "unseal", unsealKey, + ); err != nil { + return fmt.Errorf("vault unseal attempt %d failed: %w", attempt, err) + } + + sealed, err = o.vaultSealed(ctx) + if err != nil { + return err + } + if !sealed { + o.log.Printf("vault auto-unseal succeeded") + return nil + } + time.Sleep(2 * time.Second) + } + + return fmt.Errorf("vault remained sealed after 5 auto-unseal attempts") +} + +func (o *Orchestrator) vaultSealed(ctx context.Context) (bool, error) { + out, err := o.kubectl( + ctx, + 25*time.Second, + "-n", "vault", + "exec", "vault-0", "--", + "sh", "-lc", + "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json 2>/dev/null || true", + ) + if err != nil { + return false, fmt.Errorf("vault status check failed: %w", err) + } + sealed, err := parseVaultSealed(out) + if err != nil { + return false, fmt.Errorf("parse vault status: %w", err) + } + return sealed, nil +} + +func parseVaultSealed(raw string) (bool, error) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return false, fmt.Errorf("empty vault status output") + } + + type vaultStatus struct { + Sealed bool `json:"sealed"` + } + var st vaultStatus + if err := json.Unmarshal([]byte(trimmed), &st); err != nil { + return false, err + } + return st.Sealed, nil +} + +func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) { + out, err := o.kubectl( + ctx, + 15*time.Second, + "-n", "vault", + "get", "secret", "vault-init", + "-o", "jsonpath={.data.unseal_key_b64}", + ) + if err != nil { + return "", fmt.Errorf("read vault-init secret: %w", err) + } + decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out)) + if err != nil { + return "", fmt.Errorf("decode vault-init unseal_key_b64: %w", err) + } + key := strings.TrimSpace(string(decoded)) + if key == "" { + return "", fmt.Errorf("vault-init unseal key is empty") + } + return key, nil +} + func (o *Orchestrator) workloadReady(ctx context.Context, w startupWorkload) (bool, error) { out, err := o.kubectl( ctx, diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go new file mode 100644 index 0000000..1dc1d35 --- /dev/null +++ b/internal/cluster/orchestrator_test.go @@ -0,0 +1,27 @@ +package cluster + +import "testing" + +func TestParseVaultSealed(t *testing.T) { + sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`) + if err != nil { + t.Fatalf("parse sealed=true: %v", err) + } + if !sealed { + t.Fatalf("expected sealed=true") + } + + sealed, err = parseVaultSealed(`{"initialized":true,"sealed":false}`) + if err != nil { + t.Fatalf("parse sealed=false: %v", err) + } + if sealed { + t.Fatalf("expected sealed=false") + } +} + +func TestParseVaultSealedRejectsEmpty(t *testing.T) { + if _, err := parseVaultSealed(" "); err == nil { + t.Fatalf("expected parse error for empty status payload") + } +}