diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index 2a5dba9..b7c7275 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -48,9 +48,28 @@ startup: require_time_sync: true time_sync_wait_seconds: 240 time_sync_poll_seconds: 5 + time_sync_mode: quorum + time_sync_quorum: 2 reconcile_access_on_boot: true auto_etcd_restore_on_api_failure: true etcd_restore_control_plane: titan-0a + require_storage_ready: true + storage_ready_wait_seconds: 420 + storage_ready_poll_seconds: 5 + storage_min_ready_nodes: 2 + storage_critical_pvcs: + - vault/data-vault-0 + - postgres/postgres-data-postgres-0 + - gitea/gitea-data + - sso/keycloak-data + require_post_start_probes: true + post_start_probe_wait_seconds: 240 + post_start_probe_poll_seconds: 5 + post_start_probes: + - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration + - https://scm.bstein.dev/user/login + - https://metrics.bstein.dev/login + vault_unseal_key_file: /var/lib/hecate/vault-unseal.key shutdown: default_budget_seconds: 1380 history_min_samples: 3 diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml index 766c822..28ec32a 100644 --- a/configs/hecate.tethys.yaml +++ b/configs/hecate.tethys.yaml @@ -114,9 +114,28 @@ startup: require_time_sync: true time_sync_wait_seconds: 240 time_sync_poll_seconds: 5 + time_sync_mode: quorum + time_sync_quorum: 2 reconcile_access_on_boot: true auto_etcd_restore_on_api_failure: true etcd_restore_control_plane: titan-0a + require_storage_ready: true + storage_ready_wait_seconds: 420 + storage_ready_poll_seconds: 5 + storage_min_ready_nodes: 2 + storage_critical_pvcs: + - vault/data-vault-0 + - postgres/postgres-data-postgres-0 + - gitea/gitea-data + - sso/keycloak-data + require_post_start_probes: true + post_start_probe_wait_seconds: 240 + post_start_probe_poll_seconds: 5 + post_start_probes: + - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration + - https://scm.bstein.dev/user/login + - https://metrics.bstein.dev/login + vault_unseal_key_file: /var/lib/hecate/vault-unseal.key shutdown: default_budget_seconds: 1380 history_min_samples: 3 diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml index 28dda02..88c205c 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/hecate.titan-db.yaml @@ -114,9 +114,28 @@ startup: require_time_sync: true time_sync_wait_seconds: 240 time_sync_poll_seconds: 5 + time_sync_mode: quorum + time_sync_quorum: 2 reconcile_access_on_boot: true auto_etcd_restore_on_api_failure: true etcd_restore_control_plane: titan-0a + require_storage_ready: true + storage_ready_wait_seconds: 420 + storage_ready_poll_seconds: 5 + storage_min_ready_nodes: 2 + storage_critical_pvcs: + - vault/data-vault-0 + - postgres/postgres-data-postgres-0 + - gitea/gitea-data + - sso/keycloak-data + require_post_start_probes: true + post_start_probe_wait_seconds: 240 + post_start_probe_poll_seconds: 5 + post_start_probes: + - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration + - https://scm.bstein.dev/user/login + - https://metrics.bstein.dev/login + vault_unseal_key_file: /var/lib/hecate/vault-unseal.key shutdown: default_budget_seconds: 1380 history_min_samples: 3 diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 0c3d134..a64a0eb 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -141,6 +141,8 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er if err := o.preflightExternalDatastore(ctx); err != nil { return err } + o.bestEffort("sync local titan-iac checkout", func() error { return o.syncLocalIACRepo(ctx) }) + o.bestEffort("refresh bootstrap cache from local repo", func() error { return o.refreshBootstrapCache(ctx) }) if o.cfg.Startup.ReconcileAccessOnBoot { o.bestEffort("reconcile control-plane access", func() error { return o.reconcileNodeAccess(ctx, o.cfg.ControlPlanes) }) } @@ -217,6 +219,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er if len(missing) > 0 { o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", ")) } + if o.cfg.Startup.RequireStorageReady { + if err := o.waitForStorageReady(ctx); err != nil { + return err + } + } if err := o.ensureCriticalStartupWorkloads(ctx); err != nil { return err } @@ -252,6 +259,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er if err := o.resumeFluxAndReconcile(ctx); err != nil { return err } + if o.cfg.Startup.RequirePostStartProbes { + if err := o.waitForPostStartProbes(ctx); err != nil { + return err + } + } o.log.Printf("startup flow complete") return nil } @@ -923,11 +935,40 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro if poll <= 0 { poll = 5 * time.Second } + + mode := strings.ToLower(strings.TrimSpace(o.cfg.Startup.TimeSyncMode)) + if mode == "" { + mode = "strict" + } + managedControlPlanes := 0 + for _, node := range nodes { + node = strings.TrimSpace(node) + if node == "" { + continue + } + if o.sshManaged(node) { + managedControlPlanes++ + } + } + requiredQuorum := o.cfg.Startup.TimeSyncQuorum + if requiredQuorum <= 0 { + requiredQuorum = managedControlPlanes + if requiredQuorum <= 0 { + requiredQuorum = 1 + } + } + if requiredQuorum > managedControlPlanes && managedControlPlanes > 0 { + requiredQuorum = managedControlPlanes + } + deadline := time.Now().Add(wait) for { unsynced := []string{} + syncedControlPlanes := 0 + checkedControlPlanes := 0 localOut, localErr := o.run(ctx, 10*time.Second, "sh", "-lc", "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown") - if localErr != nil || !isTimeSynced(localOut) { + localSynced := localErr == nil && isTimeSynced(localOut) + if !localSynced { if localErr != nil { unsynced = append(unsynced, fmt.Sprintf("local(%v)", localErr)) } else { @@ -942,6 +983,7 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro if !o.sshManaged(node) { continue } + checkedControlPlanes++ out, err := o.ssh(ctx, node, "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown") if err != nil || !isTimeSynced(out) { if err != nil { @@ -949,12 +991,38 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro } else { unsynced = append(unsynced, fmt.Sprintf("%s(%s)", node, strings.TrimSpace(out))) } + } else { + syncedControlPlanes++ } } - if len(unsynced) == 0 { + + ready := false + switch mode { + case "quorum": + if localSynced && syncedControlPlanes >= requiredQuorum { + ready = true + } + default: + if localSynced && len(unsynced) == 0 { + ready = true + } + } + + if ready { return nil } if time.Now().After(deadline) { + if mode == "quorum" { + return fmt.Errorf( + "startup blocked: time sync quorum not ready within %s (mode=quorum local_synced=%t synced_control_planes=%d required=%d checked=%d details=%s)", + wait, + localSynced, + syncedControlPlanes, + requiredQuorum, + checkedControlPlanes, + strings.Join(unsynced, ", "), + ) + } return fmt.Errorf("startup blocked: time sync not ready within %s (%s)", wait, strings.Join(unsynced, ", ")) } select { @@ -1183,23 +1251,30 @@ func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) erro func (o *Orchestrator) bootstrapLocal(ctx context.Context) error { failures := 0 + successes := 0 for _, rel := range o.cfg.LocalBootstrapPaths { full := filepath.Join(o.cfg.IACRepoPath, rel) - o.log.Printf("local bootstrap apply -k %s", full) + o.log.Printf("local bootstrap apply rel=%s path=%s", rel, full) if o.runner.DryRun { + successes++ continue } if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil { o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err) o.log.Printf("local bootstrap fallback render/apply with LoadRestrictionsNone for %s", full) if fallbackErr := o.applyKustomizeFallback(ctx, full); fallbackErr != nil { - failures++ o.log.Printf("warning: local bootstrap fallback failed at %s: %v", full, fallbackErr) - continue + o.log.Printf("local bootstrap cache apply for rel=%s", rel) + if cacheErr := o.applyBootstrapCache(ctx, rel); cacheErr != nil { + failures++ + o.log.Printf("warning: local bootstrap cache apply failed for rel=%s: %v", rel, cacheErr) + continue + } } } + successes++ } - if failures == len(o.cfg.LocalBootstrapPaths) { + if failures > 0 && successes == 0 { return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures) } return nil @@ -1213,6 +1288,99 @@ func (o *Orchestrator) applyKustomizeFallback(ctx context.Context, full string) return nil } +func (o *Orchestrator) syncLocalIACRepo(ctx context.Context) error { + repo := strings.TrimSpace(o.cfg.IACRepoPath) + if repo == "" { + return fmt.Errorf("iac repo path is empty") + } + gitDir := filepath.Join(repo, ".git") + if stat, err := os.Stat(gitDir); err != nil || stat.IsDir() == false { + return fmt.Errorf("iac repo %s is not a git checkout", repo) + } + statusOut, statusErr := o.runSensitive(ctx, 10*time.Second, "git", "-C", repo, "status", "--porcelain") + if statusErr != nil { + return fmt.Errorf("inspect iac repo working tree: %w", statusErr) + } + if strings.TrimSpace(statusOut) != "" { + o.log.Printf("warning: skipping local titan-iac sync because working tree is dirty") + return nil + } + branch := strings.TrimSpace(o.cfg.ExpectedFluxBranch) + if branch == "" { + branch = "main" + } + if _, err := o.runSensitive(ctx, 45*time.Second, "git", "-C", repo, "fetch", "origin", "--prune"); err != nil { + return fmt.Errorf("git fetch origin: %w", err) + } + if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "checkout", branch); err != nil { + return fmt.Errorf("git checkout %s: %w", branch, err) + } + if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "reset", "--hard", "origin/"+branch); err != nil { + return fmt.Errorf("git reset --hard origin/%s: %w", branch, err) + } + return nil +} + +func (o *Orchestrator) refreshBootstrapCache(ctx context.Context) error { + if len(o.cfg.LocalBootstrapPaths) == 0 { + return nil + } + if err := os.MkdirAll(o.bootstrapCacheDir(), 0o755); err != nil { + return fmt.Errorf("ensure bootstrap cache dir: %w", err) + } + rendered := 0 + for _, rel := range o.cfg.LocalBootstrapPaths { + rel = strings.TrimSpace(rel) + if rel == "" { + continue + } + full := filepath.Join(o.cfg.IACRepoPath, rel) + if stat, err := os.Stat(full); err != nil || !stat.IsDir() { + o.log.Printf("warning: skip bootstrap cache render for rel=%s (path missing)", rel) + continue + } + cmd := fmt.Sprintf("kubectl kustomize --load-restrictor=LoadRestrictionsNone %q", full) + manifest, err := o.runSensitive(ctx, 2*time.Minute, "sh", "-lc", cmd) + if err != nil { + o.log.Printf("warning: bootstrap cache render failed for rel=%s: %v", rel, err) + continue + } + cachePath := o.bootstrapCachePath(rel) + if err := os.WriteFile(cachePath, []byte(manifest+"\n"), 0o644); err != nil { + o.log.Printf("warning: bootstrap cache write failed for rel=%s path=%s: %v", rel, cachePath, err) + continue + } + rendered++ + } + if rendered == 0 { + return fmt.Errorf("no bootstrap cache manifests rendered") + } + o.log.Printf("bootstrap cache refreshed (%d paths)", rendered) + return nil +} + +func (o *Orchestrator) applyBootstrapCache(ctx context.Context, rel string) error { + cachePath := o.bootstrapCachePath(rel) + if _, err := os.Stat(cachePath); err != nil { + return fmt.Errorf("bootstrap cache missing at %s: %w", cachePath, err) + } + if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-f", cachePath); err != nil { + return err + } + return nil +} + +func (o *Orchestrator) bootstrapCacheDir() string { + return filepath.Join(o.cfg.State.Dir, "bootstrap-cache") +} + +func (o *Orchestrator) bootstrapCachePath(rel string) string { + safe := strings.TrimSpace(rel) + safe = strings.ReplaceAll(safe, "/", "__") + safe = strings.ReplaceAll(safe, string(os.PathSeparator), "__") + return filepath.Join(o.bootstrapCacheDir(), safe+".yaml") +} + func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) { if o.runner.DryRun { return true, nil @@ -1237,6 +1405,184 @@ func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.D } } +func (o *Orchestrator) waitForStorageReady(ctx context.Context) error { + if o.runner.DryRun { + return nil + } + wait := time.Duration(o.cfg.Startup.StorageReadyWaitSeconds) * time.Second + if wait <= 0 { + wait = 420 * time.Second + } + poll := time.Duration(o.cfg.Startup.StorageReadyPollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(wait) + lastReason := "unknown" + for { + ok, reason, err := o.storageReady(ctx) + if err != nil { + lastReason = err.Error() + } else { + lastReason = reason + } + if ok { + o.log.Printf("storage readiness check passed (%s)", reason) + return nil + } + if time.Now().After(deadline) { + return fmt.Errorf("startup blocked: storage readiness not satisfied within %s (%s)", wait, lastReason) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func (o *Orchestrator) storageReady(ctx context.Context) (bool, string, error) { + minReady := o.cfg.Startup.StorageMinReadyNodes + if minReady <= 0 { + minReady = 2 + } + longhornOut, err := o.kubectl( + ctx, + 15*time.Second, + "-n", + "longhorn-system", + "get", + "nodes.longhorn.io", + "-o", + `jsonpath={range .items[*]}{.metadata.name}{":"}{.status.conditions[?(@.type=="Ready")].status}{":"}{.status.conditions[?(@.type=="Schedulable")].status}{"\n"}{end}`, + ) + if err != nil { + return false, "", fmt.Errorf("query longhorn nodes: %w", err) + } + readyNodes := 0 + for _, line := range lines(longhornOut) { + parts := strings.Split(line, ":") + if len(parts) < 3 { + continue + } + ready := strings.EqualFold(strings.TrimSpace(parts[1]), "true") + sched := strings.EqualFold(strings.TrimSpace(parts[2]), "true") + if ready && sched { + readyNodes++ + } + } + if readyNodes < minReady { + return false, fmt.Sprintf("longhorn ready+sched nodes %d/%d", readyNodes, minReady), nil + } + + for _, item := range o.cfg.Startup.StorageCriticalPVCs { + item = strings.TrimSpace(item) + if item == "" { + continue + } + parts := strings.SplitN(item, "/", 2) + if len(parts) != 2 { + return false, "", fmt.Errorf("invalid storage_critical_pvcs entry %q", item) + } + ns := strings.TrimSpace(parts[0]) + name := strings.TrimSpace(parts[1]) + out, pvcErr := o.kubectl(ctx, 15*time.Second, "-n", ns, "get", "pvc", name, "-o", "jsonpath={.status.phase}") + if pvcErr != nil { + if isNotFoundErr(pvcErr) { + return false, fmt.Sprintf("pvc %s/%s not found", ns, name), nil + } + return false, "", fmt.Errorf("query pvc %s/%s: %w", ns, name, pvcErr) + } + if !strings.EqualFold(strings.TrimSpace(out), "Bound") { + return false, fmt.Sprintf("pvc %s/%s phase=%s", ns, name, strings.TrimSpace(out)), nil + } + } + + return true, fmt.Sprintf("longhorn ready+sched nodes=%d critical pvcs bound=%d", readyNodes, len(o.cfg.Startup.StorageCriticalPVCs)), nil +} + +func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error { + if o.runner.DryRun { + return nil + } + wait := time.Duration(o.cfg.Startup.PostStartProbeWaitSeconds) * time.Second + if wait <= 0 { + wait = 240 * time.Second + } + poll := time.Duration(o.cfg.Startup.PostStartProbePollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(wait) + lastFailure := "unknown" + for { + ok, failure := o.postStartProbesReady(ctx) + if ok { + o.log.Printf("post-start probes passed") + return nil + } + lastFailure = failure + if time.Now().After(deadline) { + return fmt.Errorf("startup blocked: post-start probes did not pass within %s (%s)", wait, lastFailure) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string) { + probes := make([]string, 0, len(o.cfg.Startup.PostStartProbes)) + for _, p := range o.cfg.Startup.PostStartProbes { + p = strings.TrimSpace(p) + if p != "" { + probes = append(probes, p) + } + } + if len(probes) == 0 { + return true, "no probes configured" + } + + for _, probe := range probes { + code, err := o.httpProbe(ctx, probe) + if err != nil { + return false, fmt.Sprintf("%s: %v", probe, err) + } + if code < 200 || code >= 400 { + return false, fmt.Sprintf("%s: unexpected status code=%d", probe, code) + } + } + return true, "all probes successful" +} + +func (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error) { + out, err := o.run( + ctx, + 20*time.Second, + "curl", + "--silent", + "--show-error", + "--location", + "--max-time", + "12", + "--output", + "/dev/null", + "--write-out", + "%{http_code}", + probeURL, + ) + if err != nil { + return 0, err + } + code, convErr := strconv.Atoi(strings.TrimSpace(out)) + if convErr != nil { + return 0, fmt.Errorf("parse http status %q: %w", strings.TrimSpace(out), convErr) + } + return code, nil +} + func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error { if err := o.patchFluxSuspendAll(ctx, false); err != nil { return err @@ -1678,16 +2024,56 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) { "get", "secret", "vault-init", "-o", "jsonpath={.data.unseal_key_b64}", ) - if err != nil { - return "", fmt.Errorf("read vault-init secret: %w", err) + if err == nil { + decoded, decodeErr := base64.StdEncoding.DecodeString(strings.TrimSpace(out)) + if decodeErr == nil { + key := strings.TrimSpace(string(decoded)) + if key != "" { + o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(key) }) + return key, nil + } + err = fmt.Errorf("vault-init unseal key is empty") + } else { + err = fmt.Errorf("decode vault-init unseal_key_b64: %w", decodeErr) + } + } else { + err = fmt.Errorf("read vault-init secret: %w", err) } - decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out)) - if err != nil { - return "", fmt.Errorf("decode vault-init unseal_key_b64: %w", err) + + fallbackKey, fileErr := o.readVaultUnsealKeyFile() + if fileErr == nil { + o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile) + return fallbackKey, nil } - key := strings.TrimSpace(string(decoded)) + return "", fmt.Errorf("%v; fallback %v", err, fileErr) +} + +func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error { + path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile) + if path == "" { + return fmt.Errorf("vault unseal key file path is empty") + } + if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + return fmt.Errorf("ensure vault unseal key dir: %w", err) + } + if err := os.WriteFile(path, []byte(strings.TrimSpace(key)+"\n"), 0o600); err != nil { + return fmt.Errorf("write vault unseal key file: %w", err) + } + return nil +} + +func (o *Orchestrator) readVaultUnsealKeyFile() (string, error) { + path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile) + if path == "" { + return "", fmt.Errorf("vault unseal key file path is empty") + } + b, err := os.ReadFile(path) + if err != nil { + return "", fmt.Errorf("read vault unseal key file %s: %w", path, err) + } + key := strings.TrimSpace(string(b)) if key == "" { - return "", fmt.Errorf("vault-init unseal key is empty") + return "", fmt.Errorf("vault unseal key file %s is empty", path) } return key, nil } diff --git a/internal/config/config.go b/internal/config/config.go index 56aec4d..634c566 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -3,6 +3,7 @@ package config import ( "fmt" "os" + "strings" "gopkg.in/yaml.v3" ) @@ -33,14 +34,26 @@ type Config struct { } type Startup struct { - APIWaitSeconds int `yaml:"api_wait_seconds"` - APIPollSeconds int `yaml:"api_poll_seconds"` - RequireTimeSync bool `yaml:"require_time_sync"` - TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` - TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` - ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` - AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` - EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + APIWaitSeconds int `yaml:"api_wait_seconds"` + APIPollSeconds int `yaml:"api_poll_seconds"` + RequireTimeSync bool `yaml:"require_time_sync"` + TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` + TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` + TimeSyncMode string `yaml:"time_sync_mode"` + TimeSyncQuorum int `yaml:"time_sync_quorum"` + ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` + AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` + EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + RequireStorageReady bool `yaml:"require_storage_ready"` + StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` + StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` + StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` + StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` + RequirePostStartProbes bool `yaml:"require_post_start_probes"` + PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` + PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` + PostStartProbes []string `yaml:"post_start_probes"` + VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` } type Shutdown struct { @@ -162,6 +175,12 @@ func (c Config) Validate() error { if c.Startup.TimeSyncPollSeconds <= 0 { return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0") } + if c.Startup.TimeSyncMode != "strict" && c.Startup.TimeSyncMode != "quorum" { + return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum") + } + if c.Startup.TimeSyncMode == "quorum" && c.Startup.TimeSyncQuorum <= 0 { + return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum") + } if c.Startup.EtcdRestoreControlPlane != "" { found := false for _, cp := range c.ControlPlanes { @@ -174,6 +193,37 @@ func (c Config) Validate() error { return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set") } } + if c.Startup.StorageReadyWaitSeconds <= 0 { + return fmt.Errorf("config.startup.storage_ready_wait_seconds must be > 0") + } + if c.Startup.StorageReadyPollSeconds <= 0 { + return fmt.Errorf("config.startup.storage_ready_poll_seconds must be > 0") + } + if c.Startup.StorageMinReadyNodes <= 0 { + return fmt.Errorf("config.startup.storage_min_ready_nodes must be > 0") + } + for _, pvc := range c.Startup.StorageCriticalPVCs { + if strings.Count(strings.TrimSpace(pvc), "/") != 1 { + return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc) + } + } + if c.Startup.PostStartProbeWaitSeconds <= 0 { + return fmt.Errorf("config.startup.post_start_probe_wait_seconds must be > 0") + } + if c.Startup.PostStartProbePollSeconds <= 0 { + return fmt.Errorf("config.startup.post_start_probe_poll_seconds must be > 0") + } + if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 { + return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true") + } + for _, probe := range c.Startup.PostStartProbes { + if strings.TrimSpace(probe) == "" { + return fmt.Errorf("config.startup.post_start_probes entries must not be empty") + } + } + if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { + return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty") + } if c.SSHPort <= 0 || c.SSHPort > 65535 { return fmt.Errorf("config.ssh_port must be in range 1-65535") } @@ -250,9 +300,30 @@ func defaults() Config { RequireTimeSync: true, TimeSyncWaitSeconds: 240, TimeSyncPollSeconds: 5, + TimeSyncMode: "quorum", + TimeSyncQuorum: 2, ReconcileAccessOnBoot: true, AutoEtcdRestoreOnAPIFailure: true, EtcdRestoreControlPlane: "titan-0a", + RequireStorageReady: true, + StorageReadyWaitSeconds: 420, + StorageReadyPollSeconds: 5, + StorageMinReadyNodes: 2, + StorageCriticalPVCs: []string{ + "vault/data-vault-0", + "postgres/postgres-data-postgres-0", + "gitea/gitea-data", + "sso/keycloak-data", + }, + RequirePostStartProbes: true, + PostStartProbeWaitSeconds: 240, + PostStartProbePollSeconds: 5, + PostStartProbes: []string{ + "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", + "https://scm.bstein.dev/user/login", + "https://metrics.bstein.dev/login", + }, + VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key", }, Shutdown: Shutdown{ DefaultBudgetSeconds: 1380, @@ -319,9 +390,51 @@ func (c *Config) applyDefaults() { if c.Startup.TimeSyncPollSeconds <= 0 { c.Startup.TimeSyncPollSeconds = 5 } + if c.Startup.TimeSyncMode == "" { + c.Startup.TimeSyncMode = "quorum" + } + if c.Startup.TimeSyncQuorum <= 0 { + c.Startup.TimeSyncQuorum = 2 + } + if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 { + c.Startup.TimeSyncQuorum = len(c.ControlPlanes) + } if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 { c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0] } + if c.Startup.StorageReadyWaitSeconds <= 0 { + c.Startup.StorageReadyWaitSeconds = 420 + } + if c.Startup.StorageReadyPollSeconds <= 0 { + c.Startup.StorageReadyPollSeconds = 5 + } + if c.Startup.StorageMinReadyNodes <= 0 { + c.Startup.StorageMinReadyNodes = 2 + } + if len(c.Startup.StorageCriticalPVCs) == 0 { + c.Startup.StorageCriticalPVCs = []string{ + "vault/data-vault-0", + "postgres/postgres-data-postgres-0", + "gitea/gitea-data", + "sso/keycloak-data", + } + } + if c.Startup.PostStartProbeWaitSeconds <= 0 { + c.Startup.PostStartProbeWaitSeconds = 240 + } + if c.Startup.PostStartProbePollSeconds <= 0 { + c.Startup.PostStartProbePollSeconds = 5 + } + if len(c.Startup.PostStartProbes) == 0 { + c.Startup.PostStartProbes = []string{ + "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", + "https://scm.bstein.dev/user/login", + "https://metrics.bstein.dev/login", + } + } + if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { + c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key" + } if c.SSHPort <= 0 { c.SSHPort = 2277 } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d08dfb7..c6339ca 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -92,4 +92,35 @@ state: if cfg.Startup.EtcdRestoreControlPlane == "" { t.Fatalf("expected startup etcd restore control plane default to be set") } + if cfg.Startup.TimeSyncMode == "" { + t.Fatalf("expected startup time sync mode default to be set") + } + if cfg.Startup.VaultUnsealKeyFile == "" { + t.Fatalf("expected startup vault unseal key file default to be set") + } +} + +func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) { + cfg := defaults() + cfg.Startup.TimeSyncMode = "invalid" + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid time_sync_mode") + } +} + +func TestValidateRejectsBadStoragePVCFormat(t *testing.T) { + cfg := defaults() + cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"} + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid storage_critical_pvcs entry") + } +} + +func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) { + cfg := defaults() + cfg.Startup.RequirePostStartProbes = true + cfg.Startup.PostStartProbes = nil + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error when post start probes are required but empty") + } } diff --git a/scripts/install.sh b/scripts/install.sh index 4e5d992..6cbe732 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -209,6 +209,31 @@ migrate_hecate_config() { echo "[install] added startup time sync + access reconciliation defaults" changed=1 fi + if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \ + && ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/hecate.yaml"; then + sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/hecate.yaml" + echo "[install] added startup time sync quorum defaults" + changed=1 + fi + if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml" \ + && ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then + sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/hecate.yaml" + echo "[install] added startup storage readiness defaults" + changed=1 + fi + if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" \ + && ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then + sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration\n - https://scm.bstein.dev/user/login\n - https://metrics.bstein.dev/login\n vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml" + echo "[install] added startup post-start probe + vault key fallback defaults" + changed=1 + fi + if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then + if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/hecate.yaml"; then + sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/login$/a\ vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml" + echo "[install] added startup.vault_unseal_key_file default" + changed=1 + fi + fi local role role="$(read_hecate_role)" @@ -371,14 +396,14 @@ migrate_hecate_config() { changed=1 fi - if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then - perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml" + if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/hecate.yaml"; then + perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml" echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity" changed=1 fi if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then - perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml" + perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml" echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity" changed=1 fi