hecate: harden startup with storage gates and fallback cache

This commit is contained in:
Brad Stein 2026-04-05 01:55:56 -03:00
parent a05973bf2b
commit 72d33bc2ce
7 changed files with 636 additions and 24 deletions

View File

@ -48,9 +48,28 @@ startup:
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5
time_sync_mode: quorum
time_sync_quorum: 2
reconcile_access_on_boot: true
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
require_storage_ready: true
storage_ready_wait_seconds: 420
storage_ready_poll_seconds: 5
storage_min_ready_nodes: 2
storage_critical_pvcs:
- vault/data-vault-0
- postgres/postgres-data-postgres-0
- gitea/gitea-data
- sso/keycloak-data
require_post_start_probes: true
post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5
post_start_probes:
- https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
shutdown:
default_budget_seconds: 1380
history_min_samples: 3

View File

@ -114,9 +114,28 @@ startup:
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5
time_sync_mode: quorum
time_sync_quorum: 2
reconcile_access_on_boot: true
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
require_storage_ready: true
storage_ready_wait_seconds: 420
storage_ready_poll_seconds: 5
storage_min_ready_nodes: 2
storage_critical_pvcs:
- vault/data-vault-0
- postgres/postgres-data-postgres-0
- gitea/gitea-data
- sso/keycloak-data
require_post_start_probes: true
post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5
post_start_probes:
- https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
shutdown:
default_budget_seconds: 1380
history_min_samples: 3

View File

@ -114,9 +114,28 @@ startup:
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5
time_sync_mode: quorum
time_sync_quorum: 2
reconcile_access_on_boot: true
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
require_storage_ready: true
storage_ready_wait_seconds: 420
storage_ready_poll_seconds: 5
storage_min_ready_nodes: 2
storage_critical_pvcs:
- vault/data-vault-0
- postgres/postgres-data-postgres-0
- gitea/gitea-data
- sso/keycloak-data
require_post_start_probes: true
post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5
post_start_probes:
- https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
shutdown:
default_budget_seconds: 1380
history_min_samples: 3

View File

@ -141,6 +141,8 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
if err := o.preflightExternalDatastore(ctx); err != nil {
return err
}
o.bestEffort("sync local titan-iac checkout", func() error { return o.syncLocalIACRepo(ctx) })
o.bestEffort("refresh bootstrap cache from local repo", func() error { return o.refreshBootstrapCache(ctx) })
if o.cfg.Startup.ReconcileAccessOnBoot {
o.bestEffort("reconcile control-plane access", func() error { return o.reconcileNodeAccess(ctx, o.cfg.ControlPlanes) })
}
@ -217,6 +219,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
if len(missing) > 0 {
o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
}
if o.cfg.Startup.RequireStorageReady {
if err := o.waitForStorageReady(ctx); err != nil {
return err
}
}
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
return err
}
@ -252,6 +259,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
if err := o.resumeFluxAndReconcile(ctx); err != nil {
return err
}
if o.cfg.Startup.RequirePostStartProbes {
if err := o.waitForPostStartProbes(ctx); err != nil {
return err
}
}
o.log.Printf("startup flow complete")
return nil
}
@ -923,11 +935,40 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro
if poll <= 0 {
poll = 5 * time.Second
}
mode := strings.ToLower(strings.TrimSpace(o.cfg.Startup.TimeSyncMode))
if mode == "" {
mode = "strict"
}
managedControlPlanes := 0
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue
}
if o.sshManaged(node) {
managedControlPlanes++
}
}
requiredQuorum := o.cfg.Startup.TimeSyncQuorum
if requiredQuorum <= 0 {
requiredQuorum = managedControlPlanes
if requiredQuorum <= 0 {
requiredQuorum = 1
}
}
if requiredQuorum > managedControlPlanes && managedControlPlanes > 0 {
requiredQuorum = managedControlPlanes
}
deadline := time.Now().Add(wait)
for {
unsynced := []string{}
syncedControlPlanes := 0
checkedControlPlanes := 0
localOut, localErr := o.run(ctx, 10*time.Second, "sh", "-lc", "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
if localErr != nil || !isTimeSynced(localOut) {
localSynced := localErr == nil && isTimeSynced(localOut)
if !localSynced {
if localErr != nil {
unsynced = append(unsynced, fmt.Sprintf("local(%v)", localErr))
} else {
@ -942,6 +983,7 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro
if !o.sshManaged(node) {
continue
}
checkedControlPlanes++
out, err := o.ssh(ctx, node, "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
if err != nil || !isTimeSynced(out) {
if err != nil {
@ -949,12 +991,38 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro
} else {
unsynced = append(unsynced, fmt.Sprintf("%s(%s)", node, strings.TrimSpace(out)))
}
} else {
syncedControlPlanes++
}
}
if len(unsynced) == 0 {
ready := false
switch mode {
case "quorum":
if localSynced && syncedControlPlanes >= requiredQuorum {
ready = true
}
default:
if localSynced && len(unsynced) == 0 {
ready = true
}
}
if ready {
return nil
}
if time.Now().After(deadline) {
if mode == "quorum" {
return fmt.Errorf(
"startup blocked: time sync quorum not ready within %s (mode=quorum local_synced=%t synced_control_planes=%d required=%d checked=%d details=%s)",
wait,
localSynced,
syncedControlPlanes,
requiredQuorum,
checkedControlPlanes,
strings.Join(unsynced, ", "),
)
}
return fmt.Errorf("startup blocked: time sync not ready within %s (%s)", wait, strings.Join(unsynced, ", "))
}
select {
@ -1183,23 +1251,30 @@ func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) erro
func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
failures := 0
successes := 0
for _, rel := range o.cfg.LocalBootstrapPaths {
full := filepath.Join(o.cfg.IACRepoPath, rel)
o.log.Printf("local bootstrap apply -k %s", full)
o.log.Printf("local bootstrap apply rel=%s path=%s", rel, full)
if o.runner.DryRun {
successes++
continue
}
if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
o.log.Printf("local bootstrap fallback render/apply with LoadRestrictionsNone for %s", full)
if fallbackErr := o.applyKustomizeFallback(ctx, full); fallbackErr != nil {
failures++
o.log.Printf("warning: local bootstrap fallback failed at %s: %v", full, fallbackErr)
continue
o.log.Printf("local bootstrap cache apply for rel=%s", rel)
if cacheErr := o.applyBootstrapCache(ctx, rel); cacheErr != nil {
failures++
o.log.Printf("warning: local bootstrap cache apply failed for rel=%s: %v", rel, cacheErr)
continue
}
}
}
successes++
}
if failures == len(o.cfg.LocalBootstrapPaths) {
if failures > 0 && successes == 0 {
return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
}
return nil
@ -1213,6 +1288,99 @@ func (o *Orchestrator) applyKustomizeFallback(ctx context.Context, full string)
return nil
}
// syncLocalIACRepo brings the local IaC checkout at cfg.IACRepoPath in line
// with origin/<branch>, where <branch> is cfg.ExpectedFluxBranch or "main"
// when that setting is blank.
//
// Steps: verify <repo>/.git is a directory, refuse to touch a dirty working
// tree, then `git fetch origin --prune`, `git checkout <branch>` and
// `git reset --hard origin/<branch>`.
//
// Returns nil without syncing (after logging a warning) when
// `git status --porcelain` prints anything, because the final hard reset
// would otherwise discard local changes. Returns an error when the repo path
// is empty, when .git cannot be stat'ed or is not a directory, or when any git
// command fails.
func (o *Orchestrator) syncLocalIACRepo(ctx context.Context) error {
	repo := strings.TrimSpace(o.cfg.IACRepoPath)
	if repo == "" {
		return fmt.Errorf("iac repo path is empty")
	}

	gitDir := filepath.Join(repo, ".git")
	stat, err := os.Stat(gitDir)
	if err != nil {
		// Keep the underlying cause (permission denied, missing path, ...) so the
		// operator is not told "not a git checkout" for an unrelated failure.
		return fmt.Errorf("iac repo %s is not a git checkout: %w", repo, err)
	}
	if !stat.IsDir() {
		return fmt.Errorf("iac repo %s is not a git checkout", repo)
	}

	statusOut, statusErr := o.runSensitive(ctx, 10*time.Second, "git", "-C", repo, "status", "--porcelain")
	if statusErr != nil {
		return fmt.Errorf("inspect iac repo working tree: %w", statusErr)
	}
	if strings.TrimSpace(statusOut) != "" {
		// A hard reset below would wipe uncommitted work; leave the tree alone.
		o.log.Printf("warning: skipping local titan-iac sync because working tree is dirty")
		return nil
	}

	branch := strings.TrimSpace(o.cfg.ExpectedFluxBranch)
	if branch == "" {
		branch = "main"
	}

	if _, err := o.runSensitive(ctx, 45*time.Second, "git", "-C", repo, "fetch", "origin", "--prune"); err != nil {
		return fmt.Errorf("git fetch origin: %w", err)
	}
	if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "checkout", branch); err != nil {
		return fmt.Errorf("git checkout %s: %w", branch, err)
	}
	if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "reset", "--hard", "origin/"+branch); err != nil {
		return fmt.Errorf("git reset --hard origin/%s: %w", branch, err)
	}
	return nil
}
// refreshBootstrapCache renders every entry of cfg.LocalBootstrapPaths with
// `kubectl kustomize --load-restrictor=LoadRestrictionsNone` and stores the
// result under bootstrapCacheDir(), one file per path (see
// bootstrapCachePath). applyBootstrapCache later applies these files.
//
// Per-path problems (missing directory, render failure, empty render, write
// failure) are logged as warnings and skipped so one bad path does not block
// the others. An existing cache file is only replaced once a complete,
// non-empty render has been written next to it.
//
// Returns nil when no bootstrap paths are configured, an error when the cache
// directory cannot be created, and an error when not a single manifest could
// be rendered.
func (o *Orchestrator) refreshBootstrapCache(ctx context.Context) error {
	if len(o.cfg.LocalBootstrapPaths) == 0 {
		return nil
	}
	if err := os.MkdirAll(o.bootstrapCacheDir(), 0o755); err != nil {
		return fmt.Errorf("ensure bootstrap cache dir: %w", err)
	}
	rendered := 0
	for _, rel := range o.cfg.LocalBootstrapPaths {
		rel = strings.TrimSpace(rel)
		if rel == "" {
			continue
		}
		full := filepath.Join(o.cfg.IACRepoPath, rel)
		if stat, err := os.Stat(full); err != nil || !stat.IsDir() {
			o.log.Printf("warning: skip bootstrap cache render for rel=%s (path missing)", rel)
			continue
		}
		// NOTE(review): %q applies Go string quoting, not shell quoting; this is
		// only safe while bootstrap paths come from trusted operator config.
		cmd := fmt.Sprintf("kubectl kustomize --load-restrictor=LoadRestrictionsNone %q", full)
		manifest, err := o.runSensitive(ctx, 2*time.Minute, "sh", "-lc", cmd)
		if err != nil {
			o.log.Printf("warning: bootstrap cache render failed for rel=%s: %v", rel, err)
			continue
		}
		if strings.TrimSpace(manifest) == "" {
			// Never clobber a previously good cache entry with an empty render.
			o.log.Printf("warning: bootstrap cache render produced no output for rel=%s", rel)
			continue
		}
		cachePath := o.bootstrapCachePath(rel)
		// Write to a sibling temp file and rename so an interrupted write cannot
		// leave a truncated manifest behind. The render is obtained through
		// runSensitive, so presumably it may carry secret material: keep the file
		// owner-only.
		tmpPath := cachePath + ".tmp"
		if err := os.WriteFile(tmpPath, []byte(manifest+"\n"), 0o600); err != nil {
			_ = os.Remove(tmpPath) // best-effort cleanup; the write error is what matters
			o.log.Printf("warning: bootstrap cache write failed for rel=%s path=%s: %v", rel, cachePath, err)
			continue
		}
		if err := os.Rename(tmpPath, cachePath); err != nil {
			_ = os.Remove(tmpPath) // best-effort cleanup; the rename error is what matters
			o.log.Printf("warning: bootstrap cache write failed for rel=%s path=%s: %v", rel, cachePath, err)
			continue
		}
		rendered++
	}
	if rendered == 0 {
		return fmt.Errorf("no bootstrap cache manifests rendered")
	}
	o.log.Printf("bootstrap cache refreshed (%d paths)", rendered)
	return nil
}
// applyBootstrapCache applies the cached manifest for the bootstrap path rel
// with `kubectl apply -f`. It fails early, naming the expected file, when the
// cache entry cannot be stat'ed; otherwise the kubectl error (if any) is
// returned unchanged.
func (o *Orchestrator) applyBootstrapCache(ctx context.Context, rel string) error {
	manifestPath := o.bootstrapCachePath(rel)

	_, statErr := os.Stat(manifestPath)
	if statErr != nil {
		return fmt.Errorf("bootstrap cache missing at %s: %w", manifestPath, statErr)
	}

	_, applyErr := o.kubectl(ctx, 2*time.Minute, "apply", "-f", manifestPath)
	return applyErr
}
// bootstrapCacheDir returns the directory holding cached bootstrap manifests:
// the "bootstrap-cache" subdirectory of the configured state directory.
func (o *Orchestrator) bootstrapCacheDir() string {
	stateDir := o.cfg.State.Dir
	return filepath.Join(stateDir, "bootstrap-cache")
}
// bootstrapCachePath maps a bootstrap path such as "services/vault" to its
// cache file inside bootstrapCacheDir(). Surrounding whitespace is trimmed and
// every "/" and OS path separator becomes "__", so the result is a single
// file name with a ".yaml" suffix (e.g. "services__vault.yaml").
func (o *Orchestrator) bootstrapCachePath(rel string) string {
	fileName := strings.TrimSpace(rel)
	for _, sep := range []string{"/", string(os.PathSeparator)} {
		fileName = strings.ReplaceAll(fileName, sep, "__")
	}
	fileName += ".yaml"
	return filepath.Join(o.bootstrapCacheDir(), fileName)
}
func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
if o.runner.DryRun {
return true, nil
@ -1237,6 +1405,184 @@ func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.D
}
}
// waitForStorageReady polls storageReady until it reports success, the wait
// budget runs out, or ctx is cancelled.
//
// The budget and poll interval come from cfg.Startup.StorageReadyWaitSeconds
// and StorageReadyPollSeconds, falling back to 420s and 5s when non-positive.
// In dry-run mode the check is skipped entirely. On timeout the error carries
// the most recent reason (or query error) reported by storageReady.
func (o *Orchestrator) waitForStorageReady(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	timeout := time.Duration(o.cfg.Startup.StorageReadyWaitSeconds) * time.Second
	if timeout <= 0 {
		timeout = 420 * time.Second
	}
	interval := time.Duration(o.cfg.Startup.StorageReadyPollSeconds) * time.Second
	if interval <= 0 {
		interval = 5 * time.Second
	}

	giveUpAt := time.Now().Add(timeout)
	detail := "unknown"
	for {
		satisfied, summary, checkErr := o.storageReady(ctx)
		// A query error takes precedence over the (empty) summary in the report.
		detail = summary
		if checkErr != nil {
			detail = checkErr.Error()
		}
		if satisfied {
			o.log.Printf("storage readiness check passed (%s)", summary)
			return nil
		}
		if time.Now().After(giveUpAt) {
			return fmt.Errorf("startup blocked: storage readiness not satisfied within %s (%s)", timeout, detail)
		}
		select {
		case <-time.After(interval):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// storageReady performs one storage readiness check.
//
// It first lists nodes.longhorn.io in the longhorn-system namespace and counts
// the nodes whose Ready and Schedulable conditions are both "True"; fewer than
// cfg.Startup.StorageMinReadyNodes (default 2 when non-positive) means not
// ready. It then requires every entry of cfg.Startup.StorageCriticalPVCs
// ("namespace/name") to exist with phase Bound.
//
// Returns (ready, human-readable reason, error). A non-nil error means the
// check itself could not be carried out (kubectl failure or a malformed PVC
// entry); "not ready yet" conditions are reported through the reason with a
// nil error so the caller can keep polling.
func (o *Orchestrator) storageReady(ctx context.Context) (bool, string, error) {
	minReady := o.cfg.Startup.StorageMinReadyNodes
	if minReady <= 0 {
		minReady = 2
	}
	// One line per node: <name>:<Ready status>:<Schedulable status>.
	longhornOut, err := o.kubectl(
		ctx,
		15*time.Second,
		"-n",
		"longhorn-system",
		"get",
		"nodes.longhorn.io",
		"-o",
		`jsonpath={range .items[*]}{.metadata.name}{":"}{.status.conditions[?(@.type=="Ready")].status}{":"}{.status.conditions[?(@.type=="Schedulable")].status}{"\n"}{end}`,
	)
	if err != nil {
		return false, "", fmt.Errorf("query longhorn nodes: %w", err)
	}
	readyNodes := 0
	for _, line := range lines(longhornOut) {
		parts := strings.Split(line, ":")
		if len(parts) < 3 {
			// Node without both conditions reported yet; it cannot count as ready.
			continue
		}
		ready := strings.EqualFold(strings.TrimSpace(parts[1]), "true")
		sched := strings.EqualFold(strings.TrimSpace(parts[2]), "true")
		if ready && sched {
			readyNodes++
		}
	}
	if readyNodes < minReady {
		return false, fmt.Sprintf("longhorn ready+sched nodes %d/%d", readyNodes, minReady), nil
	}

	// Count what was actually verified so the summary stays truthful even when
	// blank entries are skipped.
	boundPVCs := 0
	for _, item := range o.cfg.Startup.StorageCriticalPVCs {
		item = strings.TrimSpace(item)
		if item == "" {
			continue
		}
		parts := strings.SplitN(item, "/", 2)
		if len(parts) != 2 {
			return false, "", fmt.Errorf("invalid storage_critical_pvcs entry %q", item)
		}
		ns := strings.TrimSpace(parts[0])
		name := strings.TrimSpace(parts[1])
		if ns == "" || name == "" {
			// "/name" or "ns/" would otherwise be passed to kubectl with an empty
			// namespace or resource name.
			return false, "", fmt.Errorf("invalid storage_critical_pvcs entry %q", item)
		}
		out, pvcErr := o.kubectl(ctx, 15*time.Second, "-n", ns, "get", "pvc", name, "-o", "jsonpath={.status.phase}")
		if pvcErr != nil {
			if isNotFoundErr(pvcErr) {
				return false, fmt.Sprintf("pvc %s/%s not found", ns, name), nil
			}
			return false, "", fmt.Errorf("query pvc %s/%s: %w", ns, name, pvcErr)
		}
		if !strings.EqualFold(strings.TrimSpace(out), "Bound") {
			return false, fmt.Sprintf("pvc %s/%s phase=%s", ns, name, strings.TrimSpace(out)), nil
		}
		boundPVCs++
	}
	return true, fmt.Sprintf("longhorn ready+sched nodes=%d critical pvcs bound=%d", readyNodes, boundPVCs), nil
}
// waitForPostStartProbes repeatedly runs postStartProbesReady until every
// probe passes, the wait budget is exhausted, or ctx is cancelled.
//
// Budget and interval come from cfg.Startup.PostStartProbeWaitSeconds and
// PostStartProbePollSeconds (240s and 5s when non-positive). Dry-run mode
// skips probing. The timeout error includes the latest probe failure.
func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	timeout := time.Duration(o.cfg.Startup.PostStartProbeWaitSeconds) * time.Second
	if timeout <= 0 {
		timeout = 240 * time.Second
	}
	interval := time.Duration(o.cfg.Startup.PostStartProbePollSeconds) * time.Second
	if interval <= 0 {
		interval = 5 * time.Second
	}

	giveUpAt := time.Now().Add(timeout)
	for {
		passed, failure := o.postStartProbesReady(ctx)
		if passed {
			o.log.Printf("post-start probes passed")
			return nil
		}
		if time.Now().After(giveUpAt) {
			return fmt.Errorf("startup blocked: post-start probes did not pass within %s (%s)", timeout, failure)
		}
		select {
		case <-time.After(interval):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// postStartProbesReady runs each non-blank URL in cfg.Startup.PostStartProbes
// through httpProbe, in configuration order, and stops at the first failure.
//
// A probe passes when httpProbe succeeds with a status code in [200, 400).
// Returns (true, message) when all probes pass or none are configured, and
// (false, description of the first failing probe) otherwise.
func (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string) {
	attempted := 0
	for _, raw := range o.cfg.Startup.PostStartProbes {
		target := strings.TrimSpace(raw)
		if target == "" {
			continue
		}
		attempted++

		status, probeErr := o.httpProbe(ctx, target)
		if probeErr != nil {
			return false, fmt.Sprintf("%s: %v", target, probeErr)
		}
		if status >= 200 && status < 400 {
			continue
		}
		return false, fmt.Sprintf("%s: unexpected status code=%d", target, status)
	}
	if attempted == 0 {
		return true, "no probes configured"
	}
	return true, "all probes successful"
}
// httpProbe fetches probeURL with curl (following redirects, 12s curl
// time limit inside a 20s command budget), discards the body and returns the
// HTTP status code curl prints via --write-out.
//
// Returns (0, err) when the curl command fails or its output is not an
// integer.
func (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error) {
	raw, runErr := o.run(
		ctx,
		20*time.Second,
		"curl",
		"--silent", "--show-error", "--location",
		"--max-time", "12",
		"--output", "/dev/null",
		"--write-out", "%{http_code}",
		probeURL,
	)
	if runErr != nil {
		return 0, runErr
	}

	statusText := strings.TrimSpace(raw)
	status, parseErr := strconv.Atoi(statusText)
	if parseErr == nil {
		return status, nil
	}
	return 0, fmt.Errorf("parse http status %q: %w", statusText, parseErr)
}
func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error {
if err := o.patchFluxSuspendAll(ctx, false); err != nil {
return err
@ -1678,16 +2024,56 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) {
"get", "secret", "vault-init",
"-o", "jsonpath={.data.unseal_key_b64}",
)
if err != nil {
return "", fmt.Errorf("read vault-init secret: %w", err)
if err == nil {
decoded, decodeErr := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
if decodeErr == nil {
key := strings.TrimSpace(string(decoded))
if key != "" {
o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(key) })
return key, nil
}
err = fmt.Errorf("vault-init unseal key is empty")
} else {
err = fmt.Errorf("decode vault-init unseal_key_b64: %w", decodeErr)
}
} else {
err = fmt.Errorf("read vault-init secret: %w", err)
}
decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
if err != nil {
return "", fmt.Errorf("decode vault-init unseal_key_b64: %w", err)
fallbackKey, fileErr := o.readVaultUnsealKeyFile()
if fileErr == nil {
o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile)
return fallbackKey, nil
}
key := strings.TrimSpace(string(decoded))
return "", fmt.Errorf("%v; fallback %v", err, fileErr)
}
// writeVaultUnsealKeyFile stores key (trimmed, newline-terminated) at
// cfg.Startup.VaultUnsealKeyFile so readVaultUnsealKeyFile can serve it later.
//
// The key is written to a fresh temporary file in the target directory and
// renamed into place. os.CreateTemp creates that file with mode 0600, so the
// cached key ends up owner-only even if an older file at the path had wider
// permissions, and an interrupted write cannot leave a truncated key behind.
//
// Returns an error when the configured path is blank, when key is blank (an
// existing cached key is never replaced by an empty one), or when any
// filesystem step fails.
func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error {
	path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile)
	if path == "" {
		return fmt.Errorf("vault unseal key file path is empty")
	}
	key = strings.TrimSpace(key)
	if key == "" {
		return fmt.Errorf("vault unseal key is empty; not writing %s", path)
	}

	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o700); err != nil {
		return fmt.Errorf("ensure vault unseal key dir: %w", err)
	}

	tmp, err := os.CreateTemp(dir, filepath.Base(path)+".tmp-*")
	if err != nil {
		return fmt.Errorf("write vault unseal key file: %w", err)
	}
	tmpName := tmp.Name()

	_, writeErr := tmp.WriteString(key + "\n")
	if writeErr == nil {
		// Flush to disk before the rename makes the file visible under its final name.
		writeErr = tmp.Sync()
	}
	if closeErr := tmp.Close(); writeErr == nil {
		writeErr = closeErr
	}
	if writeErr == nil {
		writeErr = os.Rename(tmpName, path)
	}
	if writeErr != nil {
		_ = os.Remove(tmpName) // best-effort cleanup; report the original failure
		return fmt.Errorf("write vault unseal key file: %w", writeErr)
	}
	return nil
}
// readVaultUnsealKeyFile returns the whitespace-trimmed contents of the file
// at cfg.Startup.VaultUnsealKeyFile.
//
// Returns an error when the configured path is blank, when the file cannot be
// read, or when it contains only whitespace.
func (o *Orchestrator) readVaultUnsealKeyFile() (string, error) {
	path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile)
	if path == "" {
		return "", fmt.Errorf("vault unseal key file path is empty")
	}
	b, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("read vault unseal key file %s: %w", path, err)
	}
	key := strings.TrimSpace(string(b))
	if key == "" {
		// Name the file: the empty value came from the local cache, not from the
		// vault-init secret.
		return "", fmt.Errorf("vault unseal key file %s is empty", path)
	}
	return key, nil
}

View File

@ -3,6 +3,7 @@ package config
import (
"fmt"
"os"
"strings"
"gopkg.in/yaml.v3"
)
@ -33,14 +34,26 @@ type Config struct {
}
// Startup holds the `startup:` section of the hecate configuration. Each field
// is declared exactly once; the yaml tag is the key used in hecate.yaml.
type Startup struct {
	APIWaitSeconds int `yaml:"api_wait_seconds"`
	APIPollSeconds int `yaml:"api_poll_seconds"`

	// Time synchronisation gate.
	RequireTimeSync     bool `yaml:"require_time_sync"`
	TimeSyncWaitSeconds int  `yaml:"time_sync_wait_seconds"`
	TimeSyncPollSeconds int  `yaml:"time_sync_poll_seconds"`
	// TimeSyncMode must be "strict" or "quorum" (enforced by Validate).
	TimeSyncMode string `yaml:"time_sync_mode"`
	// TimeSyncQuorum must be > 0 when TimeSyncMode is "quorum".
	TimeSyncQuorum int `yaml:"time_sync_quorum"`

	ReconcileAccessOnBoot       bool   `yaml:"reconcile_access_on_boot"`
	AutoEtcdRestoreOnAPIFailure bool   `yaml:"auto_etcd_restore_on_api_failure"`
	EtcdRestoreControlPlane     string `yaml:"etcd_restore_control_plane"`

	// Storage readiness gate.
	RequireStorageReady     bool `yaml:"require_storage_ready"`
	StorageReadyWaitSeconds int  `yaml:"storage_ready_wait_seconds"`
	StorageReadyPollSeconds int  `yaml:"storage_ready_poll_seconds"`
	StorageMinReadyNodes    int  `yaml:"storage_min_ready_nodes"`
	// StorageCriticalPVCs entries use the form "namespace/name".
	StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`

	// Post-start HTTP probes.
	RequirePostStartProbes    bool     `yaml:"require_post_start_probes"`
	PostStartProbeWaitSeconds int      `yaml:"post_start_probe_wait_seconds"`
	PostStartProbePollSeconds int      `yaml:"post_start_probe_poll_seconds"`
	PostStartProbes           []string `yaml:"post_start_probes"`

	// VaultUnsealKeyFile is the path of the locally cached vault unseal key.
	VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
}
type Shutdown struct {
@ -162,6 +175,12 @@ func (c Config) Validate() error {
if c.Startup.TimeSyncPollSeconds <= 0 {
return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0")
}
if c.Startup.TimeSyncMode != "strict" && c.Startup.TimeSyncMode != "quorum" {
return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum")
}
if c.Startup.TimeSyncMode == "quorum" && c.Startup.TimeSyncQuorum <= 0 {
return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum")
}
if c.Startup.EtcdRestoreControlPlane != "" {
found := false
for _, cp := range c.ControlPlanes {
@ -174,6 +193,37 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
}
}
if c.Startup.StorageReadyWaitSeconds <= 0 {
return fmt.Errorf("config.startup.storage_ready_wait_seconds must be > 0")
}
if c.Startup.StorageReadyPollSeconds <= 0 {
return fmt.Errorf("config.startup.storage_ready_poll_seconds must be > 0")
}
if c.Startup.StorageMinReadyNodes <= 0 {
return fmt.Errorf("config.startup.storage_min_ready_nodes must be > 0")
}
for _, pvc := range c.Startup.StorageCriticalPVCs {
if strings.Count(strings.TrimSpace(pvc), "/") != 1 {
return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc)
}
}
if c.Startup.PostStartProbeWaitSeconds <= 0 {
return fmt.Errorf("config.startup.post_start_probe_wait_seconds must be > 0")
}
if c.Startup.PostStartProbePollSeconds <= 0 {
return fmt.Errorf("config.startup.post_start_probe_poll_seconds must be > 0")
}
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
}
for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
}
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
}
if c.SSHPort <= 0 || c.SSHPort > 65535 {
return fmt.Errorf("config.ssh_port must be in range 1-65535")
}
@ -250,9 +300,30 @@ func defaults() Config {
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
StorageCriticalPVCs: []string{
"vault/data-vault-0",
"postgres/postgres-data-postgres-0",
"gitea/gitea-data",
"sso/keycloak-data",
},
RequirePostStartProbes: true,
PostStartProbeWaitSeconds: 240,
PostStartProbePollSeconds: 5,
PostStartProbes: []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/user/login",
"https://metrics.bstein.dev/login",
},
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,
@ -319,9 +390,51 @@ func (c *Config) applyDefaults() {
if c.Startup.TimeSyncPollSeconds <= 0 {
c.Startup.TimeSyncPollSeconds = 5
}
if c.Startup.TimeSyncMode == "" {
c.Startup.TimeSyncMode = "quorum"
}
if c.Startup.TimeSyncQuorum <= 0 {
c.Startup.TimeSyncQuorum = 2
}
if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
}
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
}
if c.Startup.StorageReadyWaitSeconds <= 0 {
c.Startup.StorageReadyWaitSeconds = 420
}
if c.Startup.StorageReadyPollSeconds <= 0 {
c.Startup.StorageReadyPollSeconds = 5
}
if c.Startup.StorageMinReadyNodes <= 0 {
c.Startup.StorageMinReadyNodes = 2
}
if len(c.Startup.StorageCriticalPVCs) == 0 {
c.Startup.StorageCriticalPVCs = []string{
"vault/data-vault-0",
"postgres/postgres-data-postgres-0",
"gitea/gitea-data",
"sso/keycloak-data",
}
}
if c.Startup.PostStartProbeWaitSeconds <= 0 {
c.Startup.PostStartProbeWaitSeconds = 240
}
if c.Startup.PostStartProbePollSeconds <= 0 {
c.Startup.PostStartProbePollSeconds = 5
}
if len(c.Startup.PostStartProbes) == 0 {
c.Startup.PostStartProbes = []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/user/login",
"https://metrics.bstein.dev/login",
}
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
}
if c.SSHPort <= 0 {
c.SSHPort = 2277
}

View File

@ -92,4 +92,35 @@ state:
if cfg.Startup.EtcdRestoreControlPlane == "" {
t.Fatalf("expected startup etcd restore control plane default to be set")
}
if cfg.Startup.TimeSyncMode == "" {
t.Fatalf("expected startup time sync mode default to be set")
}
if cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup vault unseal key file default to be set")
}
}
// TestValidateRejectsInvalidTimeSyncMode checks that Validate refuses a
// time_sync_mode other than the supported values.
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
	cfg := defaults()
	cfg.Startup.TimeSyncMode = "invalid"

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid time_sync_mode")
}
// TestValidateRejectsBadStoragePVCFormat checks that Validate refuses a
// storage_critical_pvcs entry that is not of the form namespace/name.
func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
	cfg := defaults()
	malformed := []string{"vault-data-vault-0"} // no namespace separator
	cfg.Startup.StorageCriticalPVCs = malformed

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid storage_critical_pvcs entry")
}
// TestValidateRejectsMissingPostStartProbesWhenRequired checks that Validate
// refuses a config that requires post-start probes but lists none.
func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
	cfg := defaults()
	cfg.Startup.PostStartProbes = nil
	cfg.Startup.RequirePostStartProbes = true

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error when post start probes are required but empty")
}

View File

@ -209,6 +209,31 @@ migrate_hecate_config() {
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration\n - https://scm.bstein.dev/user/login\n - https://metrics.bstein.dev/login\n vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/login$/a\ vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
local role
role="$(read_hecate_role)"
@ -371,14 +396,14 @@ migrate_hecate_config() {
changed=1
fi
if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi