hecate: harden startup with storage gates and fallback cache
This commit is contained in:
parent
a05973bf2b
commit
72d33bc2ce
@ -48,9 +48,28 @@ startup:
|
|||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
time_sync_poll_seconds: 5
|
time_sync_poll_seconds: 5
|
||||||
|
time_sync_mode: quorum
|
||||||
|
time_sync_quorum: 2
|
||||||
reconcile_access_on_boot: true
|
reconcile_access_on_boot: true
|
||||||
auto_etcd_restore_on_api_failure: true
|
auto_etcd_restore_on_api_failure: true
|
||||||
etcd_restore_control_plane: titan-0a
|
etcd_restore_control_plane: titan-0a
|
||||||
|
require_storage_ready: true
|
||||||
|
storage_ready_wait_seconds: 420
|
||||||
|
storage_ready_poll_seconds: 5
|
||||||
|
storage_min_ready_nodes: 2
|
||||||
|
storage_critical_pvcs:
|
||||||
|
- vault/data-vault-0
|
||||||
|
- postgres/postgres-data-postgres-0
|
||||||
|
- gitea/gitea-data
|
||||||
|
- sso/keycloak-data
|
||||||
|
require_post_start_probes: true
|
||||||
|
post_start_probe_wait_seconds: 240
|
||||||
|
post_start_probe_poll_seconds: 5
|
||||||
|
post_start_probes:
|
||||||
|
- https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||||
|
- https://scm.bstein.dev/user/login
|
||||||
|
- https://metrics.bstein.dev/login
|
||||||
|
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 1380
|
default_budget_seconds: 1380
|
||||||
history_min_samples: 3
|
history_min_samples: 3
|
||||||
|
|||||||
@ -114,9 +114,28 @@ startup:
|
|||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
time_sync_poll_seconds: 5
|
time_sync_poll_seconds: 5
|
||||||
|
time_sync_mode: quorum
|
||||||
|
time_sync_quorum: 2
|
||||||
reconcile_access_on_boot: true
|
reconcile_access_on_boot: true
|
||||||
auto_etcd_restore_on_api_failure: true
|
auto_etcd_restore_on_api_failure: true
|
||||||
etcd_restore_control_plane: titan-0a
|
etcd_restore_control_plane: titan-0a
|
||||||
|
require_storage_ready: true
|
||||||
|
storage_ready_wait_seconds: 420
|
||||||
|
storage_ready_poll_seconds: 5
|
||||||
|
storage_min_ready_nodes: 2
|
||||||
|
storage_critical_pvcs:
|
||||||
|
- vault/data-vault-0
|
||||||
|
- postgres/postgres-data-postgres-0
|
||||||
|
- gitea/gitea-data
|
||||||
|
- sso/keycloak-data
|
||||||
|
require_post_start_probes: true
|
||||||
|
post_start_probe_wait_seconds: 240
|
||||||
|
post_start_probe_poll_seconds: 5
|
||||||
|
post_start_probes:
|
||||||
|
- https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||||
|
- https://scm.bstein.dev/user/login
|
||||||
|
- https://metrics.bstein.dev/login
|
||||||
|
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 1380
|
default_budget_seconds: 1380
|
||||||
history_min_samples: 3
|
history_min_samples: 3
|
||||||
|
|||||||
@ -114,9 +114,28 @@ startup:
|
|||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
time_sync_poll_seconds: 5
|
time_sync_poll_seconds: 5
|
||||||
|
time_sync_mode: quorum
|
||||||
|
time_sync_quorum: 2
|
||||||
reconcile_access_on_boot: true
|
reconcile_access_on_boot: true
|
||||||
auto_etcd_restore_on_api_failure: true
|
auto_etcd_restore_on_api_failure: true
|
||||||
etcd_restore_control_plane: titan-0a
|
etcd_restore_control_plane: titan-0a
|
||||||
|
require_storage_ready: true
|
||||||
|
storage_ready_wait_seconds: 420
|
||||||
|
storage_ready_poll_seconds: 5
|
||||||
|
storage_min_ready_nodes: 2
|
||||||
|
storage_critical_pvcs:
|
||||||
|
- vault/data-vault-0
|
||||||
|
- postgres/postgres-data-postgres-0
|
||||||
|
- gitea/gitea-data
|
||||||
|
- sso/keycloak-data
|
||||||
|
require_post_start_probes: true
|
||||||
|
post_start_probe_wait_seconds: 240
|
||||||
|
post_start_probe_poll_seconds: 5
|
||||||
|
post_start_probes:
|
||||||
|
- https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||||
|
- https://scm.bstein.dev/user/login
|
||||||
|
- https://metrics.bstein.dev/login
|
||||||
|
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 1380
|
default_budget_seconds: 1380
|
||||||
history_min_samples: 3
|
history_min_samples: 3
|
||||||
|
|||||||
@ -141,6 +141,8 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
if err := o.preflightExternalDatastore(ctx); err != nil {
|
if err := o.preflightExternalDatastore(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
o.bestEffort("sync local titan-iac checkout", func() error { return o.syncLocalIACRepo(ctx) })
|
||||||
|
o.bestEffort("refresh bootstrap cache from local repo", func() error { return o.refreshBootstrapCache(ctx) })
|
||||||
if o.cfg.Startup.ReconcileAccessOnBoot {
|
if o.cfg.Startup.ReconcileAccessOnBoot {
|
||||||
o.bestEffort("reconcile control-plane access", func() error { return o.reconcileNodeAccess(ctx, o.cfg.ControlPlanes) })
|
o.bestEffort("reconcile control-plane access", func() error { return o.reconcileNodeAccess(ctx, o.cfg.ControlPlanes) })
|
||||||
}
|
}
|
||||||
@ -217,6 +219,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
if len(missing) > 0 {
|
if len(missing) > 0 {
|
||||||
o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
|
o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
|
||||||
}
|
}
|
||||||
|
if o.cfg.Startup.RequireStorageReady {
|
||||||
|
if err := o.waitForStorageReady(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -252,6 +259,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
if err := o.resumeFluxAndReconcile(ctx); err != nil {
|
if err := o.resumeFluxAndReconcile(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
if o.cfg.Startup.RequirePostStartProbes {
|
||||||
|
if err := o.waitForPostStartProbes(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
o.log.Printf("startup flow complete")
|
o.log.Printf("startup flow complete")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -923,11 +935,40 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro
|
|||||||
if poll <= 0 {
|
if poll <= 0 {
|
||||||
poll = 5 * time.Second
|
poll = 5 * time.Second
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mode := strings.ToLower(strings.TrimSpace(o.cfg.Startup.TimeSyncMode))
|
||||||
|
if mode == "" {
|
||||||
|
mode = "strict"
|
||||||
|
}
|
||||||
|
managedControlPlanes := 0
|
||||||
|
for _, node := range nodes {
|
||||||
|
node = strings.TrimSpace(node)
|
||||||
|
if node == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if o.sshManaged(node) {
|
||||||
|
managedControlPlanes++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
requiredQuorum := o.cfg.Startup.TimeSyncQuorum
|
||||||
|
if requiredQuorum <= 0 {
|
||||||
|
requiredQuorum = managedControlPlanes
|
||||||
|
if requiredQuorum <= 0 {
|
||||||
|
requiredQuorum = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if requiredQuorum > managedControlPlanes && managedControlPlanes > 0 {
|
||||||
|
requiredQuorum = managedControlPlanes
|
||||||
|
}
|
||||||
|
|
||||||
deadline := time.Now().Add(wait)
|
deadline := time.Now().Add(wait)
|
||||||
for {
|
for {
|
||||||
unsynced := []string{}
|
unsynced := []string{}
|
||||||
|
syncedControlPlanes := 0
|
||||||
|
checkedControlPlanes := 0
|
||||||
localOut, localErr := o.run(ctx, 10*time.Second, "sh", "-lc", "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
|
localOut, localErr := o.run(ctx, 10*time.Second, "sh", "-lc", "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
|
||||||
if localErr != nil || !isTimeSynced(localOut) {
|
localSynced := localErr == nil && isTimeSynced(localOut)
|
||||||
|
if !localSynced {
|
||||||
if localErr != nil {
|
if localErr != nil {
|
||||||
unsynced = append(unsynced, fmt.Sprintf("local(%v)", localErr))
|
unsynced = append(unsynced, fmt.Sprintf("local(%v)", localErr))
|
||||||
} else {
|
} else {
|
||||||
@ -942,6 +983,7 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro
|
|||||||
if !o.sshManaged(node) {
|
if !o.sshManaged(node) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
checkedControlPlanes++
|
||||||
out, err := o.ssh(ctx, node, "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
|
out, err := o.ssh(ctx, node, "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
|
||||||
if err != nil || !isTimeSynced(out) {
|
if err != nil || !isTimeSynced(out) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -949,12 +991,38 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) erro
|
|||||||
} else {
|
} else {
|
||||||
unsynced = append(unsynced, fmt.Sprintf("%s(%s)", node, strings.TrimSpace(out)))
|
unsynced = append(unsynced, fmt.Sprintf("%s(%s)", node, strings.TrimSpace(out)))
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
syncedControlPlanes++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(unsynced) == 0 {
|
|
||||||
|
ready := false
|
||||||
|
switch mode {
|
||||||
|
case "quorum":
|
||||||
|
if localSynced && syncedControlPlanes >= requiredQuorum {
|
||||||
|
ready = true
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
if localSynced && len(unsynced) == 0 {
|
||||||
|
ready = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ready {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if time.Now().After(deadline) {
|
if time.Now().After(deadline) {
|
||||||
|
if mode == "quorum" {
|
||||||
|
return fmt.Errorf(
|
||||||
|
"startup blocked: time sync quorum not ready within %s (mode=quorum local_synced=%t synced_control_planes=%d required=%d checked=%d details=%s)",
|
||||||
|
wait,
|
||||||
|
localSynced,
|
||||||
|
syncedControlPlanes,
|
||||||
|
requiredQuorum,
|
||||||
|
checkedControlPlanes,
|
||||||
|
strings.Join(unsynced, ", "),
|
||||||
|
)
|
||||||
|
}
|
||||||
return fmt.Errorf("startup blocked: time sync not ready within %s (%s)", wait, strings.Join(unsynced, ", "))
|
return fmt.Errorf("startup blocked: time sync not ready within %s (%s)", wait, strings.Join(unsynced, ", "))
|
||||||
}
|
}
|
||||||
select {
|
select {
|
||||||
@ -1183,23 +1251,30 @@ func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) erro
|
|||||||
|
|
||||||
func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
|
func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
|
||||||
failures := 0
|
failures := 0
|
||||||
|
successes := 0
|
||||||
for _, rel := range o.cfg.LocalBootstrapPaths {
|
for _, rel := range o.cfg.LocalBootstrapPaths {
|
||||||
full := filepath.Join(o.cfg.IACRepoPath, rel)
|
full := filepath.Join(o.cfg.IACRepoPath, rel)
|
||||||
o.log.Printf("local bootstrap apply -k %s", full)
|
o.log.Printf("local bootstrap apply rel=%s path=%s", rel, full)
|
||||||
if o.runner.DryRun {
|
if o.runner.DryRun {
|
||||||
|
successes++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
|
if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
|
||||||
o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
|
o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
|
||||||
o.log.Printf("local bootstrap fallback render/apply with LoadRestrictionsNone for %s", full)
|
o.log.Printf("local bootstrap fallback render/apply with LoadRestrictionsNone for %s", full)
|
||||||
if fallbackErr := o.applyKustomizeFallback(ctx, full); fallbackErr != nil {
|
if fallbackErr := o.applyKustomizeFallback(ctx, full); fallbackErr != nil {
|
||||||
failures++
|
|
||||||
o.log.Printf("warning: local bootstrap fallback failed at %s: %v", full, fallbackErr)
|
o.log.Printf("warning: local bootstrap fallback failed at %s: %v", full, fallbackErr)
|
||||||
continue
|
o.log.Printf("local bootstrap cache apply for rel=%s", rel)
|
||||||
|
if cacheErr := o.applyBootstrapCache(ctx, rel); cacheErr != nil {
|
||||||
|
failures++
|
||||||
|
o.log.Printf("warning: local bootstrap cache apply failed for rel=%s: %v", rel, cacheErr)
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
successes++
|
||||||
}
|
}
|
||||||
if failures == len(o.cfg.LocalBootstrapPaths) {
|
if failures > 0 && successes == 0 {
|
||||||
return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
|
return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@ -1213,6 +1288,99 @@ func (o *Orchestrator) applyKustomizeFallback(ctx context.Context, full string)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) syncLocalIACRepo(ctx context.Context) error {
|
||||||
|
repo := strings.TrimSpace(o.cfg.IACRepoPath)
|
||||||
|
if repo == "" {
|
||||||
|
return fmt.Errorf("iac repo path is empty")
|
||||||
|
}
|
||||||
|
gitDir := filepath.Join(repo, ".git")
|
||||||
|
if stat, err := os.Stat(gitDir); err != nil || stat.IsDir() == false {
|
||||||
|
return fmt.Errorf("iac repo %s is not a git checkout", repo)
|
||||||
|
}
|
||||||
|
statusOut, statusErr := o.runSensitive(ctx, 10*time.Second, "git", "-C", repo, "status", "--porcelain")
|
||||||
|
if statusErr != nil {
|
||||||
|
return fmt.Errorf("inspect iac repo working tree: %w", statusErr)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(statusOut) != "" {
|
||||||
|
o.log.Printf("warning: skipping local titan-iac sync because working tree is dirty")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
branch := strings.TrimSpace(o.cfg.ExpectedFluxBranch)
|
||||||
|
if branch == "" {
|
||||||
|
branch = "main"
|
||||||
|
}
|
||||||
|
if _, err := o.runSensitive(ctx, 45*time.Second, "git", "-C", repo, "fetch", "origin", "--prune"); err != nil {
|
||||||
|
return fmt.Errorf("git fetch origin: %w", err)
|
||||||
|
}
|
||||||
|
if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "checkout", branch); err != nil {
|
||||||
|
return fmt.Errorf("git checkout %s: %w", branch, err)
|
||||||
|
}
|
||||||
|
if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "reset", "--hard", "origin/"+branch); err != nil {
|
||||||
|
return fmt.Errorf("git reset --hard origin/%s: %w", branch, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) refreshBootstrapCache(ctx context.Context) error {
|
||||||
|
if len(o.cfg.LocalBootstrapPaths) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(o.bootstrapCacheDir(), 0o755); err != nil {
|
||||||
|
return fmt.Errorf("ensure bootstrap cache dir: %w", err)
|
||||||
|
}
|
||||||
|
rendered := 0
|
||||||
|
for _, rel := range o.cfg.LocalBootstrapPaths {
|
||||||
|
rel = strings.TrimSpace(rel)
|
||||||
|
if rel == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
full := filepath.Join(o.cfg.IACRepoPath, rel)
|
||||||
|
if stat, err := os.Stat(full); err != nil || !stat.IsDir() {
|
||||||
|
o.log.Printf("warning: skip bootstrap cache render for rel=%s (path missing)", rel)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cmd := fmt.Sprintf("kubectl kustomize --load-restrictor=LoadRestrictionsNone %q", full)
|
||||||
|
manifest, err := o.runSensitive(ctx, 2*time.Minute, "sh", "-lc", cmd)
|
||||||
|
if err != nil {
|
||||||
|
o.log.Printf("warning: bootstrap cache render failed for rel=%s: %v", rel, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cachePath := o.bootstrapCachePath(rel)
|
||||||
|
if err := os.WriteFile(cachePath, []byte(manifest+"\n"), 0o644); err != nil {
|
||||||
|
o.log.Printf("warning: bootstrap cache write failed for rel=%s path=%s: %v", rel, cachePath, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rendered++
|
||||||
|
}
|
||||||
|
if rendered == 0 {
|
||||||
|
return fmt.Errorf("no bootstrap cache manifests rendered")
|
||||||
|
}
|
||||||
|
o.log.Printf("bootstrap cache refreshed (%d paths)", rendered)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) applyBootstrapCache(ctx context.Context, rel string) error {
|
||||||
|
cachePath := o.bootstrapCachePath(rel)
|
||||||
|
if _, err := os.Stat(cachePath); err != nil {
|
||||||
|
return fmt.Errorf("bootstrap cache missing at %s: %w", cachePath, err)
|
||||||
|
}
|
||||||
|
if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-f", cachePath); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) bootstrapCacheDir() string {
|
||||||
|
return filepath.Join(o.cfg.State.Dir, "bootstrap-cache")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) bootstrapCachePath(rel string) string {
|
||||||
|
safe := strings.TrimSpace(rel)
|
||||||
|
safe = strings.ReplaceAll(safe, "/", "__")
|
||||||
|
safe = strings.ReplaceAll(safe, string(os.PathSeparator), "__")
|
||||||
|
return filepath.Join(o.bootstrapCacheDir(), safe+".yaml")
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
|
func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
|
||||||
if o.runner.DryRun {
|
if o.runner.DryRun {
|
||||||
return true, nil
|
return true, nil
|
||||||
@ -1237,6 +1405,184 @@ func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.D
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) waitForStorageReady(ctx context.Context) error {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
wait := time.Duration(o.cfg.Startup.StorageReadyWaitSeconds) * time.Second
|
||||||
|
if wait <= 0 {
|
||||||
|
wait = 420 * time.Second
|
||||||
|
}
|
||||||
|
poll := time.Duration(o.cfg.Startup.StorageReadyPollSeconds) * time.Second
|
||||||
|
if poll <= 0 {
|
||||||
|
poll = 5 * time.Second
|
||||||
|
}
|
||||||
|
deadline := time.Now().Add(wait)
|
||||||
|
lastReason := "unknown"
|
||||||
|
for {
|
||||||
|
ok, reason, err := o.storageReady(ctx)
|
||||||
|
if err != nil {
|
||||||
|
lastReason = err.Error()
|
||||||
|
} else {
|
||||||
|
lastReason = reason
|
||||||
|
}
|
||||||
|
if ok {
|
||||||
|
o.log.Printf("storage readiness check passed (%s)", reason)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return fmt.Errorf("startup blocked: storage readiness not satisfied within %s (%s)", wait, lastReason)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(poll):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) storageReady(ctx context.Context) (bool, string, error) {
|
||||||
|
minReady := o.cfg.Startup.StorageMinReadyNodes
|
||||||
|
if minReady <= 0 {
|
||||||
|
minReady = 2
|
||||||
|
}
|
||||||
|
longhornOut, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
15*time.Second,
|
||||||
|
"-n",
|
||||||
|
"longhorn-system",
|
||||||
|
"get",
|
||||||
|
"nodes.longhorn.io",
|
||||||
|
"-o",
|
||||||
|
`jsonpath={range .items[*]}{.metadata.name}{":"}{.status.conditions[?(@.type=="Ready")].status}{":"}{.status.conditions[?(@.type=="Schedulable")].status}{"\n"}{end}`,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return false, "", fmt.Errorf("query longhorn nodes: %w", err)
|
||||||
|
}
|
||||||
|
readyNodes := 0
|
||||||
|
for _, line := range lines(longhornOut) {
|
||||||
|
parts := strings.Split(line, ":")
|
||||||
|
if len(parts) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ready := strings.EqualFold(strings.TrimSpace(parts[1]), "true")
|
||||||
|
sched := strings.EqualFold(strings.TrimSpace(parts[2]), "true")
|
||||||
|
if ready && sched {
|
||||||
|
readyNodes++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readyNodes < minReady {
|
||||||
|
return false, fmt.Sprintf("longhorn ready+sched nodes %d/%d", readyNodes, minReady), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, item := range o.cfg.Startup.StorageCriticalPVCs {
|
||||||
|
item = strings.TrimSpace(item)
|
||||||
|
if item == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.SplitN(item, "/", 2)
|
||||||
|
if len(parts) != 2 {
|
||||||
|
return false, "", fmt.Errorf("invalid storage_critical_pvcs entry %q", item)
|
||||||
|
}
|
||||||
|
ns := strings.TrimSpace(parts[0])
|
||||||
|
name := strings.TrimSpace(parts[1])
|
||||||
|
out, pvcErr := o.kubectl(ctx, 15*time.Second, "-n", ns, "get", "pvc", name, "-o", "jsonpath={.status.phase}")
|
||||||
|
if pvcErr != nil {
|
||||||
|
if isNotFoundErr(pvcErr) {
|
||||||
|
return false, fmt.Sprintf("pvc %s/%s not found", ns, name), nil
|
||||||
|
}
|
||||||
|
return false, "", fmt.Errorf("query pvc %s/%s: %w", ns, name, pvcErr)
|
||||||
|
}
|
||||||
|
if !strings.EqualFold(strings.TrimSpace(out), "Bound") {
|
||||||
|
return false, fmt.Sprintf("pvc %s/%s phase=%s", ns, name, strings.TrimSpace(out)), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, fmt.Sprintf("longhorn ready+sched nodes=%d critical pvcs bound=%d", readyNodes, len(o.cfg.Startup.StorageCriticalPVCs)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
wait := time.Duration(o.cfg.Startup.PostStartProbeWaitSeconds) * time.Second
|
||||||
|
if wait <= 0 {
|
||||||
|
wait = 240 * time.Second
|
||||||
|
}
|
||||||
|
poll := time.Duration(o.cfg.Startup.PostStartProbePollSeconds) * time.Second
|
||||||
|
if poll <= 0 {
|
||||||
|
poll = 5 * time.Second
|
||||||
|
}
|
||||||
|
deadline := time.Now().Add(wait)
|
||||||
|
lastFailure := "unknown"
|
||||||
|
for {
|
||||||
|
ok, failure := o.postStartProbesReady(ctx)
|
||||||
|
if ok {
|
||||||
|
o.log.Printf("post-start probes passed")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
lastFailure = failure
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return fmt.Errorf("startup blocked: post-start probes did not pass within %s (%s)", wait, lastFailure)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(poll):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string) {
|
||||||
|
probes := make([]string, 0, len(o.cfg.Startup.PostStartProbes))
|
||||||
|
for _, p := range o.cfg.Startup.PostStartProbes {
|
||||||
|
p = strings.TrimSpace(p)
|
||||||
|
if p != "" {
|
||||||
|
probes = append(probes, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(probes) == 0 {
|
||||||
|
return true, "no probes configured"
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, probe := range probes {
|
||||||
|
code, err := o.httpProbe(ctx, probe)
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Sprintf("%s: %v", probe, err)
|
||||||
|
}
|
||||||
|
if code < 200 || code >= 400 {
|
||||||
|
return false, fmt.Sprintf("%s: unexpected status code=%d", probe, code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true, "all probes successful"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error) {
|
||||||
|
out, err := o.run(
|
||||||
|
ctx,
|
||||||
|
20*time.Second,
|
||||||
|
"curl",
|
||||||
|
"--silent",
|
||||||
|
"--show-error",
|
||||||
|
"--location",
|
||||||
|
"--max-time",
|
||||||
|
"12",
|
||||||
|
"--output",
|
||||||
|
"/dev/null",
|
||||||
|
"--write-out",
|
||||||
|
"%{http_code}",
|
||||||
|
probeURL,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
code, convErr := strconv.Atoi(strings.TrimSpace(out))
|
||||||
|
if convErr != nil {
|
||||||
|
return 0, fmt.Errorf("parse http status %q: %w", strings.TrimSpace(out), convErr)
|
||||||
|
}
|
||||||
|
return code, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error {
|
func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error {
|
||||||
if err := o.patchFluxSuspendAll(ctx, false); err != nil {
|
if err := o.patchFluxSuspendAll(ctx, false); err != nil {
|
||||||
return err
|
return err
|
||||||
@ -1678,16 +2024,56 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) {
|
|||||||
"get", "secret", "vault-init",
|
"get", "secret", "vault-init",
|
||||||
"-o", "jsonpath={.data.unseal_key_b64}",
|
"-o", "jsonpath={.data.unseal_key_b64}",
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return "", fmt.Errorf("read vault-init secret: %w", err)
|
decoded, decodeErr := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
|
||||||
|
if decodeErr == nil {
|
||||||
|
key := strings.TrimSpace(string(decoded))
|
||||||
|
if key != "" {
|
||||||
|
o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(key) })
|
||||||
|
return key, nil
|
||||||
|
}
|
||||||
|
err = fmt.Errorf("vault-init unseal key is empty")
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("decode vault-init unseal_key_b64: %w", decodeErr)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("read vault-init secret: %w", err)
|
||||||
}
|
}
|
||||||
decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
|
|
||||||
if err != nil {
|
fallbackKey, fileErr := o.readVaultUnsealKeyFile()
|
||||||
return "", fmt.Errorf("decode vault-init unseal_key_b64: %w", err)
|
if fileErr == nil {
|
||||||
|
o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile)
|
||||||
|
return fallbackKey, nil
|
||||||
}
|
}
|
||||||
key := strings.TrimSpace(string(decoded))
|
return "", fmt.Errorf("%v; fallback %v", err, fileErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error {
|
||||||
|
path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile)
|
||||||
|
if path == "" {
|
||||||
|
return fmt.Errorf("vault unseal key file path is empty")
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
|
||||||
|
return fmt.Errorf("ensure vault unseal key dir: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(path, []byte(strings.TrimSpace(key)+"\n"), 0o600); err != nil {
|
||||||
|
return fmt.Errorf("write vault unseal key file: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) readVaultUnsealKeyFile() (string, error) {
|
||||||
|
path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile)
|
||||||
|
if path == "" {
|
||||||
|
return "", fmt.Errorf("vault unseal key file path is empty")
|
||||||
|
}
|
||||||
|
b, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("read vault unseal key file %s: %w", path, err)
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(string(b))
|
||||||
if key == "" {
|
if key == "" {
|
||||||
return "", fmt.Errorf("vault-init unseal key is empty")
|
return "", fmt.Errorf("vault unseal key file %s is empty", path)
|
||||||
}
|
}
|
||||||
return key, nil
|
return key, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package config
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"gopkg.in/yaml.v3"
|
"gopkg.in/yaml.v3"
|
||||||
)
|
)
|
||||||
@ -33,14 +34,26 @@ type Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Startup struct {
|
type Startup struct {
|
||||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||||
|
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||||
|
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||||
|
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||||
|
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||||
|
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||||
|
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||||
|
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||||
|
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||||
|
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||||
|
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||||
|
PostStartProbes []string `yaml:"post_start_probes"`
|
||||||
|
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Shutdown struct {
|
type Shutdown struct {
|
||||||
@ -162,6 +175,12 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.TimeSyncPollSeconds <= 0 {
|
if c.Startup.TimeSyncPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
if c.Startup.TimeSyncMode != "strict" && c.Startup.TimeSyncMode != "quorum" {
|
||||||
|
return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum")
|
||||||
|
}
|
||||||
|
if c.Startup.TimeSyncMode == "quorum" && c.Startup.TimeSyncQuorum <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum")
|
||||||
|
}
|
||||||
if c.Startup.EtcdRestoreControlPlane != "" {
|
if c.Startup.EtcdRestoreControlPlane != "" {
|
||||||
found := false
|
found := false
|
||||||
for _, cp := range c.ControlPlanes {
|
for _, cp := range c.ControlPlanes {
|
||||||
@ -174,6 +193,37 @@ func (c Config) Validate() error {
|
|||||||
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
|
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if c.Startup.StorageReadyWaitSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.storage_ready_wait_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.StorageReadyPollSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.storage_ready_poll_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.StorageMinReadyNodes <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.storage_min_ready_nodes must be > 0")
|
||||||
|
}
|
||||||
|
for _, pvc := range c.Startup.StorageCriticalPVCs {
|
||||||
|
if strings.Count(strings.TrimSpace(pvc), "/") != 1 {
|
||||||
|
return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if c.Startup.PostStartProbeWaitSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.post_start_probe_wait_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.PostStartProbePollSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.post_start_probe_poll_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
|
||||||
|
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
|
||||||
|
}
|
||||||
|
for _, probe := range c.Startup.PostStartProbes {
|
||||||
|
if strings.TrimSpace(probe) == "" {
|
||||||
|
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||||
|
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
|
||||||
|
}
|
||||||
if c.SSHPort <= 0 || c.SSHPort > 65535 {
|
if c.SSHPort <= 0 || c.SSHPort > 65535 {
|
||||||
return fmt.Errorf("config.ssh_port must be in range 1-65535")
|
return fmt.Errorf("config.ssh_port must be in range 1-65535")
|
||||||
}
|
}
|
||||||
@ -250,9 +300,30 @@ func defaults() Config {
|
|||||||
RequireTimeSync: true,
|
RequireTimeSync: true,
|
||||||
TimeSyncWaitSeconds: 240,
|
TimeSyncWaitSeconds: 240,
|
||||||
TimeSyncPollSeconds: 5,
|
TimeSyncPollSeconds: 5,
|
||||||
|
TimeSyncMode: "quorum",
|
||||||
|
TimeSyncQuorum: 2,
|
||||||
ReconcileAccessOnBoot: true,
|
ReconcileAccessOnBoot: true,
|
||||||
AutoEtcdRestoreOnAPIFailure: true,
|
AutoEtcdRestoreOnAPIFailure: true,
|
||||||
EtcdRestoreControlPlane: "titan-0a",
|
EtcdRestoreControlPlane: "titan-0a",
|
||||||
|
RequireStorageReady: true,
|
||||||
|
StorageReadyWaitSeconds: 420,
|
||||||
|
StorageReadyPollSeconds: 5,
|
||||||
|
StorageMinReadyNodes: 2,
|
||||||
|
StorageCriticalPVCs: []string{
|
||||||
|
"vault/data-vault-0",
|
||||||
|
"postgres/postgres-data-postgres-0",
|
||||||
|
"gitea/gitea-data",
|
||||||
|
"sso/keycloak-data",
|
||||||
|
},
|
||||||
|
RequirePostStartProbes: true,
|
||||||
|
PostStartProbeWaitSeconds: 240,
|
||||||
|
PostStartProbePollSeconds: 5,
|
||||||
|
PostStartProbes: []string{
|
||||||
|
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||||
|
"https://scm.bstein.dev/user/login",
|
||||||
|
"https://metrics.bstein.dev/login",
|
||||||
|
},
|
||||||
|
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
|
||||||
},
|
},
|
||||||
Shutdown: Shutdown{
|
Shutdown: Shutdown{
|
||||||
DefaultBudgetSeconds: 1380,
|
DefaultBudgetSeconds: 1380,
|
||||||
@ -319,9 +390,51 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.TimeSyncPollSeconds <= 0 {
|
if c.Startup.TimeSyncPollSeconds <= 0 {
|
||||||
c.Startup.TimeSyncPollSeconds = 5
|
c.Startup.TimeSyncPollSeconds = 5
|
||||||
}
|
}
|
||||||
|
if c.Startup.TimeSyncMode == "" {
|
||||||
|
c.Startup.TimeSyncMode = "quorum"
|
||||||
|
}
|
||||||
|
if c.Startup.TimeSyncQuorum <= 0 {
|
||||||
|
c.Startup.TimeSyncQuorum = 2
|
||||||
|
}
|
||||||
|
if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
|
||||||
|
c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
|
||||||
|
}
|
||||||
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
|
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
|
||||||
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
|
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
|
||||||
}
|
}
|
||||||
|
if c.Startup.StorageReadyWaitSeconds <= 0 {
|
||||||
|
c.Startup.StorageReadyWaitSeconds = 420
|
||||||
|
}
|
||||||
|
if c.Startup.StorageReadyPollSeconds <= 0 {
|
||||||
|
c.Startup.StorageReadyPollSeconds = 5
|
||||||
|
}
|
||||||
|
if c.Startup.StorageMinReadyNodes <= 0 {
|
||||||
|
c.Startup.StorageMinReadyNodes = 2
|
||||||
|
}
|
||||||
|
if len(c.Startup.StorageCriticalPVCs) == 0 {
|
||||||
|
c.Startup.StorageCriticalPVCs = []string{
|
||||||
|
"vault/data-vault-0",
|
||||||
|
"postgres/postgres-data-postgres-0",
|
||||||
|
"gitea/gitea-data",
|
||||||
|
"sso/keycloak-data",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if c.Startup.PostStartProbeWaitSeconds <= 0 {
|
||||||
|
c.Startup.PostStartProbeWaitSeconds = 240
|
||||||
|
}
|
||||||
|
if c.Startup.PostStartProbePollSeconds <= 0 {
|
||||||
|
c.Startup.PostStartProbePollSeconds = 5
|
||||||
|
}
|
||||||
|
if len(c.Startup.PostStartProbes) == 0 {
|
||||||
|
c.Startup.PostStartProbes = []string{
|
||||||
|
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||||
|
"https://scm.bstein.dev/user/login",
|
||||||
|
"https://metrics.bstein.dev/login",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||||
|
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
|
||||||
|
}
|
||||||
if c.SSHPort <= 0 {
|
if c.SSHPort <= 0 {
|
||||||
c.SSHPort = 2277
|
c.SSHPort = 2277
|
||||||
}
|
}
|
||||||
|
|||||||
@ -92,4 +92,35 @@ state:
|
|||||||
if cfg.Startup.EtcdRestoreControlPlane == "" {
|
if cfg.Startup.EtcdRestoreControlPlane == "" {
|
||||||
t.Fatalf("expected startup etcd restore control plane default to be set")
|
t.Fatalf("expected startup etcd restore control plane default to be set")
|
||||||
}
|
}
|
||||||
|
if cfg.Startup.TimeSyncMode == "" {
|
||||||
|
t.Fatalf("expected startup time sync mode default to be set")
|
||||||
|
}
|
||||||
|
if cfg.Startup.VaultUnsealKeyFile == "" {
|
||||||
|
t.Fatalf("expected startup vault unseal key file default to be set")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.TimeSyncMode = "invalid"
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for invalid time_sync_mode")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"}
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for invalid storage_critical_pvcs entry")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.RequirePostStartProbes = true
|
||||||
|
cfg.Startup.PostStartProbes = nil
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error when post start probes are required but empty")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -209,6 +209,31 @@ migrate_hecate_config() {
|
|||||||
echo "[install] added startup time sync + access reconciliation defaults"
|
echo "[install] added startup time sync + access reconciliation defaults"
|
||||||
changed=1
|
changed=1
|
||||||
fi
|
fi
|
||||||
|
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
|
||||||
|
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added startup time sync quorum defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml" \
|
||||||
|
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added startup storage readiness defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" \
|
||||||
|
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration\n - https://scm.bstein.dev/user/login\n - https://metrics.bstein.dev/login\n vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added startup post-start probe + vault key fallback defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/login$/a\ vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added startup.vault_unseal_key_file default"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
local role
|
local role
|
||||||
role="$(read_hecate_role)"
|
role="$(read_hecate_role)"
|
||||||
@ -371,14 +396,14 @@ migrate_hecate_config() {
|
|||||||
changed=1
|
changed=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
|
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/hecate.yaml"; then
|
||||||
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
|
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml"
|
||||||
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||||
changed=1
|
changed=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
|
if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
|
||||||
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
|
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml"
|
||||||
echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
|
echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||||
changed=1
|
changed=1
|
||||||
fi
|
fi
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user