hecate: harden startup with storage gates and fallback cache
parent a05973bf2b
commit 72d33bc2ce
@@ -48,9 +48,28 @@ startup:
  require_time_sync: true
  time_sync_wait_seconds: 240
  time_sync_poll_seconds: 5
+ time_sync_mode: quorum
+ time_sync_quorum: 2
  reconcile_access_on_boot: true
  auto_etcd_restore_on_api_failure: true
  etcd_restore_control_plane: titan-0a
+ require_storage_ready: true
+ storage_ready_wait_seconds: 420
+ storage_ready_poll_seconds: 5
+ storage_min_ready_nodes: 2
+ storage_critical_pvcs:
+ - vault/data-vault-0
+ - postgres/postgres-data-postgres-0
+ - gitea/gitea-data
+ - sso/keycloak-data
+ require_post_start_probes: true
+ post_start_probe_wait_seconds: 240
+ post_start_probe_poll_seconds: 5
+ post_start_probes:
+ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
+ - https://scm.bstein.dev/user/login
+ - https://metrics.bstein.dev/login
+ vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
 shutdown:
  default_budget_seconds: 1380
  history_min_samples: 3
@@ -114,9 +114,28 @@ startup:
  require_time_sync: true
  time_sync_wait_seconds: 240
  time_sync_poll_seconds: 5
+ time_sync_mode: quorum
+ time_sync_quorum: 2
  reconcile_access_on_boot: true
  auto_etcd_restore_on_api_failure: true
  etcd_restore_control_plane: titan-0a
+ require_storage_ready: true
+ storage_ready_wait_seconds: 420
+ storage_ready_poll_seconds: 5
+ storage_min_ready_nodes: 2
+ storage_critical_pvcs:
+ - vault/data-vault-0
+ - postgres/postgres-data-postgres-0
+ - gitea/gitea-data
+ - sso/keycloak-data
+ require_post_start_probes: true
+ post_start_probe_wait_seconds: 240
+ post_start_probe_poll_seconds: 5
+ post_start_probes:
+ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
+ - https://scm.bstein.dev/user/login
+ - https://metrics.bstein.dev/login
+ vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
 shutdown:
  default_budget_seconds: 1380
  history_min_samples: 3
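Editor's note, not part of the diff: both config hunks seed the same new startup keys. For orientation, a minimal, hypothetical sketch of how a few of them unmarshal into the Startup struct this commit adds to package config (same yaml tags as in the struct hunk below; the sample document and field subset are invented here):

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Subset of config.Startup with the yaml tags added by this commit.
type Startup struct {
	RequireStorageReady  bool     `yaml:"require_storage_ready"`
	StorageMinReadyNodes int      `yaml:"storage_min_ready_nodes"`
	StorageCriticalPVCs  []string `yaml:"storage_critical_pvcs"`
}

func main() {
	doc := []byte("require_storage_ready: true\nstorage_min_ready_nodes: 2\nstorage_critical_pvcs:\n - vault/data-vault-0\n")
	var s Startup
	if err := yaml.Unmarshal(doc, &s); err != nil {
		panic(err)
	}
	// Prints: {RequireStorageReady:true StorageMinReadyNodes:2 StorageCriticalPVCs:[vault/data-vault-0]}
	fmt.Printf("%+v\n", s)
}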
@@ -141,6 +141,8 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err error) {
 	if err := o.preflightExternalDatastore(ctx); err != nil {
 		return err
 	}
+	o.bestEffort("sync local titan-iac checkout", func() error { return o.syncLocalIACRepo(ctx) })
+	o.bestEffort("refresh bootstrap cache from local repo", func() error { return o.refreshBootstrapCache(ctx) })
 	if o.cfg.Startup.ReconcileAccessOnBoot {
 		o.bestEffort("reconcile control-plane access", func() error { return o.reconcileNodeAccess(ctx, o.cfg.ControlPlanes) })
 	}
@@ -217,6 +219,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err error) {
 	if len(missing) > 0 {
 		o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
 	}
+	if o.cfg.Startup.RequireStorageReady {
+		if err := o.waitForStorageReady(ctx); err != nil {
+			return err
+		}
+	}
 	if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
 		return err
 	}
@@ -252,6 +259,11 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err error) {
 	if err := o.resumeFluxAndReconcile(ctx); err != nil {
 		return err
 	}
+	if o.cfg.Startup.RequirePostStartProbes {
+		if err := o.waitForPostStartProbes(ctx); err != nil {
+			return err
+		}
+	}
 	o.log.Printf("startup flow complete")
 	return nil
 }
@@ -923,11 +935,40 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) error {
 	if poll <= 0 {
 		poll = 5 * time.Second
 	}
+
+	mode := strings.ToLower(strings.TrimSpace(o.cfg.Startup.TimeSyncMode))
+	if mode == "" {
+		mode = "strict"
+	}
+	managedControlPlanes := 0
+	for _, node := range nodes {
+		node = strings.TrimSpace(node)
+		if node == "" {
+			continue
+		}
+		if o.sshManaged(node) {
+			managedControlPlanes++
+		}
+	}
+	requiredQuorum := o.cfg.Startup.TimeSyncQuorum
+	if requiredQuorum <= 0 {
+		requiredQuorum = managedControlPlanes
+		if requiredQuorum <= 0 {
+			requiredQuorum = 1
+		}
+	}
+	if requiredQuorum > managedControlPlanes && managedControlPlanes > 0 {
+		requiredQuorum = managedControlPlanes
+	}
+
 	deadline := time.Now().Add(wait)
 	for {
 		unsynced := []string{}
+		syncedControlPlanes := 0
+		checkedControlPlanes := 0
 		localOut, localErr := o.run(ctx, 10*time.Second, "sh", "-lc", "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
-		if localErr != nil || !isTimeSynced(localOut) {
+		localSynced := localErr == nil && isTimeSynced(localOut)
+		if !localSynced {
 			if localErr != nil {
 				unsynced = append(unsynced, fmt.Sprintf("local(%v)", localErr))
 			} else {
@@ -942,6 +983,7 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) error {
 			if !o.sshManaged(node) {
 				continue
 			}
+			checkedControlPlanes++
 			out, err := o.ssh(ctx, node, "timedatectl show -p NTPSynchronized --value 2>/dev/null || echo unknown")
 			if err != nil || !isTimeSynced(out) {
 				if err != nil {
@@ -949,12 +991,38 @@ func (o *Orchestrator) waitForTimeSync(ctx context.Context, nodes []string) error {
 				} else {
 					unsynced = append(unsynced, fmt.Sprintf("%s(%s)", node, strings.TrimSpace(out)))
 				}
+			} else {
+				syncedControlPlanes++
 			}
 		}
-		if len(unsynced) == 0 {
+
+		ready := false
+		switch mode {
+		case "quorum":
+			if localSynced && syncedControlPlanes >= requiredQuorum {
+				ready = true
+			}
+		default:
+			if localSynced && len(unsynced) == 0 {
+				ready = true
+			}
+		}
+
+		if ready {
 			return nil
 		}
 		if time.Now().After(deadline) {
+			if mode == "quorum" {
+				return fmt.Errorf(
+					"startup blocked: time sync quorum not ready within %s (mode=quorum local_synced=%t synced_control_planes=%d required=%d checked=%d details=%s)",
+					wait,
+					localSynced,
+					syncedControlPlanes,
+					requiredQuorum,
+					checkedControlPlanes,
+					strings.Join(unsynced, ", "),
+				)
+			}
 			return fmt.Errorf("startup blocked: time sync not ready within %s (%s)", wait, strings.Join(unsynced, ", "))
 		}
 		select {
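Editor's note, not part of the diff: read as a pure predicate, the per-poll decision above reduces to "quorum mode needs a synced local clock plus at least time_sync_quorum synced managed control planes; the default strict mode still demands zero unsynced nodes". A hedged distillation (timeSyncReady is a hypothetical helper, not in the commit):

package main

import "fmt"

// Hypothetical distillation of the readiness decision in waitForTimeSync.
func timeSyncReady(mode string, localSynced bool, syncedControlPlanes, requiredQuorum, unsyncedCount int) bool {
	switch mode {
	case "quorum":
		// Enough managed control planes agree and the local clock is synced.
		return localSynced && syncedControlPlanes >= requiredQuorum
	default: // "strict"
		// Every checked node, local included, reports NTPSynchronized=yes.
		return localSynced && unsyncedCount == 0
	}
}

func main() {
	fmt.Println(timeSyncReady("quorum", true, 2, 2, 1)) // true: quorum met despite one laggard
	fmt.Println(timeSyncReady("strict", true, 2, 2, 1)) // false: strict wants zero unsynced
}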
@@ -1183,23 +1251,30 @@ func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) error {
 
 func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
 	failures := 0
+	successes := 0
 	for _, rel := range o.cfg.LocalBootstrapPaths {
 		full := filepath.Join(o.cfg.IACRepoPath, rel)
-		o.log.Printf("local bootstrap apply -k %s", full)
+		o.log.Printf("local bootstrap apply rel=%s path=%s", rel, full)
 		if o.runner.DryRun {
+			successes++
 			continue
 		}
 		if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
 			o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
 			o.log.Printf("local bootstrap fallback render/apply with LoadRestrictionsNone for %s", full)
 			if fallbackErr := o.applyKustomizeFallback(ctx, full); fallbackErr != nil {
-				failures++
 				o.log.Printf("warning: local bootstrap fallback failed at %s: %v", full, fallbackErr)
-				continue
+				o.log.Printf("local bootstrap cache apply for rel=%s", rel)
+				if cacheErr := o.applyBootstrapCache(ctx, rel); cacheErr != nil {
+					failures++
+					o.log.Printf("warning: local bootstrap cache apply failed for rel=%s: %v", rel, cacheErr)
+					continue
+				}
 			}
 		}
+		successes++
 	}
-	if failures == len(o.cfg.LocalBootstrapPaths) {
+	if failures > 0 && successes == 0 {
 		return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
 	}
 	return nil
@@ -1213,6 +1288,99 @@ func (o *Orchestrator) applyKustomizeFallback(ctx context.Context, full string) error {
 	return nil
 }
 
+func (o *Orchestrator) syncLocalIACRepo(ctx context.Context) error {
+	repo := strings.TrimSpace(o.cfg.IACRepoPath)
+	if repo == "" {
+		return fmt.Errorf("iac repo path is empty")
+	}
+	gitDir := filepath.Join(repo, ".git")
+	if stat, err := os.Stat(gitDir); err != nil || stat.IsDir() == false {
+		return fmt.Errorf("iac repo %s is not a git checkout", repo)
+	}
+	statusOut, statusErr := o.runSensitive(ctx, 10*time.Second, "git", "-C", repo, "status", "--porcelain")
+	if statusErr != nil {
+		return fmt.Errorf("inspect iac repo working tree: %w", statusErr)
+	}
+	if strings.TrimSpace(statusOut) != "" {
+		o.log.Printf("warning: skipping local titan-iac sync because working tree is dirty")
+		return nil
+	}
+	branch := strings.TrimSpace(o.cfg.ExpectedFluxBranch)
+	if branch == "" {
+		branch = "main"
+	}
+	if _, err := o.runSensitive(ctx, 45*time.Second, "git", "-C", repo, "fetch", "origin", "--prune"); err != nil {
+		return fmt.Errorf("git fetch origin: %w", err)
+	}
+	if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "checkout", branch); err != nil {
+		return fmt.Errorf("git checkout %s: %w", branch, err)
+	}
+	if _, err := o.runSensitive(ctx, 20*time.Second, "git", "-C", repo, "reset", "--hard", "origin/"+branch); err != nil {
+		return fmt.Errorf("git reset --hard origin/%s: %w", branch, err)
+	}
+	return nil
+}
+
+func (o *Orchestrator) refreshBootstrapCache(ctx context.Context) error {
+	if len(o.cfg.LocalBootstrapPaths) == 0 {
+		return nil
+	}
+	if err := os.MkdirAll(o.bootstrapCacheDir(), 0o755); err != nil {
+		return fmt.Errorf("ensure bootstrap cache dir: %w", err)
+	}
+	rendered := 0
+	for _, rel := range o.cfg.LocalBootstrapPaths {
+		rel = strings.TrimSpace(rel)
+		if rel == "" {
+			continue
+		}
+		full := filepath.Join(o.cfg.IACRepoPath, rel)
+		if stat, err := os.Stat(full); err != nil || !stat.IsDir() {
+			o.log.Printf("warning: skip bootstrap cache render for rel=%s (path missing)", rel)
+			continue
+		}
+		cmd := fmt.Sprintf("kubectl kustomize --load-restrictor=LoadRestrictionsNone %q", full)
+		manifest, err := o.runSensitive(ctx, 2*time.Minute, "sh", "-lc", cmd)
+		if err != nil {
+			o.log.Printf("warning: bootstrap cache render failed for rel=%s: %v", rel, err)
+			continue
+		}
+		cachePath := o.bootstrapCachePath(rel)
+		if err := os.WriteFile(cachePath, []byte(manifest+"\n"), 0o644); err != nil {
+			o.log.Printf("warning: bootstrap cache write failed for rel=%s path=%s: %v", rel, cachePath, err)
+			continue
+		}
+		rendered++
+	}
+	if rendered == 0 {
+		return fmt.Errorf("no bootstrap cache manifests rendered")
+	}
+	o.log.Printf("bootstrap cache refreshed (%d paths)", rendered)
+	return nil
+}
+
+func (o *Orchestrator) applyBootstrapCache(ctx context.Context, rel string) error {
+	cachePath := o.bootstrapCachePath(rel)
+	if _, err := os.Stat(cachePath); err != nil {
+		return fmt.Errorf("bootstrap cache missing at %s: %w", cachePath, err)
+	}
+	if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-f", cachePath); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (o *Orchestrator) bootstrapCacheDir() string {
+	return filepath.Join(o.cfg.State.Dir, "bootstrap-cache")
+}
+
+func (o *Orchestrator) bootstrapCachePath(rel string) string {
+	safe := strings.TrimSpace(rel)
+	safe = strings.ReplaceAll(safe, "/", "__")
+	safe = strings.ReplaceAll(safe, string(os.PathSeparator), "__")
+	return filepath.Join(o.bootstrapCacheDir(), safe+".yaml")
+}
+
 func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
 	if o.runner.DryRun {
 		return true, nil
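Editor's note, not part of the diff: bootstrapCachePath flattens a repo-relative path into a single cache filename by replacing path separators with "__". A quick illustration (values invented):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// Mirrors bootstrapCachePath above; stateDir stands in for o.cfg.State.Dir.
func cachePath(stateDir, rel string) string {
	safe := strings.ReplaceAll(strings.TrimSpace(rel), "/", "__")
	return filepath.Join(stateDir, "bootstrap-cache", safe+".yaml")
}

func main() {
	// Prints: /var/lib/hecate/bootstrap-cache/infrastructure__vault-csi.yaml
	fmt.Println(cachePath("/var/lib/hecate", "infrastructure/vault-csi"))
}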
@@ -1237,6 +1405,184 @@ func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
 	}
 }
 
+func (o *Orchestrator) waitForStorageReady(ctx context.Context) error {
+	if o.runner.DryRun {
+		return nil
+	}
+	wait := time.Duration(o.cfg.Startup.StorageReadyWaitSeconds) * time.Second
+	if wait <= 0 {
+		wait = 420 * time.Second
+	}
+	poll := time.Duration(o.cfg.Startup.StorageReadyPollSeconds) * time.Second
+	if poll <= 0 {
+		poll = 5 * time.Second
+	}
+	deadline := time.Now().Add(wait)
+	lastReason := "unknown"
+	for {
+		ok, reason, err := o.storageReady(ctx)
+		if err != nil {
+			lastReason = err.Error()
+		} else {
+			lastReason = reason
+		}
+		if ok {
+			o.log.Printf("storage readiness check passed (%s)", reason)
+			return nil
+		}
+		if time.Now().After(deadline) {
+			return fmt.Errorf("startup blocked: storage readiness not satisfied within %s (%s)", wait, lastReason)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(poll):
+		}
+	}
+}
+
+func (o *Orchestrator) storageReady(ctx context.Context) (bool, string, error) {
+	minReady := o.cfg.Startup.StorageMinReadyNodes
+	if minReady <= 0 {
+		minReady = 2
+	}
+	longhornOut, err := o.kubectl(
+		ctx,
+		15*time.Second,
+		"-n",
+		"longhorn-system",
+		"get",
+		"nodes.longhorn.io",
+		"-o",
+		`jsonpath={range .items[*]}{.metadata.name}{":"}{.status.conditions[?(@.type=="Ready")].status}{":"}{.status.conditions[?(@.type=="Schedulable")].status}{"\n"}{end}`,
+	)
+	if err != nil {
+		return false, "", fmt.Errorf("query longhorn nodes: %w", err)
+	}
+	readyNodes := 0
+	for _, line := range lines(longhornOut) {
+		parts := strings.Split(line, ":")
+		if len(parts) < 3 {
+			continue
+		}
+		ready := strings.EqualFold(strings.TrimSpace(parts[1]), "true")
+		sched := strings.EqualFold(strings.TrimSpace(parts[2]), "true")
+		if ready && sched {
+			readyNodes++
+		}
+	}
+	if readyNodes < minReady {
+		return false, fmt.Sprintf("longhorn ready+sched nodes %d/%d", readyNodes, minReady), nil
+	}
+
+	for _, item := range o.cfg.Startup.StorageCriticalPVCs {
+		item = strings.TrimSpace(item)
+		if item == "" {
+			continue
+		}
+		parts := strings.SplitN(item, "/", 2)
+		if len(parts) != 2 {
+			return false, "", fmt.Errorf("invalid storage_critical_pvcs entry %q", item)
+		}
+		ns := strings.TrimSpace(parts[0])
+		name := strings.TrimSpace(parts[1])
+		out, pvcErr := o.kubectl(ctx, 15*time.Second, "-n", ns, "get", "pvc", name, "-o", "jsonpath={.status.phase}")
+		if pvcErr != nil {
+			if isNotFoundErr(pvcErr) {
+				return false, fmt.Sprintf("pvc %s/%s not found", ns, name), nil
+			}
+			return false, "", fmt.Errorf("query pvc %s/%s: %w", ns, name, pvcErr)
+		}
+		if !strings.EqualFold(strings.TrimSpace(out), "Bound") {
+			return false, fmt.Sprintf("pvc %s/%s phase=%s", ns, name, strings.TrimSpace(out)), nil
+		}
+	}
+
+	return true, fmt.Sprintf("longhorn ready+sched nodes=%d critical pvcs bound=%d", readyNodes, len(o.cfg.Startup.StorageCriticalPVCs)), nil
+}
+
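Editor's note, not part of the diff: the jsonpath above emits one name:ready:schedulable line per Longhorn node, and a node only counts toward storage_min_ready_nodes when both conditions are True. A small sketch of that parse with invented sample output:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical kubectl output; titan-0b is Ready but not Schedulable.
	out := "titan-0a:True:True\ntitan-0b:True:False\ntitan-1a:True:True\n"
	readyNodes := 0
	for _, line := range strings.Split(strings.TrimSpace(out), "\n") {
		parts := strings.Split(line, ":")
		if len(parts) < 3 {
			continue
		}
		if strings.EqualFold(strings.TrimSpace(parts[1]), "true") && strings.EqualFold(strings.TrimSpace(parts[2]), "true") {
			readyNodes++
		}
	}
	fmt.Println(readyNodes) // 2
}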
+func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error {
+	if o.runner.DryRun {
+		return nil
+	}
+	wait := time.Duration(o.cfg.Startup.PostStartProbeWaitSeconds) * time.Second
+	if wait <= 0 {
+		wait = 240 * time.Second
+	}
+	poll := time.Duration(o.cfg.Startup.PostStartProbePollSeconds) * time.Second
+	if poll <= 0 {
+		poll = 5 * time.Second
+	}
+	deadline := time.Now().Add(wait)
+	lastFailure := "unknown"
+	for {
+		ok, failure := o.postStartProbesReady(ctx)
+		if ok {
+			o.log.Printf("post-start probes passed")
+			return nil
+		}
+		lastFailure = failure
+		if time.Now().After(deadline) {
+			return fmt.Errorf("startup blocked: post-start probes did not pass within %s (%s)", wait, lastFailure)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(poll):
+		}
+	}
+}
+
+func (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string) {
+	probes := make([]string, 0, len(o.cfg.Startup.PostStartProbes))
+	for _, p := range o.cfg.Startup.PostStartProbes {
+		p = strings.TrimSpace(p)
+		if p != "" {
+			probes = append(probes, p)
+		}
+	}
+	if len(probes) == 0 {
+		return true, "no probes configured"
+	}
+
+	for _, probe := range probes {
+		code, err := o.httpProbe(ctx, probe)
+		if err != nil {
+			return false, fmt.Sprintf("%s: %v", probe, err)
+		}
+		if code < 200 || code >= 400 {
+			return false, fmt.Sprintf("%s: unexpected status code=%d", probe, code)
+		}
+	}
+	return true, "all probes successful"
+}
+
+func (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error) {
+	out, err := o.run(
+		ctx,
+		20*time.Second,
+		"curl",
+		"--silent",
+		"--show-error",
+		"--location",
+		"--max-time",
+		"12",
+		"--output",
+		"/dev/null",
+		"--write-out",
+		"%{http_code}",
+		probeURL,
+	)
+	if err != nil {
+		return 0, err
+	}
+	code, convErr := strconv.Atoi(strings.TrimSpace(out))
+	if convErr != nil {
+		return 0, fmt.Errorf("parse http status %q: %w", strings.TrimSpace(out), convErr)
+	}
+	return code, nil
+}
+
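Editor's note, not part of the diff: httpProbe shells out to curl and accepts any 2xx/3xx final status (after --location redirects). For comparison only, and not what the commit does, the same gate can be expressed with net/http from the standard library:

package main

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

func probe(ctx context.Context, url string) (int, error) {
	ctx, cancel := context.WithTimeout(ctx, 12*time.Second)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return 0, err
	}
	// http.DefaultClient follows redirects, like curl --location, so the
	// status observed here is the final one.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()
	return resp.StatusCode, nil
}

func main() {
	code, err := probe(context.Background(), "https://metrics.bstein.dev/login")
	ok := err == nil && code >= 200 && code < 400
	fmt.Println(code, ok)
}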
 func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error {
 	if err := o.patchFluxSuspendAll(ctx, false); err != nil {
 		return err
@@ -1678,16 +2024,56 @@ func (o *Orchestrator) vaultUnsealKey(ctx context.Context) (string, error) {
 		"get", "secret", "vault-init",
 		"-o", "jsonpath={.data.unseal_key_b64}",
 	)
-	if err != nil {
-		return "", fmt.Errorf("read vault-init secret: %w", err)
-	}
-	decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
-	if err != nil {
-		return "", fmt.Errorf("decode vault-init unseal_key_b64: %w", err)
-	}
-	key := strings.TrimSpace(string(decoded))
-	if key == "" {
-		return "", fmt.Errorf("vault-init unseal key is empty")
-	}
-	return key, nil
+	if err == nil {
+		decoded, decodeErr := base64.StdEncoding.DecodeString(strings.TrimSpace(out))
+		if decodeErr == nil {
+			key := strings.TrimSpace(string(decoded))
+			if key != "" {
+				o.bestEffort("cache vault unseal key locally", func() error { return o.writeVaultUnsealKeyFile(key) })
+				return key, nil
+			}
+			err = fmt.Errorf("vault-init unseal key is empty")
+		} else {
+			err = fmt.Errorf("decode vault-init unseal_key_b64: %w", decodeErr)
+		}
+	} else {
+		err = fmt.Errorf("read vault-init secret: %w", err)
+	}
+
+	fallbackKey, fileErr := o.readVaultUnsealKeyFile()
+	if fileErr == nil {
+		o.log.Printf("warning: using cached vault unseal key from %s", o.cfg.Startup.VaultUnsealKeyFile)
+		return fallbackKey, nil
+	}
+	return "", fmt.Errorf("%v; fallback %v", err, fileErr)
 }
+
+func (o *Orchestrator) writeVaultUnsealKeyFile(key string) error {
+	path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile)
+	if path == "" {
+		return fmt.Errorf("vault unseal key file path is empty")
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
+		return fmt.Errorf("ensure vault unseal key dir: %w", err)
+	}
+	if err := os.WriteFile(path, []byte(strings.TrimSpace(key)+"\n"), 0o600); err != nil {
+		return fmt.Errorf("write vault unseal key file: %w", err)
+	}
+	return nil
+}
+
+func (o *Orchestrator) readVaultUnsealKeyFile() (string, error) {
+	path := strings.TrimSpace(o.cfg.Startup.VaultUnsealKeyFile)
+	if path == "" {
+		return "", fmt.Errorf("vault unseal key file path is empty")
+	}
+	b, err := os.ReadFile(path)
+	if err != nil {
+		return "", fmt.Errorf("read vault unseal key file %s: %w", path, err)
+	}
+	key := strings.TrimSpace(string(b))
+	if key == "" {
+		return "", fmt.Errorf("vault unseal key file %s is empty", path)
+	}
+	return key, nil
+}
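Editor's note, not part of the diff: the rewritten vaultUnsealKey is effectively a read-through cache — prefer the in-cluster vault-init secret, refresh the on-disk copy on success, and fall back to the cached file when the API read or decode fails. The shape, distilled with hypothetical stand-ins for the kubectl read and the file cache:

package main

import (
	"fmt"
	"strings"
)

func unsealKey(fetch, readCache func() (string, error), writeCache func(string) error) (string, error) {
	key, err := fetch()
	if err == nil && strings.TrimSpace(key) != "" {
		_ = writeCache(key) // best-effort cache refresh on the happy path
		return key, nil
	}
	if err == nil {
		err = fmt.Errorf("unseal key is empty")
	}
	cached, cacheErr := readCache()
	if cacheErr == nil {
		return cached, nil
	}
	return "", fmt.Errorf("%v; fallback %v", err, cacheErr)
}

func main() {
	key, err := unsealKey(
		func() (string, error) { return "", fmt.Errorf("apiserver unreachable") },
		func() (string, error) { return "cached-key", nil },
		func(string) error { return nil },
	)
	fmt.Println(key, err) // cached-key <nil>: the cached key rescues startup
}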
@@ -3,6 +3,7 @@ package config
 import (
 	"fmt"
 	"os"
+	"strings"
 
 	"gopkg.in/yaml.v3"
 )
@@ -33,14 +34,26 @@ type Config struct {
 }
 
 type Startup struct {
-	APIWaitSeconds              int    `yaml:"api_wait_seconds"`
-	APIPollSeconds              int    `yaml:"api_poll_seconds"`
-	RequireTimeSync             bool   `yaml:"require_time_sync"`
-	TimeSyncWaitSeconds         int    `yaml:"time_sync_wait_seconds"`
-	TimeSyncPollSeconds         int    `yaml:"time_sync_poll_seconds"`
-	ReconcileAccessOnBoot       bool   `yaml:"reconcile_access_on_boot"`
-	AutoEtcdRestoreOnAPIFailure bool   `yaml:"auto_etcd_restore_on_api_failure"`
-	EtcdRestoreControlPlane     string `yaml:"etcd_restore_control_plane"`
+	APIWaitSeconds              int      `yaml:"api_wait_seconds"`
+	APIPollSeconds              int      `yaml:"api_poll_seconds"`
+	RequireTimeSync             bool     `yaml:"require_time_sync"`
+	TimeSyncWaitSeconds         int      `yaml:"time_sync_wait_seconds"`
+	TimeSyncPollSeconds         int      `yaml:"time_sync_poll_seconds"`
+	TimeSyncMode                string   `yaml:"time_sync_mode"`
+	TimeSyncQuorum              int      `yaml:"time_sync_quorum"`
+	ReconcileAccessOnBoot       bool     `yaml:"reconcile_access_on_boot"`
+	AutoEtcdRestoreOnAPIFailure bool     `yaml:"auto_etcd_restore_on_api_failure"`
+	EtcdRestoreControlPlane     string   `yaml:"etcd_restore_control_plane"`
+	RequireStorageReady         bool     `yaml:"require_storage_ready"`
+	StorageReadyWaitSeconds     int      `yaml:"storage_ready_wait_seconds"`
+	StorageReadyPollSeconds     int      `yaml:"storage_ready_poll_seconds"`
+	StorageMinReadyNodes        int      `yaml:"storage_min_ready_nodes"`
+	StorageCriticalPVCs         []string `yaml:"storage_critical_pvcs"`
+	RequirePostStartProbes      bool     `yaml:"require_post_start_probes"`
+	PostStartProbeWaitSeconds   int      `yaml:"post_start_probe_wait_seconds"`
+	PostStartProbePollSeconds   int      `yaml:"post_start_probe_poll_seconds"`
+	PostStartProbes             []string `yaml:"post_start_probes"`
+	VaultUnsealKeyFile          string   `yaml:"vault_unseal_key_file"`
 }
 
 type Shutdown struct {
@@ -162,6 +175,12 @@ func (c Config) Validate() error {
 	if c.Startup.TimeSyncPollSeconds <= 0 {
 		return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0")
 	}
+	if c.Startup.TimeSyncMode != "strict" && c.Startup.TimeSyncMode != "quorum" {
+		return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum")
+	}
+	if c.Startup.TimeSyncMode == "quorum" && c.Startup.TimeSyncQuorum <= 0 {
+		return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum")
+	}
 	if c.Startup.EtcdRestoreControlPlane != "" {
 		found := false
 		for _, cp := range c.ControlPlanes {
@@ -174,6 +193,37 @@ func (c Config) Validate() error {
 			return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
 		}
 	}
+	if c.Startup.StorageReadyWaitSeconds <= 0 {
+		return fmt.Errorf("config.startup.storage_ready_wait_seconds must be > 0")
+	}
+	if c.Startup.StorageReadyPollSeconds <= 0 {
+		return fmt.Errorf("config.startup.storage_ready_poll_seconds must be > 0")
+	}
+	if c.Startup.StorageMinReadyNodes <= 0 {
+		return fmt.Errorf("config.startup.storage_min_ready_nodes must be > 0")
+	}
+	for _, pvc := range c.Startup.StorageCriticalPVCs {
+		if strings.Count(strings.TrimSpace(pvc), "/") != 1 {
+			return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc)
+		}
+	}
+	if c.Startup.PostStartProbeWaitSeconds <= 0 {
+		return fmt.Errorf("config.startup.post_start_probe_wait_seconds must be > 0")
+	}
+	if c.Startup.PostStartProbePollSeconds <= 0 {
+		return fmt.Errorf("config.startup.post_start_probe_poll_seconds must be > 0")
+	}
+	if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
+		return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
+	}
+	for _, probe := range c.Startup.PostStartProbes {
+		if strings.TrimSpace(probe) == "" {
+			return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
+		}
+	}
+	if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
+		return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
+	}
 	if c.SSHPort <= 0 || c.SSHPort > 65535 {
 		return fmt.Errorf("config.ssh_port must be in range 1-65535")
 	}
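Editor's note, not part of the diff: the storage_critical_pvcs format check accepts exactly one "/" separating namespace and name. Illustration (inputs invented):

package main

import (
	"fmt"
	"strings"
)

// Same rule Validate applies to each storage_critical_pvcs entry.
func validPVCRef(s string) bool {
	return strings.Count(strings.TrimSpace(s), "/") == 1
}

func main() {
	fmt.Println(validPVCRef("vault/data-vault-0")) // true
	fmt.Println(validPVCRef("vault-data-vault-0")) // false: no namespace
	fmt.Println(validPVCRef("ns/a/b"))             // false: ambiguous
}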
@@ -250,9 +300,30 @@ func defaults() Config {
 			RequireTimeSync:             true,
 			TimeSyncWaitSeconds:         240,
 			TimeSyncPollSeconds:         5,
+			TimeSyncMode:                "quorum",
+			TimeSyncQuorum:              2,
 			ReconcileAccessOnBoot:       true,
 			AutoEtcdRestoreOnAPIFailure: true,
 			EtcdRestoreControlPlane:     "titan-0a",
+			RequireStorageReady:         true,
+			StorageReadyWaitSeconds:     420,
+			StorageReadyPollSeconds:     5,
+			StorageMinReadyNodes:        2,
+			StorageCriticalPVCs: []string{
+				"vault/data-vault-0",
+				"postgres/postgres-data-postgres-0",
+				"gitea/gitea-data",
+				"sso/keycloak-data",
+			},
+			RequirePostStartProbes:    true,
+			PostStartProbeWaitSeconds: 240,
+			PostStartProbePollSeconds: 5,
+			PostStartProbes: []string{
+				"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
+				"https://scm.bstein.dev/user/login",
+				"https://metrics.bstein.dev/login",
+			},
+			VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
 		},
 		Shutdown: Shutdown{
 			DefaultBudgetSeconds: 1380,
@@ -319,9 +390,51 @@ func (c *Config) applyDefaults() {
 	if c.Startup.TimeSyncPollSeconds <= 0 {
 		c.Startup.TimeSyncPollSeconds = 5
 	}
+	if c.Startup.TimeSyncMode == "" {
+		c.Startup.TimeSyncMode = "quorum"
+	}
+	if c.Startup.TimeSyncQuorum <= 0 {
+		c.Startup.TimeSyncQuorum = 2
+	}
+	if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
+		c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
+	}
 	if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
 		c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
 	}
+	if c.Startup.StorageReadyWaitSeconds <= 0 {
+		c.Startup.StorageReadyWaitSeconds = 420
+	}
+	if c.Startup.StorageReadyPollSeconds <= 0 {
+		c.Startup.StorageReadyPollSeconds = 5
+	}
+	if c.Startup.StorageMinReadyNodes <= 0 {
+		c.Startup.StorageMinReadyNodes = 2
+	}
+	if len(c.Startup.StorageCriticalPVCs) == 0 {
+		c.Startup.StorageCriticalPVCs = []string{
+			"vault/data-vault-0",
+			"postgres/postgres-data-postgres-0",
+			"gitea/gitea-data",
+			"sso/keycloak-data",
+		}
+	}
+	if c.Startup.PostStartProbeWaitSeconds <= 0 {
+		c.Startup.PostStartProbeWaitSeconds = 240
+	}
+	if c.Startup.PostStartProbePollSeconds <= 0 {
+		c.Startup.PostStartProbePollSeconds = 5
+	}
+	if len(c.Startup.PostStartProbes) == 0 {
+		c.Startup.PostStartProbes = []string{
+			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
+			"https://scm.bstein.dev/user/login",
+			"https://metrics.bstein.dev/login",
+		}
+	}
+	if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
+		c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
+	}
 	if c.SSHPort <= 0 {
 		c.SSHPort = 2277
 	}
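Editor's note, not part of the diff: note the clamp in applyDefaults — time_sync_quorum can never exceed the configured control-plane count, so a single-control-plane cluster degrades to a quorum of one rather than deadlocking at boot. Sketch (values invented):

package main

import "fmt"

// Mirrors the default-then-clamp sequence in applyDefaults.
func effectiveQuorum(configured int, controlPlanes []string) int {
	if configured <= 0 {
		configured = 2 // package default
	}
	if configured > len(controlPlanes) && len(controlPlanes) > 0 {
		configured = len(controlPlanes)
	}
	return configured
}

func main() {
	fmt.Println(effectiveQuorum(0, []string{"titan-0a"}))             // 1
	fmt.Println(effectiveQuorum(0, []string{"titan-0a", "titan-0b"})) // 2
}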
@@ -92,4 +92,35 @@ state:
 	if cfg.Startup.EtcdRestoreControlPlane == "" {
 		t.Fatalf("expected startup etcd restore control plane default to be set")
 	}
+	if cfg.Startup.TimeSyncMode == "" {
+		t.Fatalf("expected startup time sync mode default to be set")
+	}
+	if cfg.Startup.VaultUnsealKeyFile == "" {
+		t.Fatalf("expected startup vault unseal key file default to be set")
+	}
 }
+
+func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
+	cfg := defaults()
+	cfg.Startup.TimeSyncMode = "invalid"
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error for invalid time_sync_mode")
+	}
+}
+
+func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
+	cfg := defaults()
+	cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"}
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error for invalid storage_critical_pvcs entry")
+	}
+}
+
+func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
+	cfg := defaults()
+	cfg.Startup.RequirePostStartProbes = true
+	cfg.Startup.PostStartProbes = nil
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error when post start probes are required but empty")
+	}
+}
@@ -209,6 +209,31 @@ migrate_hecate_config() {
     echo "[install] added startup time sync + access reconciliation defaults"
     changed=1
   fi
+  if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
+    && ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/hecate.yaml"; then
+    sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/hecate.yaml"
+    echo "[install] added startup time sync quorum defaults"
+    changed=1
+  fi
+  if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml" \
+    && ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
+    sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/hecate.yaml"
+    echo "[install] added startup storage readiness defaults"
+    changed=1
+  fi
+  if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" \
+    && ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
+    sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration\n - https://scm.bstein.dev/user/login\n - https://metrics.bstein.dev/login\n vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"
+    echo "[install] added startup post-start probe + vault key fallback defaults"
+    changed=1
+  fi
+  if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"; then
+    if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/hecate.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/hecate.yaml"; then
+      sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/login$/a\ vault_unseal_key_file: /var/lib/hecate/vault-unseal.key' "${CONF_DIR}/hecate.yaml"
+      echo "[install] added startup.vault_unseal_key_file default"
+      changed=1
+    fi
+  fi
 
   local role
   role="$(read_hecate_role)"
@@ -371,14 +396,14 @@ migrate_hecate_config() {
     changed=1
   fi
 
-  if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
-    perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
+  if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/hecate.yaml"; then
+    perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml"
     echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
     changed=1
   fi
 
   if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
-    perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
+    perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/hecate.yaml"
     echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
     changed=1
   fi