startup: order vault before harbor and make flux resume fail-safe

This commit is contained in:
Brad Stein 2026-04-05 16:47:47 -03:00
parent 56df211261
commit 11a2f66e41

View File

@ -74,11 +74,11 @@ var criticalStartupWorkloads = []startupWorkload{
{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"}, {Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"}, {Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"}, {Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
{Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"},
{Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"},
{Namespace: "vault", Kind: "statefulset", Name: "vault"}, {Namespace: "vault", Kind: "statefulset", Name: "vault"},
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"}, {Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
{Namespace: "gitea", Kind: "deployment", Name: "gitea"}, {Namespace: "gitea", Kind: "deployment", Name: "gitea"},
{Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"},
{Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"},
} }
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable") var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
@ -102,6 +102,15 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
StartedAt: time.Now().UTC(), StartedAt: time.Now().UTC(),
} }
defer o.finalizeRecord(&record, &err) defer o.finalizeRecord(&record, &err)
resumedFlux := false
defer func() {
if o.runner.DryRun || err == nil || resumedFlux {
return
}
o.log.Printf("warning: startup failed before normal flux resume; attempting best-effort recovery resume")
o.bestEffort("restore scaled workloads after failed startup", func() error { return o.restoreScaledApps(ctx) })
o.bestEffort("resume flux after failed startup", func() error { return o.resumeFluxAndReconcile(ctx) })
}()
if !o.runner.DryRun { if !o.runner.DryRun {
currentIntent, readErr := state.ReadIntent(o.cfg.State.IntentPath) currentIntent, readErr := state.ReadIntent(o.cfg.State.IntentPath)
@ -273,6 +282,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
if err := o.resumeFluxAndReconcile(ctx); err != nil { if err := o.resumeFluxAndReconcile(ctx); err != nil {
return err return err
} }
resumedFlux = true
if o.cfg.Startup.RequirePostStartProbes { if o.cfg.Startup.RequirePostStartProbes {
if err := o.waitForPostStartProbes(ctx); err != nil { if err := o.waitForPostStartProbes(ctx); err != nil {
return err return err