diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 578db1e..bdb7bbb 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -74,11 +74,11 @@ var criticalStartupWorkloads = []startupWorkload{ {Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"}, {Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"}, {Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"}, - {Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"}, - {Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"}, {Namespace: "vault", Kind: "statefulset", Name: "vault"}, {Namespace: "postgres", Kind: "statefulset", Name: "postgres"}, {Namespace: "gitea", Kind: "deployment", Name: "gitea"}, + {Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"}, + {Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"}, } var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable") @@ -102,6 +102,15 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er StartedAt: time.Now().UTC(), } defer o.finalizeRecord(&record, &err) + resumedFlux := false + defer func() { + if o.runner.DryRun || err == nil || resumedFlux { + return + } + o.log.Printf("warning: startup failed before normal flux resume; attempting best-effort recovery resume") + o.bestEffort("restore scaled workloads after failed startup", func() error { return o.restoreScaledApps(ctx) }) + o.bestEffort("resume flux after failed startup", func() error { return o.resumeFluxAndReconcile(ctx) }) + }() if !o.runner.DryRun { currentIntent, readErr := state.ReadIntent(o.cfg.State.IntentPath) @@ -273,6 +282,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er if err := o.resumeFluxAndReconcile(ctx); err != nil { return err } + resumedFlux = true if o.cfg.Startup.RequirePostStartProbes { if err := o.waitForPostStartProbes(ctx); err != nil { return err