# Patch summary (descriptive preamble; patch tools skip text before the first "diff --git" header).
#
# internal/cluster/orchestrator.go
#   * criticalStartupWorkloads gains two entries in namespace "harbor":
#     statefulset "harbor-redis" and deployment "harbor-registry".
#   * Startup calls o.uncordonWorkers(ctx, workers) right after o.startWorkers,
#     wrapped in o.bestEffort("uncordon workers", ...).
#   * drainWorkers logs "drain worker <i>/<total>: <node>" before cordoning each node.
#   * New method uncordonWorkers runs `kubectl uncordon <node>` for every worker,
#     passing 20*time.Second to o.kubectl (presumably a per-call timeout; confirm
#     against o.kubectl). A failure is logged as a warning and the loop continues.
#     NOTE(review): the method always returns nil, so the closure handed to
#     o.bestEffort never returns an error.
#
# internal/config/config.go
#   * Shutdown.DefaultBudgetSeconds default: 300 -> 1380 (in defaults() and applyDefaults()).
#   * UPS.RuntimeSafetyFactor default: 1.10 -> 1.25 (in defaults() and applyDefaults()).
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy. Leading
# indentation, intra-line alignment and the position of blank context lines are
# reconstructed, not original; if context does not match, apply with a
# whitespace-tolerant option (e.g. `git apply --ignore-whitespace`).
diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go
index 4e9a9fb..c47c02d 100644
--- a/internal/cluster/orchestrator.go
+++ b/internal/cluster/orchestrator.go
@@ -49,6 +49,8 @@ var criticalStartupWorkloads = []startupWorkload{
 	{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
 	{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
 	{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
+	{Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"},
+	{Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"},
 	{Namespace: "vault", Kind: "statefulset", Name: "vault"},
 	{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
 	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
@@ -113,6 +115,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 	}
 	o.log.Printf("startup workers=%s", strings.Join(workers, ","))
 	o.startWorkers(ctx, workers)
+	o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workers) })
 
 	if opts.ForceFluxBranch != "" {
 		patch := fmt.Sprintf(`{"spec":{"ref":{"branch":"%s"}}}`, opts.ForceFluxBranch)
@@ -350,7 +353,9 @@ func (o *Orchestrator) scaleDownApps(ctx context.Context) error {
 }
 
 func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error {
-	for _, node := range workers {
+	total := len(workers)
+	for idx, node := range workers {
+		o.log.Printf("drain worker %d/%d: %s", idx+1, total, node)
 		if _, err := o.kubectl(ctx, 20*time.Second, "cordon", node); err != nil {
 			o.log.Printf("warning: cordon %s failed: %v", node, err)
 		}
@@ -361,6 +366,15 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
 	return nil
 }
 
+func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) error {
+	for _, node := range workers {
+		if _, err := o.kubectl(ctx, 20*time.Second, "uncordon", node); err != nil {
+			o.log.Printf("warning: uncordon %s failed: %v", node, err)
+		}
+	}
+	return nil
+}
+
 func (o *Orchestrator) stopWorkers(ctx context.Context, workers []string) {
 	for _, n := range workers {
 		if !o.sshManaged(n) {
diff --git a/internal/config/config.go b/internal/config/config.go
index f1fac5a..6d7216d 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -192,7 +192,7 @@ func defaults() Config {
 			APIPollSeconds: 2,
 		},
 		Shutdown: Shutdown{
-			DefaultBudgetSeconds: 300,
+			DefaultBudgetSeconds: 1380,
 			PoweroffEnabled: true,
 			PoweroffDelaySeconds: 25,
 			PoweroffLocalHost: true,
@@ -201,7 +201,7 @@ func defaults() Config {
 			Enabled: true,
 			Provider: "nut",
 			PollSeconds: 5,
-			RuntimeSafetyFactor: 1.10,
+			RuntimeSafetyFactor: 1.25,
 			DebounceCount: 3,
 			TelemetryTimeoutSeconds: 90,
 		},
@@ -245,7 +245,7 @@ func (c *Config) applyDefaults() {
 		c.SSHPort = 2277
 	}
 	if c.Shutdown.DefaultBudgetSeconds <= 0 {
-		c.Shutdown.DefaultBudgetSeconds = 300
+		c.Shutdown.DefaultBudgetSeconds = 1380
 	}
 	if c.Shutdown.PoweroffDelaySeconds <= 0 {
 		c.Shutdown.PoweroffDelaySeconds = 25
@@ -254,7 +254,7 @@ func (c *Config) applyDefaults() {
 		c.UPS.PollSeconds = 5
 	}
 	if c.UPS.RuntimeSafetyFactor <= 0 {
-		c.UPS.RuntimeSafetyFactor = 1.10
+		c.UPS.RuntimeSafetyFactor = 1.25
 	}
 	if c.UPS.DebounceCount <= 0 {
 		c.UPS.DebounceCount = 3