diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go
index 5c86add..825c2e5 100644
--- a/internal/cluster/orchestrator.go
+++ b/internal/cluster/orchestrator.go
@@ -118,7 +118,16 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 	}
 
 	if !opts.SkipLocalBootstrap && needsLocalBootstrap {
-		o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
+		if ready, err := o.waitForFluxSourceReady(ctx, 2*time.Minute); err != nil {
+			o.log.Printf("warning: flux source readiness wait failed before local bootstrap: %v", err)
+		} else if ready {
+			o.log.Printf("flux source became ready after targeted recovery; skipping local bootstrap")
+			needsLocalBootstrap = false
+		}
+	}
+
+	if !opts.SkipLocalBootstrap && needsLocalBootstrap {
+		o.log.Printf("startup bootstrap required after wait: %s", strings.Join(bootstrapReasons, "; "))
 		if err := o.bootstrapLocal(ctx); err != nil {
 			return err
 		}
@@ -388,16 +397,55 @@ func (o *Orchestrator) reportFluxSource(ctx context.Context, forceBranch string)
 }
 
+// bootstrapLocal calls o.kubectl with "apply -k" for every entry of
+// o.cfg.LocalBootstrapPaths, resolved against o.cfg.IACRepoPath. A failing
+// path is logged and skipped; an error is returned only when at least one
+// path is configured and every apply failed.
 func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
+	failures := 0
 	for _, rel := range o.cfg.LocalBootstrapPaths {
 		full := filepath.Join(o.cfg.IACRepoPath, rel)
 		o.log.Printf("local bootstrap apply -k %s", full)
 		if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
-			return fmt.Errorf("local bootstrap apply failed at %s: %w", full, err)
+			failures++
+			o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
 		}
 	}
+	// Require failures > 0: with an empty path list 0 == len(paths) would
+	// otherwise report a failure although nothing was attempted.
+	if failures > 0 && failures == len(o.cfg.LocalBootstrapPaths) {
+		return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
+	}
 	return nil
 }
 
+// waitForFluxSourceReady polls o.fluxSourceReady, pausing 5 seconds between
+// attempts, until it reports ready, fails, ctx is cancelled, or window has
+// elapsed. It returns (true, nil) immediately in dry-run mode, (false, nil)
+// when the window expires without readiness, and (false, err) on a probe
+// error or context cancellation.
+func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
+	if o.runner.DryRun {
+		return true, nil
+	}
+	deadline := time.Now().Add(window)
+	for {
+		ready, err := o.fluxSourceReady(ctx)
+		if err != nil {
+			return false, err
+		}
+		if ready {
+			return true, nil
+		}
+		if time.Now().After(deadline) {
+			return false, nil
+		}
+		select {
+		case <-ctx.Done():
+			return false, ctx.Err()
+		case <-time.After(5 * time.Second):
+		}
+	}
+}
+
 func (o *Orchestrator) resumeFluxAndReconcile(ctx context.Context) error {
 	if err := o.patchFluxSuspendAll(ctx, false); err != nil {
 		return err