diff --git a/README.md b/README.md
index 87a8560..1ada9d3 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,7 @@ Power metrics:
 - Startup can automatically invoke the same restore path after API timeout using:
   - `startup.auto_etcd_restore_on_api_failure: true`
   - `startup.etcd_restore_control_plane: `
+- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
 
 ## Disruptive startup drills
 
diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go
index 962a415..6f18e4b 100644
--- a/internal/cluster/orchestrator.go
+++ b/internal/cluster/orchestrator.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/base64"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"log"
 	"os"
@@ -74,6 +75,10 @@ var criticalStartupWorkloads = []startupWorkload{
 	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
 }
 
+// ErrEtcdRestoreNotApplicable is returned (wrapped) by EtcdRestore when the
+// target control plane is configured with an external datastore endpoint.
+var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
+
 func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
 	return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
 }
@@ -135,7 +140,13 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 		}
 		o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
 		if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
-			return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
+			if errors.Is(restoreErr, ErrEtcdRestoreNotApplicable) {
+				o.log.Printf("warning: automatic etcd restore skipped: %v", restoreErr)
+				o.log.Printf("warning: retrying control-plane start because datastore recovery path is external")
+				o.startControlPlanes(ctx, o.cfg.ControlPlanes)
+			} else {
+				return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
+			}
 		}
 		if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
 			return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
@@ -239,6 +250,13 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
 	if !o.sshManaged(controlPlane) {
 		return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
 	}
+	externalDatastore, err := o.controlPlaneUsesExternalDatastore(ctx, controlPlane)
+	if err != nil {
+		return err
+	}
+	if externalDatastore {
+		return fmt.Errorf("%w: %s uses --datastore-endpoint", ErrEtcdRestoreNotApplicable, controlPlane)
+	}
 
 	snapshotPath := strings.TrimSpace(opts.SnapshotPath)
 	if snapshotPath == "" {
@@ -831,6 +849,18 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
 	return snapshot, nil
 }
 
+// controlPlaneUsesExternalDatastore reports whether the k3s unit definition on
+// node (as printed by "systemctl cat k3s") mentions --datastore-endpoint.
+func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
+	out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
+	if err != nil {
+		return false, fmt.Errorf("inspect k3s service on %s for datastore mode: %w", node, err)
+	}
+	// Match the bare flag name so both "--datastore-endpoint=<dsn>" and the
+	// separate-argument form "--datastore-endpoint <dsn>" are detected.
+	return strings.Contains(out, "--datastore-endpoint"), nil
+}
+
 func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
 	if o.runner.DryRun {
 		return nil