hecate: handle external datastore in auto-recovery

This commit is contained in:
Brad Stein 2026-04-04 20:56:16 -03:00
parent 3bcaed7ec3
commit 522df2f6e8
2 changed files with 26 additions and 1 deletions

View File

@ -116,6 +116,7 @@ Power metrics:
- Startup can automatically invoke the same restore path after API timeout using:
- `startup.auto_etcd_restore_on_api_failure: true`
- `startup.etcd_restore_control_plane: <control-plane-node>`
- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
## Disruptive startup drills

View File

@ -4,6 +4,7 @@ import (
"context"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log"
"os"
@ -74,6 +75,8 @@ var criticalStartupWorkloads = []startupWorkload{
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
}
// ErrEtcdRestoreNotApplicable is the sentinel that EtcdRestore wraps (with %w)
// when the target control plane is found to use --datastore-endpoint, signalling
// that the etcd restore path does not apply to that node. Callers should detect
// it with errors.Is rather than by comparing error strings.
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
// New builds an Orchestrator wired to the supplied configuration, command
// runner, state store and logger. All other fields keep their zero values.
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
	orch := new(Orchestrator)
	orch.cfg = cfg
	orch.runner = runner
	orch.store = store
	orch.log = logger
	return orch
}
@ -135,8 +138,14 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
if errors.Is(restoreErr, ErrEtcdRestoreNotApplicable) {
o.log.Printf("warning: automatic etcd restore skipped: %v", restoreErr)
o.log.Printf("warning: retrying control-plane start because datastore recovery path is external")
o.startControlPlanes(ctx, o.cfg.ControlPlanes)
} else {
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
}
}
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
}
@ -239,6 +248,13 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
if !o.sshManaged(controlPlane) {
return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
}
externalDatastore, err := o.controlPlaneUsesExternalDatastore(ctx, controlPlane)
if err != nil {
return err
}
if externalDatastore {
return fmt.Errorf("%w: %s uses --datastore-endpoint", ErrEtcdRestoreNotApplicable, controlPlane)
}
snapshotPath := strings.TrimSpace(opts.SnapshotPath)
if snapshotPath == "" {
@ -831,6 +847,14 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
return snapshot, nil
}
// controlPlaneUsesExternalDatastore reports whether the k3s systemd unit on
// node passes the --datastore-endpoint flag.
//
// It reads the unit (including drop-ins) via `sudo systemctl cat k3s` over SSH
// and scans it line by line. Comment lines (starting with '#' or ';') are
// skipped so that a commented-out flag, or the file-path headers systemctl
// prints, cannot produce a false positive. The flag is matched by name only,
// so both `--datastore-endpoint=<value>` and the space-separated
// `--datastore-endpoint <value>` spellings are detected.
//
// It returns an error, wrapped with the node name, when the SSH command fails.
//
// NOTE(review): an endpoint supplied only through an environment file or a k3s
// config file would not appear in the unit text and is not detected here —
// confirm whether those deployment styles need to be covered.
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
	out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
	if err != nil {
		return false, fmt.Errorf("inspect k3s service on %s for datastore mode: %w", node, err)
	}
	for _, line := range strings.Split(out, "\n") {
		line = strings.TrimSpace(line)
		// systemd ignores lines that begin with '#' or ';'.
		if strings.HasPrefix(line, "#") || strings.HasPrefix(line, ";") {
			continue
		}
		// Match on the flag name alone: the value may follow after '=' or as
		// the next (possibly quoted) argument.
		if strings.Contains(line, "--datastore-endpoint") {
			return true, nil
		}
	}
	return false, nil
}
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
if o.runner.DryRun {
return nil