hecate: handle external datastore in auto-recovery
This commit is contained in:
parent
3bcaed7ec3
commit
522df2f6e8
@ -116,6 +116,7 @@ Power metrics:
|
||||
- Startup can automatically invoke the same restore path after API timeout using:
|
||||
- `startup.auto_etcd_restore_on_api_failure: true`
|
||||
- `startup.etcd_restore_control_plane: <control-plane-node>`
|
||||
- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
|
||||
|
||||
## Disruptive startup drills
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
@ -74,6 +75,8 @@ var criticalStartupWorkloads = []startupWorkload{
|
||||
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
|
||||
}
|
||||
|
||||
// ErrEtcdRestoreNotApplicable is the sentinel wrapped into the error that
// EtcdRestore returns when the target control plane uses --datastore-endpoint.
// Callers detect it with errors.Is; Startup does so to retry control-plane
// start instead of treating the skipped restore as a failure.
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
|
||||
|
||||
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
||||
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
||||
}
|
||||
@ -135,7 +138,13 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
}
|
||||
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
|
||||
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
|
||||
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
|
||||
if errors.Is(restoreErr, ErrEtcdRestoreNotApplicable) {
|
||||
o.log.Printf("warning: automatic etcd restore skipped: %v", restoreErr)
|
||||
o.log.Printf("warning: retrying control-plane start because datastore recovery path is external")
|
||||
o.startControlPlanes(ctx, o.cfg.ControlPlanes)
|
||||
} else {
|
||||
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
|
||||
}
|
||||
}
|
||||
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
|
||||
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
|
||||
@ -239,6 +248,13 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
|
||||
if !o.sshManaged(controlPlane) {
|
||||
return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
|
||||
}
|
||||
externalDatastore, err := o.controlPlaneUsesExternalDatastore(ctx, controlPlane)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if externalDatastore {
|
||||
return fmt.Errorf("%w: %s uses --datastore-endpoint", ErrEtcdRestoreNotApplicable, controlPlane)
|
||||
}
|
||||
|
||||
snapshotPath := strings.TrimSpace(opts.SnapshotPath)
|
||||
if snapshotPath == "" {
|
||||
@ -831,6 +847,14 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
|
||||
return snapshot, nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
|
||||
out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("inspect k3s service on %s for datastore mode: %w", node, err)
|
||||
}
|
||||
return strings.Contains(out, "--datastore-endpoint="), nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user