hecate: handle external datastore in auto-recovery
This commit is contained in:
parent
3bcaed7ec3
commit
522df2f6e8
@ -116,6 +116,7 @@ Power metrics:
|
|||||||
- Startup can automatically invoke the same restore path after API timeout using:
|
- Startup can automatically invoke the same restore path after API timeout using:
|
||||||
- `startup.auto_etcd_restore_on_api_failure: true`
|
- `startup.auto_etcd_restore_on_api_failure: true`
|
||||||
- `startup.etcd_restore_control_plane: <control-plane-node>`
|
- `startup.etcd_restore_control_plane: <control-plane-node>`
|
||||||
|
- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
|
||||||
|
|
||||||
## Disruptive startup drills
|
## Disruptive startup drills
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
@ -74,6 +75,8 @@ var criticalStartupWorkloads = []startupWorkload{
|
|||||||
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
|
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ErrEtcdRestoreNotApplicable is the sentinel wrapped by EtcdRestore when the
// target control plane is configured with --datastore-endpoint (an external
// datastore). Callers detect it with errors.Is; Startup uses it to skip the
// restore and retry starting the control planes instead of failing.
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
|
||||||
|
|
||||||
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
||||||
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
||||||
}
|
}
|
||||||
@ -135,7 +138,13 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
}
|
}
|
||||||
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
|
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
|
||||||
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
|
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
|
||||||
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
|
if errors.Is(restoreErr, ErrEtcdRestoreNotApplicable) {
|
||||||
|
o.log.Printf("warning: automatic etcd restore skipped: %v", restoreErr)
|
||||||
|
o.log.Printf("warning: retrying control-plane start because datastore recovery path is external")
|
||||||
|
o.startControlPlanes(ctx, o.cfg.ControlPlanes)
|
||||||
|
} else {
|
||||||
|
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
|
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
|
||||||
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
|
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
|
||||||
@ -239,6 +248,13 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
|
|||||||
if !o.sshManaged(controlPlane) {
|
if !o.sshManaged(controlPlane) {
|
||||||
return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
|
return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
|
||||||
}
|
}
|
||||||
|
externalDatastore, err := o.controlPlaneUsesExternalDatastore(ctx, controlPlane)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if externalDatastore {
|
||||||
|
return fmt.Errorf("%w: %s uses --datastore-endpoint", ErrEtcdRestoreNotApplicable, controlPlane)
|
||||||
|
}
|
||||||
|
|
||||||
snapshotPath := strings.TrimSpace(opts.SnapshotPath)
|
snapshotPath := strings.TrimSpace(opts.SnapshotPath)
|
||||||
if snapshotPath == "" {
|
if snapshotPath == "" {
|
||||||
@ -831,6 +847,14 @@ func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string)
|
|||||||
return snapshot, nil
|
return snapshot, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) controlPlaneUsesExternalDatastore(ctx context.Context, node string) (bool, error) {
|
||||||
|
out, err := o.ssh(ctx, node, "sudo systemctl cat k3s")
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("inspect k3s service on %s for datastore mode: %w", node, err)
|
||||||
|
}
|
||||||
|
return strings.Contains(out, "--datastore-endpoint="), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
|
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
|
||||||
if o.runner.DryRun {
|
if o.runner.DryRun {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user