recovery: preflight encrypted longhorn hosts
This commit is contained in:
parent
c415516376
commit
3e337043d5
@ -145,6 +145,97 @@ func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) er
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensureLonghornEncryptedHostPrereqs runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error).
|
||||
// Why: encrypted Longhorn PVCs fail at kubelet mount time when a storage host
|
||||
// lacks host cryptsetup; startup must quarantine those nodes before workloads
|
||||
// are scheduled there.
|
||||
func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error) {
|
||||
longhornHosts, err := o.longhornHostNodes(ctx)
|
||||
if err != nil {
|
||||
return workers, err
|
||||
}
|
||||
if len(longhornHosts) == 0 {
|
||||
return workers, nil
|
||||
}
|
||||
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
unsafe := map[string]struct{}{}
|
||||
var errs []string
|
||||
for node := range longhornHosts {
|
||||
if _, skip := ignored[node]; skip {
|
||||
continue
|
||||
}
|
||||
if !o.sshManaged(node) {
|
||||
o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node)
|
||||
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s cordon after unverifiable cryptsetup prerequisite: %v", node, cordonErr))
|
||||
}
|
||||
unsafe[node] = struct{}{}
|
||||
continue
|
||||
}
|
||||
if checkErr := o.ensureHostCryptsetup(ctx, node); checkErr != nil {
|
||||
o.log.Printf("warning: keeping longhorn host %s cordoned after cryptsetup prerequisite check failed: %v", node, checkErr)
|
||||
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s cordon after cryptsetup prerequisite failure: %v", node, cordonErr))
|
||||
}
|
||||
unsafe[node] = struct{}{}
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
guarded := make([]string, 0, len(workers))
|
||||
for _, worker := range workers {
|
||||
node := strings.TrimSpace(worker)
|
||||
if node == "" {
|
||||
continue
|
||||
}
|
||||
if _, blocked := unsafe[node]; blocked {
|
||||
continue
|
||||
}
|
||||
guarded = append(guarded, node)
|
||||
}
|
||||
if len(errs) > 0 {
|
||||
return guarded, fmt.Errorf("%s", strings.Join(errs, "; "))
|
||||
}
|
||||
return guarded, nil
|
||||
}
|
||||
|
||||
// longhornHostNodes runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error).
|
||||
// Why: the live node label captures storage hosts that may not be in Ananke's
|
||||
// static worker list, so startup quarantine decisions should follow the
|
||||
// cluster's actual scheduling surface.
|
||||
func (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error) {
|
||||
out, err := o.kubectl(ctx, 20*time.Second,
|
||||
"get", "nodes",
|
||||
"-l", "longhorn-host=true",
|
||||
"-o", "jsonpath={range .items[*]}{.metadata.name}{'\\n'}{end}",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query longhorn host nodes: %w", err)
|
||||
}
|
||||
nodes := map[string]struct{}{}
|
||||
for _, line := range lines(out) {
|
||||
node := strings.TrimSpace(line)
|
||||
if node != "" {
|
||||
nodes[node] = struct{}{}
|
||||
}
|
||||
}
|
||||
if len(nodes) > 0 {
|
||||
return nodes, nil
|
||||
}
|
||||
for node, labels := range o.cfg.Startup.RequiredNodeLabels {
|
||||
if strings.EqualFold(strings.TrimSpace(labels["longhorn-host"]), "true") {
|
||||
name := strings.TrimSpace(node)
|
||||
if name != "" {
|
||||
nodes[name] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
// stopWorkers runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) stopWorkers(ctx context.Context, workers []string).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
|
||||
@ -219,7 +219,15 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) })
|
||||
}
|
||||
o.startWorkers(ctx, workers)
|
||||
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workers) })
|
||||
workersToUncordon := workers
|
||||
o.bestEffort("longhorn encrypted host prerequisites", func() error {
|
||||
guardedWorkers, err := o.ensureLonghornEncryptedHostPrereqs(ctx, workers)
|
||||
if err == nil {
|
||||
workersToUncordon = guardedWorkers
|
||||
}
|
||||
return err
|
||||
})
|
||||
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) })
|
||||
sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
|
||||
sshCheckNodes = append(sshCheckNodes, workers...)
|
||||
if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {
|
||||
|
||||
@ -369,6 +369,76 @@ func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers runs one orchestration or CLI step.
|
||||
// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
|
||||
// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
|
||||
// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
|
||||
// mount deadlock.
|
||||
func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
|
||||
cordoned := []string{}
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
SSHManagedNodes: []string{"titan-04", "titan-19"},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-19\ntitan-23\n"},
|
||||
{
|
||||
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
|
||||
out: "__ANANKE_CRYPTSETUP_PRESENT__",
|
||||
},
|
||||
{
|
||||
match: matchContains("ssh", "titan-19", "apt-get install -y --no-install-recommends cryptsetup-bin"),
|
||||
err: errors.New("sudo: a password is required"),
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "cordon")(name, args) {
|
||||
return false
|
||||
}
|
||||
if len(args) > 1 {
|
||||
cordoned = append(cordoned, args[len(args)-1])
|
||||
}
|
||||
return true
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04", "titan-19", "titan-20"})
|
||||
if err != nil {
|
||||
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
|
||||
}
|
||||
want := []string{"titan-04", "titan-20"}
|
||||
if strings.Join(got, ",") != strings.Join(want, ",") {
|
||||
t.Fatalf("guarded workers mismatch got=%v want=%v", got, want)
|
||||
}
|
||||
if strings.Join(cordoned, ",") != "titan-19,titan-23" {
|
||||
t.Fatalf("expected unsafe longhorn hosts to be cordoned, got %v", cordoned)
|
||||
}
|
||||
}
|
||||
|
||||
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
|
||||
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
|
||||
// Why: bootstrap caches or minimal test clusters can lack live labels; the
|
||||
// static startup inventory should still protect configured storage workers.
|
||||
func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{
|
||||
RequiredNodeLabels: map[string]map[string]string{
|
||||
"titan-04": {"longhorn-host": "true"},
|
||||
"titan-20": {"node-role.kubernetes.io/worker": "true"},
|
||||
},
|
||||
},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""},
|
||||
})
|
||||
|
||||
got, err := orch.longhornHostNodes(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("longhornHostNodes failed: %v", err)
|
||||
}
|
||||
if _, ok := got["titan-04"]; !ok || len(got) != 1 {
|
||||
t.Fatalf("expected configured longhorn host fallback, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
||||
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
||||
// Why: covers constructor path in orchestrator core module.
|
||||
|
||||
@ -429,8 +429,11 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er
|
||||
if err != nil {
|
||||
return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out))
|
||||
}
|
||||
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, strings.TrimSpace(out))
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node))
|
||||
trimmed := strings.TrimSpace(out)
|
||||
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, trimmed)
|
||||
if strings.Contains(trimmed, "__ANANKE_CRYPTSETUP_INSTALLED__") {
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("installed cryptsetup on %s", node))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user