recovery: preflight encrypted longhorn hosts

This commit is contained in:
codex 2026-06-18 23:18:31 -03:00
parent c415516376
commit 3e337043d5
4 changed files with 175 additions and 3 deletions

View File

@ -145,6 +145,97 @@ func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) er
return nil
}
// ensureLonghornEncryptedHostPrereqs checks every Longhorn storage host for the
// host cryptsetup prerequisite and returns the workers still eligible for
// uncordoning.
// Signature: (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error).
// Why: encrypted Longhorn PVCs fail at kubelet mount time when a storage host
// lacks host cryptsetup; startup must quarantine those nodes before workloads
// are scheduled there.
//
// Hosts listed in Startup.IgnoreUnavailableNodes are skipped. A host that is not
// SSH-managed, or for which ensureHostCryptsetup returns an error, is handed to
// cordonNodeForMissingCryptsetup and removed from the returned list. When host
// discovery fails or finds no hosts, workers is returned unchanged. Otherwise
// the result holds the trimmed, non-blank worker names that were not
// quarantined, in their original order. Cordon failures are joined into the
// returned error; the filtered list is still returned alongside it.
func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error) {
	hosts, err := o.longhornHostNodes(ctx)
	if err != nil {
		return workers, err
	}
	if len(hosts) == 0 {
		return workers, nil
	}

	skipped := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	quarantined := make(map[string]struct{})
	var failures []string
	for host := range hosts {
		if _, ok := skipped[host]; ok {
			continue
		}

		// A host only escapes quarantine when it is SSH-managed and the
		// cryptsetup check succeeds; every other outcome falls through to the
		// shared cordon step below.
		verifiable := o.sshManaged(host)
		if verifiable {
			checkErr := o.ensureHostCryptsetup(ctx, host)
			if checkErr == nil {
				continue
			}
			o.log.Printf("warning: keeping longhorn host %s cordoned after cryptsetup prerequisite check failed: %v", host, checkErr)
		} else {
			o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", host)
		}

		cordonErr := o.cordonNodeForMissingCryptsetup(ctx, host)
		switch {
		case cordonErr == nil:
			// Cordon succeeded; nothing to report.
		case verifiable:
			failures = append(failures, fmt.Sprintf("%s cordon after cryptsetup prerequisite failure: %v", host, cordonErr))
		default:
			failures = append(failures, fmt.Sprintf("%s cordon after unverifiable cryptsetup prerequisite: %v", host, cordonErr))
		}
		// The host is treated as unsafe even when the cordon call itself failed.
		quarantined[host] = struct{}{}
	}

	eligible := make([]string, 0, len(workers))
	for _, raw := range workers {
		name := strings.TrimSpace(raw)
		if name == "" {
			continue
		}
		if _, unsafe := quarantined[name]; !unsafe {
			eligible = append(eligible, name)
		}
	}

	if len(failures) == 0 {
		return eligible, nil
	}
	return eligible, fmt.Errorf("%s", strings.Join(failures, "; "))
}
// longhornHostNodes returns the set of node names considered Longhorn storage
// hosts.
// Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error).
// Why: the live node label captures storage hosts that may not be in Ananke's
// static worker list, so startup quarantine decisions should follow the
// cluster's actual scheduling surface.
//
// The live cluster is queried first for nodes labeled longhorn-host=true. Only
// when that query succeeds but yields no names does the result fall back to the
// nodes whose Startup.RequiredNodeLabels entry sets longhorn-host to "true"
// (case-insensitive, surrounding whitespace ignored). A failed kubectl query is
// returned as an error without consulting the configured labels.
func (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error) {
	raw, err := o.kubectl(ctx, 20*time.Second,
		"get", "nodes",
		"-l", "longhorn-host=true",
		"-o", "jsonpath={range .items[*]}{.metadata.name}{'\\n'}{end}",
	)
	if err != nil {
		return nil, fmt.Errorf("query longhorn host nodes: %w", err)
	}

	hosts := make(map[string]struct{})
	for _, entry := range lines(raw) {
		if name := strings.TrimSpace(entry); name != "" {
			hosts[name] = struct{}{}
		}
	}
	if len(hosts) != 0 {
		return hosts, nil
	}

	// No live labels were reported: use the statically configured inventory.
	for configured, labels := range o.cfg.Startup.RequiredNodeLabels {
		name := strings.TrimSpace(configured)
		if name == "" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(labels["longhorn-host"]), "true") {
			continue
		}
		hosts[name] = struct{}{}
	}
	return hosts, nil
}
// stopWorkers runs one orchestration or CLI step.
// Signature: (o *Orchestrator) stopWorkers(ctx context.Context, workers []string).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -219,7 +219,15 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) })
}
o.startWorkers(ctx, workers)
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workers) })
workersToUncordon := workers
o.bestEffort("longhorn encrypted host prerequisites", func() error {
guardedWorkers, err := o.ensureLonghornEncryptedHostPrereqs(ctx, workers)
if err == nil {
workersToUncordon = guardedWorkers
}
return err
})
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) })
sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
sshCheckNodes = append(sshCheckNodes, workers...)
if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {

View File

@ -369,6 +369,76 @@ func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
}
}
// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers checks that unsafe
// Longhorn hosts are cordoned and removed from the list of workers to uncordon.
// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
// mount deadlock.
//
// Scenario: the live label query reports titan-04, titan-19 and titan-23.
// titan-04 already has cryptsetup, the install on titan-19 fails, and titan-23
// is not in SSHManagedNodes. titan-20 is a plain worker that is not a Longhorn
// host and must pass through untouched.
func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
	var cordoned []string
	orch := buildOrchestratorWithStubs(t, config.Config{
		SSHManagedNodes: []string{"titan-04", "titan-19"},
	}, []commandStub{
		{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-19\ntitan-23\n"},
		{
			match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
			out:   "__ANANKE_CRYPTSETUP_PRESENT__",
		},
		{
			match: matchContains("ssh", "titan-19", "apt-get install -y --no-install-recommends cryptsetup-bin"),
			err:   errors.New("sudo: a password is required"),
		},
		{
			// The matcher doubles as a recorder: every matched cordon command
			// contributes its final argument to cordoned.
			match: func(name string, args []string) bool {
				if !matchContains("kubectl", "cordon")(name, args) {
					return false
				}
				if len(args) > 1 {
					cordoned = append(cordoned, args[len(args)-1])
				}
				return true
			},
		},
	})
	got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04", "titan-19", "titan-20"})
	if err != nil {
		t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
	}
	// The guarded list follows the order of the input slice, so an ordered
	// comparison is safe here.
	want := []string{"titan-04", "titan-20"}
	if strings.Join(got, ",") != strings.Join(want, ",") {
		t.Fatalf("guarded workers mismatch got=%v want=%v", got, want)
	}
	// Longhorn hosts are visited by ranging over a map, whose iteration order
	// is unspecified, so the cordoned nodes are compared as a set rather than
	// as an ordered list.
	seen := make(map[string]int, len(cordoned))
	for _, node := range cordoned {
		seen[node]++
	}
	if len(cordoned) != 2 || seen["titan-19"] != 1 || seen["titan-23"] != 1 {
		t.Fatalf("expected unsafe longhorn hosts to be cordoned, got %v", cordoned)
	}
}
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
// Why: bootstrap caches or minimal test clusters can lack live labels; the
// static startup inventory should still protect configured storage workers.
func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{
RequiredNodeLabels: map[string]map[string]string{
"titan-04": {"longhorn-host": "true"},
"titan-20": {"node-role.kubernetes.io/worker": "true"},
},
},
}, []commandStub{
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""},
})
got, err := orch.longhornHostNodes(context.Background())
if err != nil {
t.Fatalf("longhornHostNodes failed: %v", err)
}
if _, ok := got["titan-04"]; !ok || len(got) != 1 {
t.Fatalf("expected configured longhorn host fallback, got %v", got)
}
}
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
// Signature: TestNewConstructsOrchestrator(t *testing.T).
// Why: covers constructor path in orchestrator core module.

View File

@ -429,8 +429,11 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er
if err != nil {
return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out))
}
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, strings.TrimSpace(out))
o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node))
trimmed := strings.TrimSpace(out)
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, trimmed)
if strings.Contains(trimmed, "__ANANKE_CRYPTSETUP_INSTALLED__") {
o.noteStartupAutoHeal(fmt.Sprintf("installed cryptsetup on %s", node))
}
return nil
}