diff --git a/internal/cluster/orchestrator_drain.go b/internal/cluster/orchestrator_drain.go index b7b5dd4..c4a2828 100644 --- a/internal/cluster/orchestrator_drain.go +++ b/internal/cluster/orchestrator_drain.go @@ -145,6 +145,97 @@ func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) er return nil } +// ensureLonghornEncryptedHostPrereqs runs one orchestration or CLI step. +// Signature: (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error). +// Why: encrypted Longhorn PVCs fail at kubelet mount time when a storage host +// lacks host cryptsetup; startup must quarantine those nodes before workloads +// are scheduled there. +func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error) { + longhornHosts, err := o.longhornHostNodes(ctx) + if err != nil { + return workers, err + } + if len(longhornHosts) == 0 { + return workers, nil + } + + ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + unsafe := map[string]struct{}{} + var errs []string + for node := range longhornHosts { + if _, skip := ignored[node]; skip { + continue + } + if !o.sshManaged(node) { + o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node) + if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil { + errs = append(errs, fmt.Sprintf("%s cordon after unverifiable cryptsetup prerequisite: %v", node, cordonErr)) + } + unsafe[node] = struct{}{} + continue + } + if checkErr := o.ensureHostCryptsetup(ctx, node); checkErr != nil { + o.log.Printf("warning: keeping longhorn host %s cordoned after cryptsetup prerequisite check failed: %v", node, checkErr) + if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil { + errs = append(errs, fmt.Sprintf("%s cordon after cryptsetup prerequisite failure: %v", node, cordonErr)) + } + 
unsafe[node] = struct{}{} + continue + } + } + + guarded := make([]string, 0, len(workers)) + for _, worker := range workers { + node := strings.TrimSpace(worker) + if node == "" { + continue + } + if _, blocked := unsafe[node]; blocked { + continue + } + guarded = append(guarded, node) + } + if len(errs) > 0 { + return guarded, fmt.Errorf("%s", strings.Join(errs, "; ")) + } + return guarded, nil +} + +// longhornHostNodes runs one orchestration or CLI step. +// Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error). +// Why: the live node label captures storage hosts that may not be in Ananke's +// static worker list, so startup quarantine decisions should follow the +// cluster's actual scheduling surface. +func (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error) { + out, err := o.kubectl(ctx, 20*time.Second, + "get", "nodes", + "-l", "longhorn-host=true", + "-o", "jsonpath={range .items[*]}{.metadata.name}{'\\n'}{end}", + ) + if err != nil { + return nil, fmt.Errorf("query longhorn host nodes: %w", err) + } + nodes := map[string]struct{}{} + for _, line := range lines(out) { + node := strings.TrimSpace(line) + if node != "" { + nodes[node] = struct{}{} + } + } + if len(nodes) > 0 { + return nodes, nil + } + for node, labels := range o.cfg.Startup.RequiredNodeLabels { + if strings.EqualFold(strings.TrimSpace(labels["longhorn-host"]), "true") { + name := strings.TrimSpace(node) + if name != "" { + nodes[name] = struct{}{} + } + } + } + return nodes, nil +} + // stopWorkers runs one orchestration or CLI step. // Signature: (o *Orchestrator) stopWorkers(ctx context.Context, workers []string). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
diff --git a/internal/cluster/orchestrator_lifecycle.go b/internal/cluster/orchestrator_lifecycle.go
index e698c8d..6ba7622 100644
--- a/internal/cluster/orchestrator_lifecycle.go
+++ b/internal/cluster/orchestrator_lifecycle.go
@@ -219,7 +219,17 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 		o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) })
 	}
 	o.startWorkers(ctx, workers)
-	o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workers) })
+	workersToUncordon := workers
+	o.bestEffort("longhorn encrypted host prerequisites", func() error {
+		// The guard hands back a usable worker list even alongside an error:
+		// the full list when the host query fails, the filtered list when only
+		// a cordon call fails. Always adopt it; falling back to the full list
+		// would uncordon hosts that just failed the cryptsetup check.
+		guardedWorkers, err := o.ensureLonghornEncryptedHostPrereqs(ctx, workers)
+		workersToUncordon = guardedWorkers
+		return err
+	})
+	o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) })
 	sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
 	sshCheckNodes = append(sshCheckNodes, workers...)
 	if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {
diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go
index 62102d0..e69a77a 100644
--- a/internal/cluster/orchestrator_unit_additional_test.go
+++ b/internal/cluster/orchestrator_unit_additional_test.go
@@ -369,6 +369,76 @@ func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
 	}
 }
 
+// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers runs one orchestration or CLI step.
+// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
+// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
+// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
+// mount deadlock.
+func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
+	cordoned := []string{}
+	orch := buildOrchestratorWithStubs(t, config.Config{
+		SSHManagedNodes: []string{"titan-04", "titan-19"},
+	}, []commandStub{
+		{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-19\ntitan-23\n"},
+		{
+			match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
+			out:   "__ANANKE_CRYPTSETUP_PRESENT__",
+		},
+		{
+			match: matchContains("ssh", "titan-19", "apt-get install -y --no-install-recommends cryptsetup-bin"),
+			err:   errors.New("sudo: a password is required"),
+		},
+		{
+			match: func(name string, args []string) bool {
+				if !matchContains("kubectl", "cordon")(name, args) {
+					return false
+				}
+				if len(args) > 1 {
+					cordoned = append(cordoned, args[len(args)-1])
+				}
+				return true
+			},
+		},
+	})
+
+	got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04", "titan-19", "titan-20"})
+	if err != nil {
+		t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
+	}
+	want := []string{"titan-04", "titan-20"}
+	if strings.Join(got, ",") != strings.Join(want, ",") {
+		t.Fatalf("guarded workers mismatch got=%v want=%v", got, want)
+	}
+	if joined := strings.Join(cordoned, ","); joined != "titan-19,titan-23" && joined != "titan-23,titan-19" { // hosts are visited in map order, which Go leaves unspecified
+		t.Fatalf("expected unsafe longhorn hosts to be cordoned, got %v", cordoned)
+	}
+}
+
+// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
+// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
+// Why: bootstrap caches or minimal test clusters can lack live labels; the
+// static startup inventory should still protect configured storage workers.
+func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) { + orch := buildOrchestratorWithStubs(t, config.Config{ + Startup: config.Startup{ + RequiredNodeLabels: map[string]map[string]string{ + "titan-04": {"longhorn-host": "true"}, + "titan-20": {"node-role.kubernetes.io/worker": "true"}, + }, + }, + }, []commandStub{ + {match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""}, + }) + + got, err := orch.longhornHostNodes(context.Background()) + if err != nil { + t.Fatalf("longhornHostNodes failed: %v", err) + } + if _, ok := got["titan-04"]; !ok || len(got) != 1 { + t.Fatalf("expected configured longhorn host fallback, got %v", got) + } +} + // TestNewConstructsOrchestrator runs one orchestration or CLI step. // Signature: TestNewConstructsOrchestrator(t *testing.T). // Why: covers constructor path in orchestrator core module. diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index 11738c5..c24cbee 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -429,8 +429,11 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er if err != nil { return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out)) } - o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, strings.TrimSpace(out)) - o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node)) + trimmed := strings.TrimSpace(out) + o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, trimmed) + if strings.Contains(trimmed, "__ANANKE_CRYPTSETUP_INSTALLED__") { + o.noteStartupAutoHeal(fmt.Sprintf("installed cryptsetup on %s", node)) + } return nil }