recovery: preflight encrypted longhorn hosts
This commit is contained in:
parent
c415516376
commit
3e337043d5
@ -145,6 +145,97 @@ func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ensureLonghornEncryptedHostPrereqs runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error).
|
||||||
|
// Why: encrypted Longhorn PVCs fail at kubelet mount time when a storage host
|
||||||
|
// lacks host cryptsetup; startup must quarantine those nodes before workloads
|
||||||
|
// are scheduled there.
|
||||||
|
func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, workers []string) ([]string, error) {
|
||||||
|
longhornHosts, err := o.longhornHostNodes(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return workers, err
|
||||||
|
}
|
||||||
|
if len(longhornHosts) == 0 {
|
||||||
|
return workers, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
|
unsafe := map[string]struct{}{}
|
||||||
|
var errs []string
|
||||||
|
for node := range longhornHosts {
|
||||||
|
if _, skip := ignored[node]; skip {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !o.sshManaged(node) {
|
||||||
|
o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node)
|
||||||
|
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s cordon after unverifiable cryptsetup prerequisite: %v", node, cordonErr))
|
||||||
|
}
|
||||||
|
unsafe[node] = struct{}{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if checkErr := o.ensureHostCryptsetup(ctx, node); checkErr != nil {
|
||||||
|
o.log.Printf("warning: keeping longhorn host %s cordoned after cryptsetup prerequisite check failed: %v", node, checkErr)
|
||||||
|
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s cordon after cryptsetup prerequisite failure: %v", node, cordonErr))
|
||||||
|
}
|
||||||
|
unsafe[node] = struct{}{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
guarded := make([]string, 0, len(workers))
|
||||||
|
for _, worker := range workers {
|
||||||
|
node := strings.TrimSpace(worker)
|
||||||
|
if node == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, blocked := unsafe[node]; blocked {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
guarded = append(guarded, node)
|
||||||
|
}
|
||||||
|
if len(errs) > 0 {
|
||||||
|
return guarded, fmt.Errorf("%s", strings.Join(errs, "; "))
|
||||||
|
}
|
||||||
|
return guarded, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// longhornHostNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error).
|
||||||
|
// Why: the live node label captures storage hosts that may not be in Ananke's
|
||||||
|
// static worker list, so startup quarantine decisions should follow the
|
||||||
|
// cluster's actual scheduling surface.
|
||||||
|
func (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error) {
|
||||||
|
out, err := o.kubectl(ctx, 20*time.Second,
|
||||||
|
"get", "nodes",
|
||||||
|
"-l", "longhorn-host=true",
|
||||||
|
"-o", "jsonpath={range .items[*]}{.metadata.name}{'\\n'}{end}",
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("query longhorn host nodes: %w", err)
|
||||||
|
}
|
||||||
|
nodes := map[string]struct{}{}
|
||||||
|
for _, line := range lines(out) {
|
||||||
|
node := strings.TrimSpace(line)
|
||||||
|
if node != "" {
|
||||||
|
nodes[node] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(nodes) > 0 {
|
||||||
|
return nodes, nil
|
||||||
|
}
|
||||||
|
for node, labels := range o.cfg.Startup.RequiredNodeLabels {
|
||||||
|
if strings.EqualFold(strings.TrimSpace(labels["longhorn-host"]), "true") {
|
||||||
|
name := strings.TrimSpace(node)
|
||||||
|
if name != "" {
|
||||||
|
nodes[name] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nodes, nil
|
||||||
|
}
|
||||||
|
|
||||||
// stopWorkers runs one orchestration or CLI step.
|
// stopWorkers runs one orchestration or CLI step.
|
||||||
// Signature: (o *Orchestrator) stopWorkers(ctx context.Context, workers []string).
|
// Signature: (o *Orchestrator) stopWorkers(ctx context.Context, workers []string).
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
|
|||||||
@ -219,7 +219,15 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) })
|
o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) })
|
||||||
}
|
}
|
||||||
o.startWorkers(ctx, workers)
|
o.startWorkers(ctx, workers)
|
||||||
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workers) })
|
workersToUncordon := workers
|
||||||
|
o.bestEffort("longhorn encrypted host prerequisites", func() error {
|
||||||
|
guardedWorkers, err := o.ensureLonghornEncryptedHostPrereqs(ctx, workers)
|
||||||
|
if err == nil {
|
||||||
|
workersToUncordon = guardedWorkers
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) })
|
||||||
sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
|
sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
|
||||||
sshCheckNodes = append(sshCheckNodes, workers...)
|
sshCheckNodes = append(sshCheckNodes, workers...)
|
||||||
if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {
|
if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {
|
||||||
|
|||||||
@ -369,6 +369,76 @@ func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers runs one orchestration or CLI step.
|
||||||
|
// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
|
||||||
|
// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
|
||||||
|
// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
|
||||||
|
// mount deadlock.
|
||||||
|
func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
|
||||||
|
cordoned := []string{}
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||||
|
SSHManagedNodes: []string{"titan-04", "titan-19"},
|
||||||
|
}, []commandStub{
|
||||||
|
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-19\ntitan-23\n"},
|
||||||
|
{
|
||||||
|
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
|
||||||
|
out: "__ANANKE_CRYPTSETUP_PRESENT__",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("ssh", "titan-19", "apt-get install -y --no-install-recommends cryptsetup-bin"),
|
||||||
|
err: errors.New("sudo: a password is required"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: func(name string, args []string) bool {
|
||||||
|
if !matchContains("kubectl", "cordon")(name, args) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(args) > 1 {
|
||||||
|
cordoned = append(cordoned, args[len(args)-1])
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04", "titan-19", "titan-20"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
|
||||||
|
}
|
||||||
|
want := []string{"titan-04", "titan-20"}
|
||||||
|
if strings.Join(got, ",") != strings.Join(want, ",") {
|
||||||
|
t.Fatalf("guarded workers mismatch got=%v want=%v", got, want)
|
||||||
|
}
|
||||||
|
if strings.Join(cordoned, ",") != "titan-19,titan-23" {
|
||||||
|
t.Fatalf("expected unsafe longhorn hosts to be cordoned, got %v", cordoned)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
|
||||||
|
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
|
||||||
|
// Why: bootstrap caches or minimal test clusters can lack live labels; the
|
||||||
|
// static startup inventory should still protect configured storage workers.
|
||||||
|
func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||||
|
Startup: config.Startup{
|
||||||
|
RequiredNodeLabels: map[string]map[string]string{
|
||||||
|
"titan-04": {"longhorn-host": "true"},
|
||||||
|
"titan-20": {"node-role.kubernetes.io/worker": "true"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}, []commandStub{
|
||||||
|
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""},
|
||||||
|
})
|
||||||
|
|
||||||
|
got, err := orch.longhornHostNodes(context.Background())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("longhornHostNodes failed: %v", err)
|
||||||
|
}
|
||||||
|
if _, ok := got["titan-04"]; !ok || len(got) != 1 {
|
||||||
|
t.Fatalf("expected configured longhorn host fallback, got %v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
||||||
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
||||||
// Why: covers constructor path in orchestrator core module.
|
// Why: covers constructor path in orchestrator core module.
|
||||||
|
|||||||
@ -429,8 +429,11 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out))
|
return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out))
|
||||||
}
|
}
|
||||||
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, strings.TrimSpace(out))
|
trimmed := strings.TrimSpace(out)
|
||||||
o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node))
|
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, trimmed)
|
||||||
|
if strings.Contains(trimmed, "__ANANKE_CRYPTSETUP_INSTALLED__") {
|
||||||
|
o.noteStartupAutoHeal(fmt.Sprintf("installed cryptsetup on %s", node))
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user