recovery: avoid encrypted volume nodes missing cryptsetup

This commit is contained in:
codex 2026-06-18 22:37:54 -03:00
parent 93d98e1397
commit 4151254ba1
2 changed files with 73 additions and 0 deletions

View File

@ -227,6 +227,59 @@ func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T
} }
} }
// TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T).
// Why: when host package repair is blocked by sudo policy, Ananke should avoid
// the bad node and retry the controller-owned pod elsewhere.
func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T) {
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
cordoned := false
deleted := false
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{
match: matchContains("ssh", "apt-get install -y --no-install-recommends cryptsetup-bin"),
err: errors.New("sudo: a password is required"),
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "cordon", "titan-19")(name, args) {
return false
}
cordoned = true
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
return false
}
deleted = true
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if !cordoned {
t.Fatalf("expected cryptsetup-missing node to be cordoned")
}
if !deleted {
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
}
}
// TestNewConstructsOrchestrator runs one orchestration or CLI step. // TestNewConstructsOrchestrator runs one orchestration or CLI step.
// Signature: TestNewConstructsOrchestrator(t *testing.T). // Signature: TestNewConstructsOrchestrator(t *testing.T).
// Why: covers constructor path in orchestrator core module. // Why: covers constructor path in orchestrator core module.

View File

@ -311,6 +311,11 @@ func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, po
if err := o.ensureHostCryptsetup(ctx, node); err != nil { if err := o.ensureHostCryptsetup(ctx, node); err != nil {
repairedNodes[node] = false repairedNodes[node] = false
o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err) o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err)
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
o.log.Printf("warning: cordon failed after cryptsetup repair failure on %s for pod %s: %v", node, key, cordonErr)
continue
}
reasons[key] = "EncryptedVolumeCryptsetupNodeCordoned:" + node
continue continue
} }
repairedNodes[node] = true repairedNodes[node] = true
@ -343,6 +348,21 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er
return nil return nil
} }
// cordonNodeForMissingCryptsetup cordons a node whose cryptsetup repair failed.
// Signature: (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error.
// Why: when the host package repair is not permitted, cordoning is the safest
// automatic fallback: it keeps new pods from being scheduled onto a node where
// kubelet cannot mount encrypted volumes, and it does not touch workloads or
// storage objects that already exist.
// Returns the kubectl error unchanged when the cordon fails; on success it logs
// the action, records a startup auto-heal note, and returns nil.
func (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error {
	const cordonTimeout = 30 * time.Second

	_, cordonErr := o.kubectl(ctx, cordonTimeout, "cordon", node)
	if cordonErr != nil {
		return cordonErr
	}

	healNote := fmt.Sprintf("cordoned %s after missing cryptsetup blocked encrypted volume mount", node)
	o.log.Printf("cordoned node %s after encrypted volume cryptsetup prerequisite failure", node)
	o.noteStartupAutoHeal(healNote)
	return nil
}
// longhornAttachBlockedPodReasons runs one orchestration or CLI step. // longhornAttachBlockedPodReasons runs one orchestration or CLI step.
// Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error). // Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a // Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a