diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go index 5f98094..2d17df7 100644 --- a/internal/cluster/orchestrator_unit_additional_test.go +++ b/internal/cluster/orchestrator_unit_additional_test.go @@ -227,6 +227,59 @@ func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T } } +// TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails runs one orchestration or CLI step. +// Signature: TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T). +// Why: when host package repair is blocked by sudo policy, Ananke should avoid +// the bad node and retry the controller-owned pod elsewhere. +func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T) { + created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339) + lastSeen := time.Now().UTC().Format(time.RFC3339) + pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}` + events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}` + + cordoned := false + deleted := false + orch := buildOrchestratorWithStubs(t, config.Config{ + Startup: config.Startup{StuckPodGraceSeconds: 180}, + }, []commandStub{ + {match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods}, + {match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"}, + {match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events}, + { + match: matchContains("ssh", "apt-get install -y --no-install-recommends cryptsetup-bin"), + err: errors.New("sudo: a password is required"), + }, + { + match: func(name string, args []string) bool { + if !matchContains("kubectl", "cordon", "titan-19")(name, args) { + return false + } + cordoned = true + return true + }, + }, + { + match: func(name string, args []string) bool { + if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) { + return false + } + deleted = true + return true + }, + }, + }) + + if err := orch.recycleStuckControllerPods(context.Background()); err != nil { + t.Fatalf("recycleStuckControllerPods failed: %v", err) + } + if !cordoned { + t.Fatalf("expected cryptsetup-missing node to be cordoned") + } + if !deleted { + t.Fatalf("expected encrypted-volume blocked pod to be recycled") + } +} + // TestNewConstructsOrchestrator runs one orchestration or CLI step. // Signature: TestNewConstructsOrchestrator(t *testing.T). // Why: covers constructor path in orchestrator core module. diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index 6d7eae0..2d14ac8 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -311,6 +311,11 @@ func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, po if err := o.ensureHostCryptsetup(ctx, node); err != nil { repairedNodes[node] = false o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err) + if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil { + o.log.Printf("warning: cordon failed after cryptsetup repair failure on %s for pod %s: %v", node, key, cordonErr) + continue + } + reasons[key] = "EncryptedVolumeCryptsetupNodeCordoned:" + node continue } repairedNodes[node] = true @@ -343,6 +348,21 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er return nil } +// cordonNodeForMissingCryptsetup runs one orchestration or CLI step. +// Signature: (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error. +// Why: when host package repair is not permitted, cordoning is the safest +// automatic fallback: it prevents new encrypted-volume pods from landing on a +// node kubelet cannot mount from, while leaving existing workloads and storage +// objects untouched. +func (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error { + if _, err := o.kubectl(ctx, 30*time.Second, "cordon", node); err != nil { + return err + } + o.log.Printf("cordoned node %s after encrypted volume cryptsetup prerequisite failure", node) + o.noteStartupAutoHeal(fmt.Sprintf("cordoned %s after missing cryptsetup blocked encrypted volume mount", node)) + return nil +} + // longhornAttachBlockedPodReasons runs one orchestration or CLI step. // Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error). // Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a