recovery: avoid encrypted volume nodes missing cryptsetup
This commit is contained in:
parent
93d98e1397
commit
4151254ba1
@ -227,6 +227,59 @@ func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T
|
||||
}
|
||||
}
|
||||
|
||||
// TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails runs one orchestration or CLI step.
|
||||
// Signature: TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T).
|
||||
// Why: when host package repair is blocked by sudo policy, Ananke should avoid
|
||||
// the bad node and retry the controller-owned pod elsewhere.
|
||||
func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T) {
|
||||
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||
lastSeen := time.Now().UTC().Format(time.RFC3339)
|
||||
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
|
||||
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
|
||||
|
||||
cordoned := false
|
||||
deleted := false
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
|
||||
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
||||
{
|
||||
match: matchContains("ssh", "apt-get install -y --no-install-recommends cryptsetup-bin"),
|
||||
err: errors.New("sudo: a password is required"),
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "cordon", "titan-19")(name, args) {
|
||||
return false
|
||||
}
|
||||
cordoned = true
|
||||
return true
|
||||
},
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
|
||||
return false
|
||||
}
|
||||
deleted = true
|
||||
return true
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
||||
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
||||
}
|
||||
if !cordoned {
|
||||
t.Fatalf("expected cryptsetup-missing node to be cordoned")
|
||||
}
|
||||
if !deleted {
|
||||
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
||||
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
||||
// Why: covers constructor path in orchestrator core module.
|
||||
|
||||
@ -311,6 +311,11 @@ func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, po
|
||||
if err := o.ensureHostCryptsetup(ctx, node); err != nil {
|
||||
repairedNodes[node] = false
|
||||
o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err)
|
||||
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
|
||||
o.log.Printf("warning: cordon failed after cryptsetup repair failure on %s for pod %s: %v", node, key, cordonErr)
|
||||
continue
|
||||
}
|
||||
reasons[key] = "EncryptedVolumeCryptsetupNodeCordoned:" + node
|
||||
continue
|
||||
}
|
||||
repairedNodes[node] = true
|
||||
@ -343,6 +348,21 @@ func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) er
|
||||
return nil
|
||||
}
|
||||
|
||||
// cordonNodeForMissingCryptsetup runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error.
|
||||
// Why: when host package repair is not permitted, cordoning is the safest
|
||||
// automatic fallback: it prevents new encrypted-volume pods from landing on a
|
||||
// node kubelet cannot mount from, while leaving existing workloads and storage
|
||||
// objects untouched.
|
||||
func (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error {
|
||||
if _, err := o.kubectl(ctx, 30*time.Second, "cordon", node); err != nil {
|
||||
return err
|
||||
}
|
||||
o.log.Printf("cordoned node %s after encrypted volume cryptsetup prerequisite failure", node)
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("cordoned %s after missing cryptsetup blocked encrypted volume mount", node))
|
||||
return nil
|
||||
}
|
||||
|
||||
// longhornAttachBlockedPodReasons runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||
// Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user