recovery: repair encrypted volume mount prerequisites
This commit is contained in:
parent
904f6b1a62
commit
93d98e1397
@ -177,6 +177,56 @@ func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T
|
||||
}
|
||||
}
|
||||
|
||||
// TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup runs one orchestration or CLI step.
|
||||
// Signature: TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T).
|
||||
// Why: encrypted Longhorn PVC recovery should repair missing host cryptsetup and
|
||||
// then recycle the blocked pod without touching Longhorn data-plane objects.
|
||||
func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T) {
|
||||
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||
lastSeen := time.Now().UTC().Format(time.RFC3339)
|
||||
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
|
||||
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
|
||||
|
||||
installed := false
|
||||
deleted := false
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
|
||||
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if name != "ssh" || !strings.Contains(strings.Join(args, " "), "apt-get install -y --no-install-recommends cryptsetup-bin") {
|
||||
return false
|
||||
}
|
||||
installed = true
|
||||
return true
|
||||
},
|
||||
out: "__ANANKE_CRYPTSETUP_INSTALLED__",
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
|
||||
return false
|
||||
}
|
||||
deleted = true
|
||||
return true
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
||||
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
||||
}
|
||||
if !installed {
|
||||
t.Fatalf("expected missing host cryptsetup to be installed")
|
||||
}
|
||||
if !deleted {
|
||||
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
||||
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
||||
// Why: covers constructor path in orchestrator core module.
|
||||
|
||||
@ -176,6 +176,12 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
} else {
|
||||
longhornAttachReasons = reasons
|
||||
}
|
||||
encryptedMountReasons := map[string]string{}
|
||||
if reasons, scanErr := o.repairEncryptedVolumeMountPrereqs(ctx, list, grace); scanErr != nil {
|
||||
o.log.Printf("warning: encrypted volume mount prerequisite scan failed: %v", scanErr)
|
||||
} else {
|
||||
encryptedMountReasons = reasons
|
||||
}
|
||||
recycled := []string{}
|
||||
for _, pod := range list.Items {
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
@ -206,6 +212,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
if reason == "" {
|
||||
reason = longhornAttachReasons[ns+"/"+name]
|
||||
}
|
||||
if reason == "" {
|
||||
reason = encryptedMountReasons[ns+"/"+name]
|
||||
}
|
||||
if reason == "" {
|
||||
continue
|
||||
}
|
||||
@ -224,6 +233,116 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// repairEncryptedVolumeMountPrereqs runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||
// Why: encrypted Longhorn volume mounts depend on host cryptsetup. After node
|
||||
// rebuilds or partial OS recovery, Kubernetes may be ready while kubelet cannot
|
||||
// mount encrypted PVCs; installing the missing host tool and recycling the
|
||||
// controller-owned pod lets kubelet retry the same volume safely.
|
||||
func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
|
||||
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query events for encrypted volume mount scan: %w", err)
|
||||
}
|
||||
var events eventList
|
||||
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
|
||||
return nil, fmt.Errorf("decode events for encrypted volume mount scan: %w", err)
|
||||
}
|
||||
|
||||
podsByKey := map[string]podResource{}
|
||||
for _, pod := range pods.Items {
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
name := strings.TrimSpace(pod.Metadata.Name)
|
||||
node := strings.TrimSpace(pod.Spec.NodeName)
|
||||
if ns == "" || name == "" || node == "" {
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
|
||||
continue
|
||||
}
|
||||
if !podControllerOwned(pod) {
|
||||
continue
|
||||
}
|
||||
if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
|
||||
continue
|
||||
}
|
||||
podsByKey[ns+"/"+name] = pod
|
||||
}
|
||||
if len(podsByKey) == 0 {
|
||||
return map[string]string{}, nil
|
||||
}
|
||||
|
||||
repairedNodes := map[string]bool{}
|
||||
reasons := map[string]string{}
|
||||
for _, event := range events.Items {
|
||||
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(event.Reason) != "FailedMount" {
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
|
||||
pod, ok := podsByKey[key]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lastSeen := eventLastObservedAt(event)
|
||||
if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
|
||||
continue
|
||||
}
|
||||
message := strings.ToLower(strings.TrimSpace(event.Message))
|
||||
if !strings.Contains(message, "cryptsetup") || !strings.Contains(message, "no such file or directory") {
|
||||
continue
|
||||
}
|
||||
node := strings.TrimSpace(pod.Spec.NodeName)
|
||||
if node == "" || !o.sshManaged(node) {
|
||||
o.log.Printf("warning: encrypted volume mount blocked on unmanaged node %s for pod %s", node, key)
|
||||
continue
|
||||
}
|
||||
if repaired, ok := repairedNodes[node]; ok {
|
||||
if repaired {
|
||||
reasons[key] = "EncryptedVolumeCryptsetupRepaired:" + node
|
||||
}
|
||||
continue
|
||||
}
|
||||
if err := o.ensureHostCryptsetup(ctx, node); err != nil {
|
||||
repairedNodes[node] = false
|
||||
o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err)
|
||||
continue
|
||||
}
|
||||
repairedNodes[node] = true
|
||||
reasons[key] = "EncryptedVolumeCryptsetupRepaired:" + node
|
||||
}
|
||||
return reasons, nil
|
||||
}
|
||||
|
||||
// ensureHostCryptsetup runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error.
|
||||
// Why: kubelet's encrypted Longhorn mount helper shells into the host namespace,
|
||||
// so the package must exist on the node host, not merely inside a workload pod.
|
||||
func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error {
|
||||
command := strings.Join([]string{
|
||||
"set -eu",
|
||||
"if command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]; then echo __ANANKE_CRYPTSETUP_PRESENT__; exit 0; fi",
|
||||
"if ! command -v apt-get >/dev/null 2>&1; then echo __ANANKE_CRYPTSETUP_NO_APT__; exit 42; fi",
|
||||
"sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update",
|
||||
"sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cryptsetup-bin",
|
||||
"if command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]; then echo __ANANKE_CRYPTSETUP_INSTALLED__; exit 0; fi",
|
||||
"echo __ANANKE_CRYPTSETUP_INSTALL_FAILED__",
|
||||
"exit 43",
|
||||
}, "; ")
|
||||
out, err := o.sshWithTimeout(ctx, node, command, 5*time.Minute)
|
||||
if err != nil {
|
||||
return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out))
|
||||
}
|
||||
o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, strings.TrimSpace(out))
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node))
|
||||
return nil
|
||||
}
|
||||
|
||||
// longhornAttachBlockedPodReasons runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||
// Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user