// File: ananke/internal/cluster/orchestrator_longhorn_recovery.go
// (255 lines, 9.9 KiB, Go)

package cluster
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
)
// repairEncryptedVolumeMountPrereqs scans cluster events for FailedMount
// warnings that mention a missing cryptsetup binary, attempts to install
// cryptsetup on each affected SSH-managed node, and returns a reason string per
// blocked pod keyed by "namespace/name".
// Signature: (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: encrypted Longhorn volume mounts depend on host cryptsetup. After node
// rebuilds or partial OS recovery, Kubernetes may be ready while kubelet cannot
// mount encrypted PVCs; installing the missing host tool and recycling the
// controller-owned pod lets kubelet retry the same volume safely.
//
// Only Pending, controller-owned pods that have a node assigned and are at
// least grace old are considered. Events whose last-observed time precedes the
// pod's creation timestamp are ignored. Each node is attempted at most once per
// call and its outcome is applied to every matching pod on that node:
//   - "EncryptedVolumeCryptsetupRepaired:<node>" when ensureHostCryptsetup succeeded,
//   - "EncryptedVolumeCryptsetupNodeCordoned:<node>" when it failed but the
//     cordon fallback succeeded,
//   - no entry when both failed, or when the node is not SSH-managed.
//
// An error is returned only when the events query or its JSON decoding fails.
func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query events for encrypted volume mount scan: %w", err)
	}
	var events eventList
	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
		return nil, fmt.Errorf("decode events for encrypted volume mount scan: %w", err)
	}

	// Index the pods that are eligible for recovery so events can be matched
	// to them by "namespace/name".
	podsByKey := map[string]podResource{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		node := strings.TrimSpace(pod.Spec.NodeName)
		if ns == "" || name == "" || node == "" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
			continue
		}
		if !podControllerOwned(pod) {
			continue
		}
		// Give freshly created pods time to mount on their own.
		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
			continue
		}
		podsByKey[ns+"/"+name] = pod
	}
	if len(podsByKey) == 0 {
		return map[string]string{}, nil
	}

	// nodeOutcome caches the reason produced by the single repair attempt on
	// each node. An empty string records that both the repair and the cordon
	// fallback failed, so the node is not retried within this call and its
	// pods receive no reason.
	nodeOutcome := map[string]string{}
	reasons := map[string]string{}
	for _, event := range events.Items {
		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
			continue
		}
		if strings.TrimSpace(event.Reason) != "FailedMount" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
			continue
		}
		key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
		pod, ok := podsByKey[key]
		if !ok {
			continue
		}
		// Skip events that predate this pod; they belong to an earlier pod
		// that happened to have the same namespace/name.
		lastSeen := eventLastObservedAt(event)
		if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
			continue
		}
		message := strings.ToLower(strings.TrimSpace(event.Message))
		if !strings.Contains(message, "cryptsetup") || !strings.Contains(message, "no such file or directory") {
			continue
		}
		node := strings.TrimSpace(pod.Spec.NodeName)
		if node == "" || !o.sshManaged(node) {
			o.log.Printf("warning: encrypted volume mount blocked on unmanaged node %s for pod %s", node, key)
			continue
		}

		outcome, attempted := nodeOutcome[node]
		if !attempted {
			if err := o.ensureHostCryptsetup(ctx, node); err != nil {
				o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err)
				if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
					o.log.Printf("warning: cordon failed after cryptsetup repair failure on %s for pod %s: %v", node, key, cordonErr)
				} else {
					outcome = "EncryptedVolumeCryptsetupNodeCordoned:" + node
				}
			} else {
				outcome = "EncryptedVolumeCryptsetupRepaired:" + node
			}
			nodeOutcome[node] = outcome
		}
		// Apply the node's outcome to every blocked pod on it, including pods
		// seen after the first attempt on a node that ended up cordoned.
		if outcome != "" {
			reasons[key] = outcome
		}
	}
	return reasons, nil
}
// ensureHostCryptsetup runs a shell script on node over SSH that exits early
// when cryptsetup is already present and otherwise installs cryptsetup-bin via
// apt-get, then re-checks for the binary.
// Signature: (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error.
// Why: kubelet's encrypted Longhorn mount helper shells into the host namespace,
// so the package must exist on the node host, not merely inside a workload pod.
//
// The script prints one of the __ANANKE_CRYPTSETUP_*__ markers and exits with
// 42 when apt-get is unavailable or 43 when the binary is still missing after
// the install. Any SSH error is wrapped together with the trimmed output. On
// success the output is logged, and a startup auto-heal note is recorded only
// when the INSTALLED marker shows that a package was actually installed.
func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error {
	const (
		// Shell test that succeeds when a cryptsetup binary is reachable.
		probe = "command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]"
		// Non-interactive, password-less apt-get invocation prefix.
		apt             = "sudo -n env DEBIAN_FRONTEND=noninteractive apt-get"
		installedMarker = "__ANANKE_CRYPTSETUP_INSTALLED__"
	)
	steps := []string{
		"set -eu",
		"if " + probe + "; then echo __ANANKE_CRYPTSETUP_PRESENT__; exit 0; fi",
		"if ! command -v apt-get >/dev/null 2>&1; then echo __ANANKE_CRYPTSETUP_NO_APT__; exit 42; fi",
		apt + " update",
		apt + " install -y --no-install-recommends cryptsetup-bin",
		"if " + probe + "; then echo " + installedMarker + "; exit 0; fi",
		"echo __ANANKE_CRYPTSETUP_INSTALL_FAILED__",
		"exit 43",
	}

	raw, err := o.sshWithTimeout(ctx, node, strings.Join(steps, "; "), 5*time.Minute)
	report := strings.TrimSpace(raw)
	if err != nil {
		return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, report)
	}

	o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, report)
	if strings.Contains(report, installedMarker) {
		o.noteStartupAutoHeal(fmt.Sprintf("installed cryptsetup on %s", node))
	}
	return nil
}
// cordonNodeForMissingCryptsetup cordons node through cordonNodeWithLease using
// the missing-cryptsetup reason, then logs the action and records a startup
// auto-heal note.
// Signature: (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error.
// Why: when host package repair is not permitted, cordoning is the safest
// automatic fallback: it prevents new encrypted-volume pods from landing on a
// node kubelet cannot mount from, while leaving existing workloads and storage
// objects untouched.
//
// The error from cordonNodeWithLease is returned unchanged; in that case
// nothing is logged or noted here.
func (o *Orchestrator) cordonNodeForMissingCryptsetup(ctx context.Context, node string) error {
	const detail = "encrypted Longhorn volume mount failed because host cryptsetup is missing"

	cordonErr := o.cordonNodeWithLease(ctx, node, cordonReasonMissingCryptsetup, detail)
	if cordonErr != nil {
		return cordonErr
	}

	o.log.Printf("cordoned node %s after encrypted volume cryptsetup prerequisite failure", node)
	note := fmt.Sprintf("cordoned %s after missing cryptsetup blocked encrypted volume mount", node)
	o.noteStartupAutoHeal(note)
	return nil
}
// longhornAttachBlockedPodReasons reports Pending, controller-owned pods that
// sit on a node Longhorn considers unready and that have a matching
// FailedAttachVolume warning event. The result maps "namespace/name" to
// "LonghornAttachBlockedOnUnreadyNode:<node>".
// Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a
// node Longhorn still marks unready. Recycling the unattached Pending pod lets
// the scheduler pick a Longhorn-ready node without touching Longhorn data-plane
// objects.
//
// An empty map is returned when Longhorn reports no unready nodes (events are
// not queried in that case) or when no pod qualifies. Pods younger than grace
// are skipped, as are events last observed before the pod was created. Errors
// come from the Longhorn readiness lookup, the events query, or JSON decoding.
func (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
	notReady, err := o.longhornUnreadyNodes(ctx)
	if err != nil {
		return nil, err
	}
	if len(notReady) == 0 {
		return map[string]string{}, nil
	}

	raw, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query events for longhorn attach-blocked pod scan: %w", err)
	}
	var clusterEvents eventList
	if decodeErr := json.Unmarshal([]byte(raw), &clusterEvents); decodeErr != nil {
		return nil, fmt.Errorf("decode events for longhorn attach-blocked pod scan: %w", decodeErr)
	}

	// Collect the pods eligible for recycling, keyed by "namespace/name".
	candidates := map[string]podResource{}
	for _, p := range pods.Items {
		namespace := strings.TrimSpace(p.Metadata.Namespace)
		podName := strings.TrimSpace(p.Metadata.Name)
		host := strings.TrimSpace(p.Spec.NodeName)
		if namespace == "" || podName == "" || host == "" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(p.Status.Phase), "Pending") {
			continue
		}
		if _, listed := notReady[host]; !listed {
			continue
		}
		if !podControllerOwned(p) {
			continue
		}
		// Leave pods inside the grace window alone.
		created := p.Metadata.CreationTimestamp
		if !created.IsZero() && time.Since(created) < grace {
			continue
		}
		candidates[namespace+"/"+podName] = p
	}
	if len(candidates) == 0 {
		return map[string]string{}, nil
	}

	blocked := map[string]string{}
	for _, ev := range clusterEvents.Items {
		isWarning := strings.EqualFold(strings.TrimSpace(ev.Type), "Warning")
		isAttachFailure := strings.TrimSpace(ev.Reason) == "FailedAttachVolume"
		aboutPod := strings.EqualFold(strings.TrimSpace(ev.InvolvedObject.Kind), "Pod")
		if !(isWarning && isAttachFailure && aboutPod) {
			continue
		}

		podKey := strings.TrimSpace(ev.InvolvedObject.Namespace) + "/" + strings.TrimSpace(ev.InvolvedObject.Name)
		target, known := candidates[podKey]
		if !known {
			continue
		}

		// Ignore events older than the pod they appear to reference.
		seenAt := eventLastObservedAt(ev)
		created := target.Metadata.CreationTimestamp
		if !seenAt.IsZero() && !created.IsZero() && seenAt.Before(created) {
			continue
		}

		// The message must come from the Longhorn backend, be a volume attach
		// failure, and name this pod's node as not ready.
		host := strings.TrimSpace(target.Spec.NodeName)
		text := strings.ToLower(strings.TrimSpace(ev.Message))
		needle := "node " + strings.ToLower(host) + " is not ready"
		matches := strings.Contains(text, "longhorn-backend") &&
			strings.Contains(text, "failed for volume") &&
			strings.Contains(text, needle)
		if !matches {
			continue
		}
		blocked[podKey] = "LonghornAttachBlockedOnUnreadyNode:" + host
	}
	return blocked, nil
}
// longhornUnreadyNodes lists nodes.longhorn.io objects in the longhorn-system
// namespace and returns the set of node names whose Ready condition status is
// anything other than "True" (case-insensitive), including nodes that report
// no Ready condition at all.
// Signature: (o *Orchestrator) longhornUnreadyNodes(ctx context.Context) (map[string]struct{}, error).
// Why: Longhorn node readiness can lag or intentionally differ from Kubernetes
// node readiness; attach recovery must use Longhorn's view for safety.
//
// A not-found error from kubectl (as classified by isNotFoundErr) yields an
// empty set and no error; any other kubectl error is wrapped and returned.
func (o *Orchestrator) longhornUnreadyNodes(ctx context.Context) (map[string]struct{}, error) {
	// One output line per node: "<name>\t<Ready status>". The status part is
	// empty when the node carries no Ready condition.
	out, err := o.kubectl(ctx, 30*time.Second,
		"-n", "longhorn-system",
		"get", "nodes.longhorn.io",
		"-o", "jsonpath={range .items[*]}{.metadata.name}{'\\t'}{range .status.conditions[?(@.type==\"Ready\")]}{.status}{end}{'\\n'}{end}",
	)
	if err != nil {
		if isNotFoundErr(err) {
			return map[string]struct{}{}, nil
		}
		return nil, fmt.Errorf("query longhorn node readiness: %w", err)
	}
	unready := map[string]struct{}{}
	for _, line := range lines(out) {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			// Blank line: no node name to record.
			continue
		}
		// A missing status means Longhorn has not published a Ready condition
		// for this node. Treat that like "False"/"Unknown" rather than
		// silently assuming the node is ready.
		status := ""
		if len(fields) > 1 {
			status = fields[1]
		}
		if !strings.EqualFold(status, "True") {
			unready[fields[0]] = struct{}{}
		}
	}
	return unready, nil
}