From 9031e09f4e16d435decfae670e694a9d4923c12c Mon Sep 17 00:00:00 2001 From: codex Date: Fri, 19 Jun 2026 04:03:37 -0300 Subject: [PATCH] recovery: exempt veles longhorn host from cryptsetup guard --- configs/ananke.example.yaml | 1 + configs/ananke.tethys.yaml | 2 + configs/ananke.titan-db.yaml | 2 + internal/cluster/orchestrator_drain.go | 39 ++++++++++ internal/cluster/orchestrator_lifecycle.go | 3 + .../orchestrator_unit_additional_test.go | 76 ++++++++++++++++++- internal/config/apply_defaults.go | 3 + internal/config/defaults.go | 1 + internal/config/types.go | 1 + internal/config/validate.go | 5 ++ internal/config/validate_matrix_test.go | 1 + scripts/install-config-migration.sh | 6 ++ 12 files changed, 139 insertions(+), 1 deletion(-) diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml index fa5ae18..1ce7c00 100644 --- a/configs/ananke.example.yaml +++ b/configs/ananke.example.yaml @@ -52,6 +52,7 @@ startup: node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 node_inventory_reachability_required_nodes: [] + longhorn_cryptsetup_exempt_nodes: [] required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index 75ff3e4..068e0e4 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -121,6 +121,8 @@ startup: - titan-0a - titan-0b - titan-0c + longhorn_cryptsetup_exempt_nodes: + - titan-23 required_node_labels: titan-04: node-role.kubernetes.io/worker: "true" diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml index c056618..64954bc 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -121,6 +121,8 @@ startup: - titan-0a - titan-0b - titan-0c + longhorn_cryptsetup_exempt_nodes: + - titan-23 required_node_labels: titan-04: node-role.kubernetes.io/worker: "true" diff --git a/internal/cluster/orchestrator_drain.go b/internal/cluster/orchestrator_drain.go index c4a2828..c4dff26 100644 --- a/internal/cluster/orchestrator_drain.go +++ b/internal/cluster/orchestrator_drain.go @@ -3,6 +3,7 @@ package cluster import ( "context" "fmt" + "sort" "strings" "sync" "time" @@ -160,12 +161,17 @@ func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, w } ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + exempt := makeStringSet(o.cfg.Startup.LonghornCryptsetupExemptNodes) unsafe := map[string]struct{}{} var errs []string for node := range longhornHosts { if _, skip := ignored[node]; skip { continue } + if _, skip := exempt[node]; skip { + o.log.Printf("skip cryptsetup preflight on longhorn host %s: configured exemption", node) + continue + } if !o.sshManaged(node) { o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node) if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil { @@ -201,6 +207,39 @@ func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, w return guarded, nil } +// uncordonLonghornCryptsetupExemptNodes runs one orchestration or CLI step. +// Signature: (o *Orchestrator) uncordonLonghornCryptsetupExemptNodes(ctx context.Context) error. +// Why: special Longhorn hosts that only run non-encrypted local workloads, such +// as Veles/Oceanus, must recover from stale cordons without weakening the +// encrypted-volume guard for normal storage workers. +func (o *Orchestrator) uncordonLonghornCryptsetupExemptNodes(ctx context.Context) error { + exempt := makeStringSet(o.cfg.Startup.LonghornCryptsetupExemptNodes) + if len(exempt) == 0 { + return nil + } + longhornHosts, err := o.longhornHostNodes(ctx) + if err != nil { + return err + } + ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + nodes := make([]string, 0, len(exempt)) + for node := range exempt { + if _, skip := ignored[node]; skip { + continue + } + if _, ok := longhornHosts[node]; ok { + nodes = append(nodes, node) + } + } + sort.Strings(nodes) + if len(nodes) == 0 { + return nil + } + o.log.Printf("uncordon longhorn cryptsetup-exempt hosts: %s", strings.Join(nodes, ",")) + o.noteStartupAutoHeal(fmt.Sprintf("uncordoned longhorn cryptsetup-exempt hosts: %s", strings.Join(nodes, ","))) + return o.uncordonWorkers(ctx, nodes) +} + // longhornHostNodes runs one orchestration or CLI step. // Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error). // Why: the live node label captures storage hosts that may not be in Ananke's diff --git a/internal/cluster/orchestrator_lifecycle.go b/internal/cluster/orchestrator_lifecycle.go index 6ba7622..fa9640f 100644 --- a/internal/cluster/orchestrator_lifecycle.go +++ b/internal/cluster/orchestrator_lifecycle.go @@ -228,6 +228,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er return err }) o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) }) + o.bestEffort("uncordon longhorn cryptsetup-exempt hosts", func() error { + return o.uncordonLonghornCryptsetupExemptNodes(ctx) + }) sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...) sshCheckNodes = append(sshCheckNodes, workers...) if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil { diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go index e69a77a..05ff6ed 100644 --- a/internal/cluster/orchestrator_unit_additional_test.go +++ b/internal/cluster/orchestrator_unit_additional_test.go @@ -390,7 +390,7 @@ func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) { }, { match: func(name string, args []string) bool { - if !matchContains("kubectl", "cordon")(name, args) { + if name != "kubectl" || len(args) == 0 || args[0] != "cordon" { return false } if len(args) > 1 { @@ -414,6 +414,80 @@ func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) { } } +// TestLonghornCryptsetupExemptNodesAreNotQuarantined runs one orchestration or CLI step. +// Signature: TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T). +// Why: Veles/Oceanus uses titan-23 as a Longhorn host for unencrypted local +// volumes; startup should uncordon that policy-exempt node without requiring +// host SSH or weakening encrypted-volume safety on other workers. +func TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T) { + cordoned := []string{} + uncordoned := []string{} + sshTitan23 := false + orch := buildOrchestratorWithStubs(t, config.Config{ + SSHManagedNodes: []string{"titan-04"}, + Startup: config.Startup{ + LonghornCryptsetupExemptNodes: []string{"titan-23"}, + }, + }, []commandStub{ + {match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-23\n"}, + { + match: matchContains("ssh", "titan-04", "command -v cryptsetup"), + out: "__ANANKE_CRYPTSETUP_PRESENT__", + }, + { + match: func(name string, args []string) bool { + if name == "ssh" && strings.Contains(strings.Join(args, " "), "titan-23") { + sshTitan23 = true + return true + } + return false + }, + }, + { + match: func(name string, args []string) bool { + if name != "kubectl" || len(args) == 0 || args[0] != "cordon" { + return false + } + if len(args) > 1 { + cordoned = append(cordoned, args[len(args)-1]) + } + return true + }, + }, + { + match: func(name string, args []string) bool { + if !matchContains("kubectl", "uncordon")(name, args) { + return false + } + if len(args) > 1 { + uncordoned = append(uncordoned, args[len(args)-1]) + } + return true + }, + }, + }) + + got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04"}) + if err != nil { + t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err) + } + if strings.Join(got, ",") != "titan-04" { + t.Fatalf("guarded workers mismatch got=%v", got) + } + if err := orch.uncordonLonghornCryptsetupExemptNodes(context.Background()); err != nil { + t.Fatalf("uncordonLonghornCryptsetupExemptNodes failed: %v", err) + } + if sshTitan23 { + t.Fatalf("did not expect cryptsetup SSH check for exempt titan-23") + } + if len(cordoned) != 0 { + t.Fatalf("did not expect exempt node to be cordoned, got %v", cordoned) + } + if strings.Join(uncordoned, ",") != "titan-23" { + t.Fatalf("expected exempt titan-23 to be uncordoned, got %v", uncordoned) + } +} + // TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step. // Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T). // Why: bootstrap caches or minimal test clusters can lack live labels; the diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go index 62fd730..408a1e7 100644 --- a/internal/config/apply_defaults.go +++ b/internal/config/apply_defaults.go @@ -192,6 +192,9 @@ func (c *Config) applyDefaults() { if c.Startup.IgnoreUnavailableNodes == nil { c.Startup.IgnoreUnavailableNodes = []string{} } + if c.Startup.LonghornCryptsetupExemptNodes == nil { + c.Startup.LonghornCryptsetupExemptNodes = []string{} + } if c.Startup.StuckPodGraceSeconds <= 0 { c.Startup.StuckPodGraceSeconds = 180 } diff --git a/internal/config/defaults.go b/internal/config/defaults.go index 5fb4eef..04fd065 100644 --- a/internal/config/defaults.go +++ b/internal/config/defaults.go @@ -46,6 +46,7 @@ func defaults() Config { NodeInventoryReachWaitSeconds: 300, NodeInventoryReachPollSeconds: 5, NodeInventoryReachRequiredNodes: []string{}, + LonghornCryptsetupExemptNodes: []string{}, RequireTimeSync: true, TimeSyncWaitSeconds: 240, TimeSyncPollSeconds: 5, diff --git a/internal/config/types.go b/internal/config/types.go index cccef00..7ebe94f 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -36,6 +36,7 @@ type Startup struct { NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"` + LonghornCryptsetupExemptNodes []string `yaml:"longhorn_cryptsetup_exempt_nodes"` RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` RequireTimeSync bool `yaml:"require_time_sync"` TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` diff --git a/internal/config/validate.go b/internal/config/validate.go index 6dcc34d..7ed707d 100644 --- a/internal/config/validate.go +++ b/internal/config/validate.go @@ -66,6 +66,11 @@ func (c Config) Validate() error { return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty") } } + for _, node := range c.Startup.LonghornCryptsetupExemptNodes { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.longhorn_cryptsetup_exempt_nodes entries must not be empty") + } + } for node, labels := range c.Startup.RequiredNodeLabels { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.required_node_labels keys must not be empty") diff --git a/internal/config/validate_matrix_test.go b/internal/config/validate_matrix_test.go index da77bb9..58b95cb 100644 --- a/internal/config/validate_matrix_test.go +++ b/internal/config/validate_matrix_test.go @@ -31,6 +31,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) { {"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }}, {"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }}, {"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }}, + {"bad_empty_longhorn_cryptsetup_exempt_node", func(c *Config) { c.Startup.LonghornCryptsetupExemptNodes = []string{"titan-23", ""} }}, {"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }}, {"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }}, {"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }}, diff --git a/scripts/install-config-migration.sh b/scripts/install-config-migration.sh index 44ac206..29ce0af 100755 --- a/scripts/install-config-migration.sh +++ b/scripts/install-config-migration.sh @@ -77,6 +77,12 @@ migrate_ananke_config() { echo "[install] added startup node inventory reachability gate defaults" changed=1 fi + if grep -Eq '^ required_node_labels:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \ + && ! grep -Eq '^ longhorn_cryptsetup_exempt_nodes:' "${CONF_DIR}/ananke.yaml"; then + sed -Ei '/^ required_node_labels:[[:space:]]*$/i\ longhorn_cryptsetup_exempt_nodes:\n - titan-23' "${CONF_DIR}/ananke.yaml" + echo "[install] added startup.longhorn_cryptsetup_exempt_nodes default (titan-23)" + changed=1 + fi if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \ && ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"