recovery: exempt veles longhorn host from cryptsetup guard

This commit is contained in:
codex 2026-06-19 04:03:37 -03:00
parent 54f0b29bce
commit 9031e09f4e
12 changed files with 139 additions and 1 deletions

View File

@ -52,6 +52,7 @@ startup:
node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5 node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes: [] node_inventory_reachability_required_nodes: []
longhorn_cryptsetup_exempt_nodes: []
required_node_labels: required_node_labels:
titan-09: titan-09:
ananke.bstein.dev/harbor-bootstrap: "true" ananke.bstein.dev/harbor-bootstrap: "true"

View File

@ -121,6 +121,8 @@ startup:
- titan-0a - titan-0a
- titan-0b - titan-0b
- titan-0c - titan-0c
longhorn_cryptsetup_exempt_nodes:
- titan-23
required_node_labels: required_node_labels:
titan-04: titan-04:
node-role.kubernetes.io/worker: "true" node-role.kubernetes.io/worker: "true"

View File

@ -121,6 +121,8 @@ startup:
- titan-0a - titan-0a
- titan-0b - titan-0b
- titan-0c - titan-0c
longhorn_cryptsetup_exempt_nodes:
- titan-23
required_node_labels: required_node_labels:
titan-04: titan-04:
node-role.kubernetes.io/worker: "true" node-role.kubernetes.io/worker: "true"

View File

@ -3,6 +3,7 @@ package cluster
import ( import (
"context" "context"
"fmt" "fmt"
"sort"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -160,12 +161,17 @@ func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, w
} }
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
exempt := makeStringSet(o.cfg.Startup.LonghornCryptsetupExemptNodes)
unsafe := map[string]struct{}{} unsafe := map[string]struct{}{}
var errs []string var errs []string
for node := range longhornHosts { for node := range longhornHosts {
if _, skip := ignored[node]; skip { if _, skip := ignored[node]; skip {
continue continue
} }
if _, skip := exempt[node]; skip {
o.log.Printf("skip cryptsetup preflight on longhorn host %s: configured exemption", node)
continue
}
if !o.sshManaged(node) { if !o.sshManaged(node) {
o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node) o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node)
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil { if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
@ -201,6 +207,39 @@ func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, w
return guarded, nil return guarded, nil
} }
// uncordonLonghornCryptsetupExemptNodes uncordons every configured
// cryptsetup-exempt node that is currently reported as a Longhorn host.
// Signature: (o *Orchestrator) uncordonLonghornCryptsetupExemptNodes(ctx context.Context) error.
// Why: special Longhorn hosts that only run non-encrypted local workloads, such
// as Veles/Oceanus, must recover from stale cordons without weakening the
// encrypted-volume guard for normal storage workers.
//
// Nodes listed in Startup.IgnoreUnavailableNodes are skipped. The function
// returns nil without issuing an uncordon when no exemption is configured or
// when none of the exempt nodes is a Longhorn host. The startup auto-heal note
// is recorded only after the uncordon call succeeds, so it never claims a
// recovery that did not happen.
func (o *Orchestrator) uncordonLonghornCryptsetupExemptNodes(ctx context.Context) error {
	exempt := makeStringSet(o.cfg.Startup.LonghornCryptsetupExemptNodes)
	if len(exempt) == 0 {
		// Nothing is exempt, so avoid the Longhorn host lookup entirely.
		return nil
	}
	longhornHosts, err := o.longhornHostNodes(ctx)
	if err != nil {
		return fmt.Errorf("list longhorn hosts for cryptsetup exemptions: %w", err)
	}
	ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	nodes := make([]string, 0, len(exempt))
	for node := range exempt {
		if _, skip := ignored[node]; skip {
			continue
		}
		// Only act on exemptions that match a node currently reported as a
		// Longhorn host; other entries are left alone.
		if _, ok := longhornHosts[node]; ok {
			nodes = append(nodes, node)
		}
	}
	if len(nodes) == 0 {
		return nil
	}
	// Map iteration order is random; sort so the log line, the auto-heal note
	// and the uncordon order are stable between runs.
	sort.Strings(nodes)
	joined := strings.Join(nodes, ",")
	o.log.Printf("uncordon longhorn cryptsetup-exempt hosts: %s", joined)
	if err := o.uncordonWorkers(ctx, nodes); err != nil {
		return err
	}
	// Record the heal only once it has actually been applied.
	o.noteStartupAutoHeal(fmt.Sprintf("uncordoned longhorn cryptsetup-exempt hosts: %s", joined))
	return nil
}
// longhornHostNodes runs one orchestration or CLI step. // longhornHostNodes runs one orchestration or CLI step.
// Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error). // Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error).
// Why: the live node label captures storage hosts that may not be in Ananke's // Why: the live node label captures storage hosts that may not be in Ananke's

View File

@ -228,6 +228,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return err return err
}) })
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) }) o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) })
o.bestEffort("uncordon longhorn cryptsetup-exempt hosts", func() error {
return o.uncordonLonghornCryptsetupExemptNodes(ctx)
})
sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...) sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
sshCheckNodes = append(sshCheckNodes, workers...) sshCheckNodes = append(sshCheckNodes, workers...)
if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil { if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {

View File

@ -390,7 +390,7 @@ func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
}, },
{ {
match: func(name string, args []string) bool { match: func(name string, args []string) bool {
if !matchContains("kubectl", "cordon")(name, args) { if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
return false return false
} }
if len(args) > 1 { if len(args) > 1 {
@ -414,6 +414,80 @@ func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
} }
} }
// TestLonghornCryptsetupExemptNodesAreNotQuarantined runs one orchestration or CLI step.
// Signature: TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T).
// Why: Veles/Oceanus uses titan-23 as a Longhorn host for unencrypted local
// volumes; startup should uncordon that policy-exempt node without requiring
// host SSH or weakening encrypted-volume safety on other workers.
func TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T) {
cordoned := []string{}
uncordoned := []string{}
sshTitan23 := false
orch := buildOrchestratorWithStubs(t, config.Config{
SSHManagedNodes: []string{"titan-04"},
Startup: config.Startup{
LonghornCryptsetupExemptNodes: []string{"titan-23"},
},
}, []commandStub{
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-23\n"},
{
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
out: "__ANANKE_CRYPTSETUP_PRESENT__",
},
{
match: func(name string, args []string) bool {
if name == "ssh" && strings.Contains(strings.Join(args, " "), "titan-23") {
sshTitan23 = true
return true
}
return false
},
},
{
match: func(name string, args []string) bool {
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
return false
}
if len(args) > 1 {
cordoned = append(cordoned, args[len(args)-1])
}
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "uncordon")(name, args) {
return false
}
if len(args) > 1 {
uncordoned = append(uncordoned, args[len(args)-1])
}
return true
},
},
})
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04"})
if err != nil {
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
}
if strings.Join(got, ",") != "titan-04" {
t.Fatalf("guarded workers mismatch got=%v", got)
}
if err := orch.uncordonLonghornCryptsetupExemptNodes(context.Background()); err != nil {
t.Fatalf("uncordonLonghornCryptsetupExemptNodes failed: %v", err)
}
if sshTitan23 {
t.Fatalf("did not expect cryptsetup SSH check for exempt titan-23")
}
if len(cordoned) != 0 {
t.Fatalf("did not expect exempt node to be cordoned, got %v", cordoned)
}
if strings.Join(uncordoned, ",") != "titan-23" {
t.Fatalf("expected exempt titan-23 to be uncordoned, got %v", uncordoned)
}
}
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step. // TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T). // Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
// Why: bootstrap caches or minimal test clusters can lack live labels; the // Why: bootstrap caches or minimal test clusters can lack live labels; the

View File

@ -192,6 +192,9 @@ func (c *Config) applyDefaults() {
if c.Startup.IgnoreUnavailableNodes == nil { if c.Startup.IgnoreUnavailableNodes == nil {
c.Startup.IgnoreUnavailableNodes = []string{} c.Startup.IgnoreUnavailableNodes = []string{}
} }
if c.Startup.LonghornCryptsetupExemptNodes == nil {
c.Startup.LonghornCryptsetupExemptNodes = []string{}
}
if c.Startup.StuckPodGraceSeconds <= 0 { if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180 c.Startup.StuckPodGraceSeconds = 180
} }

View File

@ -46,6 +46,7 @@ func defaults() Config {
NodeInventoryReachWaitSeconds: 300, NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5, NodeInventoryReachPollSeconds: 5,
NodeInventoryReachRequiredNodes: []string{}, NodeInventoryReachRequiredNodes: []string{},
LonghornCryptsetupExemptNodes: []string{},
RequireTimeSync: true, RequireTimeSync: true,
TimeSyncWaitSeconds: 240, TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5, TimeSyncPollSeconds: 5,

View File

@ -36,6 +36,7 @@ type Startup struct {
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"` NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
LonghornCryptsetupExemptNodes []string `yaml:"longhorn_cryptsetup_exempt_nodes"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"` RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`

View File

@ -66,6 +66,11 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty") return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
} }
} }
for _, node := range c.Startup.LonghornCryptsetupExemptNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.longhorn_cryptsetup_exempt_nodes entries must not be empty")
}
}
for node, labels := range c.Startup.RequiredNodeLabels { for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" { if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty") return fmt.Errorf("config.startup.required_node_labels keys must not be empty")

View File

@ -31,6 +31,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }}, {"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }}, {"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }}, {"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
{"bad_empty_longhorn_cryptsetup_exempt_node", func(c *Config) { c.Startup.LonghornCryptsetupExemptNodes = []string{"titan-23", ""} }},
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }}, {"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }}, {"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }}, {"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},

View File

@ -77,6 +77,12 @@ migrate_ananke_config() {
echo "[install] added startup node inventory reachability gate defaults" echo "[install] added startup node inventory reachability gate defaults"
changed=1 changed=1
fi fi
if grep -Eq '^ required_node_labels:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ longhorn_cryptsetup_exempt_nodes:' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ required_node_labels:[[:space:]]*$/i\ longhorn_cryptsetup_exempt_nodes:\n - titan-23' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.longhorn_cryptsetup_exempt_nodes default (titan-23)"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \ if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then && ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml" sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"