recovery: exempt veles longhorn host from cryptsetup guard
This commit is contained in:
parent
54f0b29bce
commit
9031e09f4e
@ -52,6 +52,7 @@ startup:
|
||||
node_inventory_reachability_wait_seconds: 300
|
||||
node_inventory_reachability_poll_seconds: 5
|
||||
node_inventory_reachability_required_nodes: []
|
||||
longhorn_cryptsetup_exempt_nodes: []
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
|
||||
@ -121,6 +121,8 @@ startup:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
longhorn_cryptsetup_exempt_nodes:
|
||||
- titan-23
|
||||
required_node_labels:
|
||||
titan-04:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
|
||||
@ -121,6 +121,8 @@ startup:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
longhorn_cryptsetup_exempt_nodes:
|
||||
- titan-23
|
||||
required_node_labels:
|
||||
titan-04:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
|
||||
@ -3,6 +3,7 @@ package cluster
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@ -160,12 +161,17 @@ func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, w
|
||||
}
|
||||
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
exempt := makeStringSet(o.cfg.Startup.LonghornCryptsetupExemptNodes)
|
||||
unsafe := map[string]struct{}{}
|
||||
var errs []string
|
||||
for node := range longhornHosts {
|
||||
if _, skip := ignored[node]; skip {
|
||||
continue
|
||||
}
|
||||
if _, skip := exempt[node]; skip {
|
||||
o.log.Printf("skip cryptsetup preflight on longhorn host %s: configured exemption", node)
|
||||
continue
|
||||
}
|
||||
if !o.sshManaged(node) {
|
||||
o.log.Printf("warning: keeping longhorn host %s cordoned because encrypted-volume prerequisites cannot be verified without SSH management", node)
|
||||
if cordonErr := o.cordonNodeForMissingCryptsetup(ctx, node); cordonErr != nil {
|
||||
@ -201,6 +207,39 @@ func (o *Orchestrator) ensureLonghornEncryptedHostPrereqs(ctx context.Context, w
|
||||
return guarded, nil
|
||||
}
|
||||
|
||||
// uncordonLonghornCryptsetupExemptNodes runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) uncordonLonghornCryptsetupExemptNodes(ctx context.Context) error.
|
||||
// Why: special Longhorn hosts that only run non-encrypted local workloads, such
|
||||
// as Veles/Oceanus, must recover from stale cordons without weakening the
|
||||
// encrypted-volume guard for normal storage workers.
|
||||
func (o *Orchestrator) uncordonLonghornCryptsetupExemptNodes(ctx context.Context) error {
|
||||
exempt := makeStringSet(o.cfg.Startup.LonghornCryptsetupExemptNodes)
|
||||
if len(exempt) == 0 {
|
||||
return nil
|
||||
}
|
||||
longhornHosts, err := o.longhornHostNodes(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
nodes := make([]string, 0, len(exempt))
|
||||
for node := range exempt {
|
||||
if _, skip := ignored[node]; skip {
|
||||
continue
|
||||
}
|
||||
if _, ok := longhornHosts[node]; ok {
|
||||
nodes = append(nodes, node)
|
||||
}
|
||||
}
|
||||
sort.Strings(nodes)
|
||||
if len(nodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
o.log.Printf("uncordon longhorn cryptsetup-exempt hosts: %s", strings.Join(nodes, ","))
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("uncordoned longhorn cryptsetup-exempt hosts: %s", strings.Join(nodes, ",")))
|
||||
return o.uncordonWorkers(ctx, nodes)
|
||||
}
|
||||
|
||||
// longhornHostNodes runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) longhornHostNodes(ctx context.Context) (map[string]struct{}, error).
|
||||
// Why: the live node label captures storage hosts that may not be in Ananke's
|
||||
|
||||
@ -228,6 +228,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
return err
|
||||
})
|
||||
o.bestEffort("uncordon workers", func() error { return o.uncordonWorkers(ctx, workersToUncordon) })
|
||||
o.bestEffort("uncordon longhorn cryptsetup-exempt hosts", func() error {
|
||||
return o.uncordonLonghornCryptsetupExemptNodes(ctx)
|
||||
})
|
||||
sshCheckNodes := append([]string{}, o.cfg.ControlPlanes...)
|
||||
sshCheckNodes = append(sshCheckNodes, workers...)
|
||||
if err := o.waitForNodeSSHAuth(ctx, sshCheckNodes); err != nil {
|
||||
|
||||
@ -390,7 +390,7 @@ func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "cordon")(name, args) {
|
||||
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
|
||||
return false
|
||||
}
|
||||
if len(args) > 1 {
|
||||
@ -414,6 +414,80 @@ func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestLonghornCryptsetupExemptNodesAreNotQuarantined runs one orchestration or CLI step.
|
||||
// Signature: TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T).
|
||||
// Why: Veles/Oceanus uses titan-23 as a Longhorn host for unencrypted local
|
||||
// volumes; startup should uncordon that policy-exempt node without requiring
|
||||
// host SSH or weakening encrypted-volume safety on other workers.
|
||||
func TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T) {
|
||||
cordoned := []string{}
|
||||
uncordoned := []string{}
|
||||
sshTitan23 := false
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
SSHManagedNodes: []string{"titan-04"},
|
||||
Startup: config.Startup{
|
||||
LonghornCryptsetupExemptNodes: []string{"titan-23"},
|
||||
},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: "titan-04\ntitan-23\n"},
|
||||
{
|
||||
match: matchContains("ssh", "titan-04", "command -v cryptsetup"),
|
||||
out: "__ANANKE_CRYPTSETUP_PRESENT__",
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if name == "ssh" && strings.Contains(strings.Join(args, " "), "titan-23") {
|
||||
sshTitan23 = true
|
||||
return true
|
||||
}
|
||||
return false
|
||||
},
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if name != "kubectl" || len(args) == 0 || args[0] != "cordon" {
|
||||
return false
|
||||
}
|
||||
if len(args) > 1 {
|
||||
cordoned = append(cordoned, args[len(args)-1])
|
||||
}
|
||||
return true
|
||||
},
|
||||
},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "uncordon")(name, args) {
|
||||
return false
|
||||
}
|
||||
if len(args) > 1 {
|
||||
uncordoned = append(uncordoned, args[len(args)-1])
|
||||
}
|
||||
return true
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
got, err := orch.ensureLonghornEncryptedHostPrereqs(context.Background(), []string{"titan-04"})
|
||||
if err != nil {
|
||||
t.Fatalf("ensureLonghornEncryptedHostPrereqs failed: %v", err)
|
||||
}
|
||||
if strings.Join(got, ",") != "titan-04" {
|
||||
t.Fatalf("guarded workers mismatch got=%v", got)
|
||||
}
|
||||
if err := orch.uncordonLonghornCryptsetupExemptNodes(context.Background()); err != nil {
|
||||
t.Fatalf("uncordonLonghornCryptsetupExemptNodes failed: %v", err)
|
||||
}
|
||||
if sshTitan23 {
|
||||
t.Fatalf("did not expect cryptsetup SSH check for exempt titan-23")
|
||||
}
|
||||
if len(cordoned) != 0 {
|
||||
t.Fatalf("did not expect exempt node to be cordoned, got %v", cordoned)
|
||||
}
|
||||
if strings.Join(uncordoned, ",") != "titan-23" {
|
||||
t.Fatalf("expected exempt titan-23 to be uncordoned, got %v", uncordoned)
|
||||
}
|
||||
}
|
||||
|
||||
// TestLonghornHostNodesFallsBackToConfiguredLabels runs one orchestration or CLI step.
|
||||
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
|
||||
// Why: bootstrap caches or minimal test clusters can lack live labels; the
|
||||
|
||||
@ -192,6 +192,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.IgnoreUnavailableNodes == nil {
|
||||
c.Startup.IgnoreUnavailableNodes = []string{}
|
||||
}
|
||||
if c.Startup.LonghornCryptsetupExemptNodes == nil {
|
||||
c.Startup.LonghornCryptsetupExemptNodes = []string{}
|
||||
}
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
c.Startup.StuckPodGraceSeconds = 180
|
||||
}
|
||||
|
||||
@ -46,6 +46,7 @@ func defaults() Config {
|
||||
NodeInventoryReachWaitSeconds: 300,
|
||||
NodeInventoryReachPollSeconds: 5,
|
||||
NodeInventoryReachRequiredNodes: []string{},
|
||||
LonghornCryptsetupExemptNodes: []string{},
|
||||
RequireTimeSync: true,
|
||||
TimeSyncWaitSeconds: 240,
|
||||
TimeSyncPollSeconds: 5,
|
||||
|
||||
@ -36,6 +36,7 @@ type Startup struct {
|
||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
|
||||
LonghornCryptsetupExemptNodes []string `yaml:"longhorn_cryptsetup_exempt_nodes"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
|
||||
@ -66,6 +66,11 @@ func (c Config) Validate() error {
|
||||
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
for _, node := range c.Startup.LonghornCryptsetupExemptNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.longhorn_cryptsetup_exempt_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||
|
||||
@ -31,6 +31,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
||||
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
||||
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
||||
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
|
||||
{"bad_empty_longhorn_cryptsetup_exempt_node", func(c *Config) { c.Startup.LonghornCryptsetupExemptNodes = []string{"titan-23", ""} }},
|
||||
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
||||
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
||||
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
||||
|
||||
@ -77,6 +77,12 @@ migrate_ananke_config() {
|
||||
echo "[install] added startup node inventory reachability gate defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ required_node_labels:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ longhorn_cryptsetup_exempt_nodes:' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ required_node_labels:[[:space:]]*$/i\ longhorn_cryptsetup_exempt_nodes:\n - titan-23' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup.longhorn_cryptsetup_exempt_nodes default (titan-23)"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user