diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index f245853..75ff3e4 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -282,6 +282,7 @@ startup: ignore_workloads: [] ignore_unavailable_nodes: - titan-09 + - titan-10 auto_recycle_stuck_pods: true auto_quarantine_scheduling_storms: true scheduling_storm_event_threshold: 30 diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml index 84545dc..c056618 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -282,6 +282,7 @@ startup: ignore_workloads: [] ignore_unavailable_nodes: - titan-09 + - titan-10 auto_recycle_stuck_pods: true auto_quarantine_scheduling_storms: true scheduling_storm_event_threshold: 30 diff --git a/internal/cluster/orchestrator_scaling.go b/internal/cluster/orchestrator_scaling.go index 59414d4..96ec919 100644 --- a/internal/cluster/orchestrator_scaling.go +++ b/internal/cluster/orchestrator_scaling.go @@ -18,13 +18,13 @@ import ( // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) { if len(o.cfg.Workers) > 0 { - return append([]string{}, o.cfg.Workers...), nil + return filterIgnoredNodes(o.cfg.Workers, o.cfg.Startup.IgnoreUnavailableNodes), nil } workers, err := o.discoverWorkers(ctx) if err == nil { - return workers, nil + return filterIgnoredNodes(workers, o.cfg.Startup.IgnoreUnavailableNodes), nil } - fallback := o.fallbackWorkersFromInventory() + fallback := filterIgnoredNodes(o.fallbackWorkersFromInventory(), o.cfg.Startup.IgnoreUnavailableNodes) if len(fallback) == 0 { return nil, err } @@ -32,6 +32,27 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) { return fallback, nil } +// filterIgnoredNodes runs one orchestration or CLI step. +// Signature: filterIgnoredNodes(nodes []string, ignored []string) []string. 
// filterIgnoredNodes returns the entries of nodes that are not listed in ignored.
// Signature: filterIgnoredNodes(nodes []string, ignored []string) []string.
// Why: ignored unavailable nodes should be excluded from active startup and
// shutdown worker operations, not just readiness gates, so known-absent
// hardware does not create noisy SSH or uncordon attempts.
//
// Names on both sides are whitespace-trimmed before comparison and blank
// entries are dropped. The order of nodes (and any duplicates in it) is
// preserved. The result is always a fresh, non-nil slice, so callers can
// append to it without touching either input.
func filterIgnoredNodes(nodes []string, ignored []string) []string {
	// Normalize the ignore list exactly like the node names below so that a
	// padded entry in the ignore list still matches its node.
	skip := make(map[string]struct{}, len(ignored))
	for _, entry := range ignored {
		if name := strings.TrimSpace(entry); name != "" {
			skip[name] = struct{}{}
		}
	}

	// At most len(nodes) names survive, so size the result once up front.
	kept := make([]string, 0, len(nodes))
	for _, node := range nodes {
		name := strings.TrimSpace(node)
		if name == "" {
			continue
		}
		if _, isIgnored := skip[name]; isIgnored {
			continue
		}
		kept = append(kept, name)
	}
	return kept
}

// discoverWorkers runs one orchestration or CLI step.
// Signature: (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
+func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) { + cfg := config.Config{ + Workers: []string{" titan-08 ", "titan-09", "titan-10", "titan-11"}, + Startup: config.Startup{ + IgnoreUnavailableNodes: []string{"titan-09", "titan-10"}, + }, + } + orch := buildOrchestratorWithStubs(t, cfg, nil) + got, err := orch.effectiveWorkers(context.Background()) + if err != nil { + t.Fatalf("effectiveWorkers failed: %v", err) + } + want := []string{"titan-08", "titan-11"} + if strings.Join(got, ",") != strings.Join(want, ",") { + t.Fatalf("effectiveWorkers mismatch got=%v want=%v", got, want) + } +} + // TestNewConstructsOrchestrator runs one orchestration or CLI step. // Signature: TestNewConstructsOrchestrator(t *testing.T). // Why: covers constructor path in orchestrator core module.