recovery: skip ignored workers during startup

This commit is contained in:
codex 2026-06-18 22:49:05 -03:00
parent 566765696b
commit 7f3a9c1428
4 changed files with 48 additions and 3 deletions

View File

@ -282,6 +282,7 @@ startup:
ignore_workloads: [] ignore_workloads: []
ignore_unavailable_nodes: ignore_unavailable_nodes:
- titan-09 - titan-09
- titan-10
auto_recycle_stuck_pods: true auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: true auto_quarantine_scheduling_storms: true
scheduling_storm_event_threshold: 30 scheduling_storm_event_threshold: 30

View File

@ -282,6 +282,7 @@ startup:
ignore_workloads: [] ignore_workloads: []
ignore_unavailable_nodes: ignore_unavailable_nodes:
- titan-09 - titan-09
- titan-10
auto_recycle_stuck_pods: true auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: true auto_quarantine_scheduling_storms: true
scheduling_storm_event_threshold: 30 scheduling_storm_event_threshold: 30

View File

@ -18,13 +18,13 @@ import (
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) { func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
if len(o.cfg.Workers) > 0 { if len(o.cfg.Workers) > 0 {
return append([]string{}, o.cfg.Workers...), nil return filterIgnoredNodes(o.cfg.Workers, o.cfg.Startup.IgnoreUnavailableNodes), nil
} }
workers, err := o.discoverWorkers(ctx) workers, err := o.discoverWorkers(ctx)
if err == nil { if err == nil {
return workers, nil return filterIgnoredNodes(workers, o.cfg.Startup.IgnoreUnavailableNodes), nil
} }
fallback := o.fallbackWorkersFromInventory() fallback := filterIgnoredNodes(o.fallbackWorkersFromInventory(), o.cfg.Startup.IgnoreUnavailableNodes)
if len(fallback) == 0 { if len(fallback) == 0 {
return nil, err return nil, err
} }
@ -32,6 +32,27 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
return fallback, nil return fallback, nil
} }
// filterIgnoredNodes returns the whitespace-trimmed, non-empty entries of nodes
// that are not listed in ignored, preserving the order they had in nodes.
// Signature: filterIgnoredNodes(nodes []string, ignored []string) []string.
// Why: ignored unavailable nodes should be excluded from active startup and
// shutdown worker operations, not just readiness gates, so known-absent
// hardware does not create noisy SSH or uncordon attempts.
// The result is always a freshly allocated, non-nil slice (possibly empty), so
// callers may modify it without touching the input.
func filterIgnoredNodes(nodes []string, ignored []string) []string {
	ignoredSet := makeStringSet(ignored)
	// At most len(nodes) entries survive, so reserve that capacity once
	// instead of growing the slice on every append.
	filtered := make([]string, 0, len(nodes))
	for _, node := range nodes {
		name := strings.TrimSpace(node)
		if name == "" {
			// Blank or whitespace-only entries are dropped.
			continue
		}
		// NOTE(review): the lookup uses the trimmed name; whether makeStringSet
		// also trims the ignored entries is not visible here — confirm.
		if _, skip := ignoredSet[name]; skip {
			continue
		}
		filtered = append(filtered, name)
	}
	return filtered
}
// discoverWorkers runs one orchestration or CLI step. // discoverWorkers runs one orchestration or CLI step.
// Signature: (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error). // Signature: (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -323,6 +323,28 @@ func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T)
} }
} }
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
// k3s-agent start, or uncordon operations against intentionally absent hosts.
func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
cfg := config.Config{
Workers: []string{" titan-08 ", "titan-09", "titan-10", "titan-11"},
Startup: config.Startup{
IgnoreUnavailableNodes: []string{"titan-09", "titan-10"},
},
}
orch := buildOrchestratorWithStubs(t, cfg, nil)
got, err := orch.effectiveWorkers(context.Background())
if err != nil {
t.Fatalf("effectiveWorkers failed: %v", err)
}
want := []string{"titan-08", "titan-11"}
if strings.Join(got, ",") != strings.Join(want, ",") {
t.Fatalf("effectiveWorkers mismatch got=%v want=%v", got, want)
}
}
// TestNewConstructsOrchestrator runs one orchestration or CLI step. // TestNewConstructsOrchestrator runs one orchestration or CLI step.
// Signature: TestNewConstructsOrchestrator(t *testing.T). // Signature: TestNewConstructsOrchestrator(t *testing.T).
// Why: covers constructor path in orchestrator core module. // Why: covers constructor path in orchestrator core module.