recovery: skip ignored workers during startup
This commit is contained in:
parent
566765696b
commit
7f3a9c1428
@ -282,6 +282,7 @@ startup:
|
|||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes:
|
ignore_unavailable_nodes:
|
||||||
- titan-09
|
- titan-09
|
||||||
|
- titan-10
|
||||||
auto_recycle_stuck_pods: true
|
auto_recycle_stuck_pods: true
|
||||||
auto_quarantine_scheduling_storms: true
|
auto_quarantine_scheduling_storms: true
|
||||||
scheduling_storm_event_threshold: 30
|
scheduling_storm_event_threshold: 30
|
||||||
|
|||||||
@ -282,6 +282,7 @@ startup:
|
|||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes:
|
ignore_unavailable_nodes:
|
||||||
- titan-09
|
- titan-09
|
||||||
|
- titan-10
|
||||||
auto_recycle_stuck_pods: true
|
auto_recycle_stuck_pods: true
|
||||||
auto_quarantine_scheduling_storms: true
|
auto_quarantine_scheduling_storms: true
|
||||||
scheduling_storm_event_threshold: 30
|
scheduling_storm_event_threshold: 30
|
||||||
|
|||||||
@ -18,13 +18,13 @@ import (
|
|||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
|
func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
|
||||||
if len(o.cfg.Workers) > 0 {
|
if len(o.cfg.Workers) > 0 {
|
||||||
return append([]string{}, o.cfg.Workers...), nil
|
return filterIgnoredNodes(o.cfg.Workers, o.cfg.Startup.IgnoreUnavailableNodes), nil
|
||||||
}
|
}
|
||||||
workers, err := o.discoverWorkers(ctx)
|
workers, err := o.discoverWorkers(ctx)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return workers, nil
|
return filterIgnoredNodes(workers, o.cfg.Startup.IgnoreUnavailableNodes), nil
|
||||||
}
|
}
|
||||||
fallback := o.fallbackWorkersFromInventory()
|
fallback := filterIgnoredNodes(o.fallbackWorkersFromInventory(), o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
if len(fallback) == 0 {
|
if len(fallback) == 0 {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -32,6 +32,27 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
|
|||||||
return fallback, nil
|
return fallback, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// filterIgnoredNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: filterIgnoredNodes(nodes []string, ignored []string) []string.
|
||||||
|
// Why: ignored unavailable nodes should be excluded from active startup and
|
||||||
|
// shutdown worker operations, not just readiness gates, so known-absent
|
||||||
|
// hardware does not create noisy SSH or uncordon attempts.
|
||||||
|
func filterIgnoredNodes(nodes []string, ignored []string) []string {
|
||||||
|
ignoredSet := makeStringSet(ignored)
|
||||||
|
filtered := []string{}
|
||||||
|
for _, node := range nodes {
|
||||||
|
name := strings.TrimSpace(node)
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, skip := ignoredSet[name]; skip {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, name)
|
||||||
|
}
|
||||||
|
return filtered
|
||||||
|
}
|
||||||
|
|
||||||
// discoverWorkers runs one orchestration or CLI step.
|
// discoverWorkers runs one orchestration or CLI step.
|
||||||
// Signature: (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error).
|
// Signature: (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error).
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
|
|||||||
@ -323,6 +323,28 @@ func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
|
||||||
|
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
|
||||||
|
// k3s-agent start, or uncordon operations against intentionally absent hosts.
|
||||||
|
func TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T) {
|
||||||
|
cfg := config.Config{
|
||||||
|
Workers: []string{" titan-08 ", "titan-09", "titan-10", "titan-11"},
|
||||||
|
Startup: config.Startup{
|
||||||
|
IgnoreUnavailableNodes: []string{"titan-09", "titan-10"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
orch := buildOrchestratorWithStubs(t, cfg, nil)
|
||||||
|
got, err := orch.effectiveWorkers(context.Background())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("effectiveWorkers failed: %v", err)
|
||||||
|
}
|
||||||
|
want := []string{"titan-08", "titan-11"}
|
||||||
|
if strings.Join(got, ",") != strings.Join(want, ",") {
|
||||||
|
t.Fatalf("effectiveWorkers mismatch got=%v want=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
||||||
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
||||||
// Why: covers constructor path in orchestrator core module.
|
// Why: covers constructor path in orchestrator core module.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user