recovery(ananke): auto-heal sealed vault and dead-node drift
This commit is contained in:
parent
d225291c5d
commit
d105e43e49
@ -154,6 +154,8 @@ startup:
|
|||||||
scheduling_storm_event_threshold: 30
|
scheduling_storm_event_threshold: 30
|
||||||
scheduling_storm_window_seconds: 180
|
scheduling_storm_window_seconds: 180
|
||||||
stuck_pod_grace_seconds: 180
|
stuck_pod_grace_seconds: 180
|
||||||
|
post_start_auto_heal_seconds: 60
|
||||||
|
dead_node_cleanup_grace_seconds: 300
|
||||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: ""
|
vault_unseal_breakglass_command: ""
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
|
|||||||
@ -286,6 +286,8 @@ startup:
|
|||||||
scheduling_storm_event_threshold: 30
|
scheduling_storm_event_threshold: 30
|
||||||
scheduling_storm_window_seconds: 180
|
scheduling_storm_window_seconds: 180
|
||||||
stuck_pod_grace_seconds: 180
|
stuck_pod_grace_seconds: 180
|
||||||
|
post_start_auto_heal_seconds: 60
|
||||||
|
dead_node_cleanup_grace_seconds: 300
|
||||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
|
|||||||
@ -286,6 +286,8 @@ startup:
|
|||||||
scheduling_storm_event_threshold: 30
|
scheduling_storm_event_threshold: 30
|
||||||
scheduling_storm_window_seconds: 180
|
scheduling_storm_window_seconds: 180
|
||||||
stuck_pod_grace_seconds: 180
|
stuck_pod_grace_seconds: 180
|
||||||
|
post_start_auto_heal_seconds: 60
|
||||||
|
dead_node_cleanup_grace_seconds: 300
|
||||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
|
|||||||
288
internal/cluster/orchestrator_autorepair.go
Normal file
288
internal/cluster/orchestrator_autorepair.go
Normal file
@ -0,0 +1,288 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// nodeReadyList models the minimal slice of `kubectl get nodes -o json`
// output consumed by unavailableNodeSet: each node's name plus its status
// conditions, from which the Ready condition is read.
type nodeReadyList struct {
	Items []struct {
		Metadata struct {
			Name string `json:"name"`
		} `json:"metadata"`
		Status struct {
			Conditions []struct {
				Type   string `json:"type"`
				Status string `json:"status"`
			} `json:"conditions"`
		} `json:"status"`
	} `json:"items"`
}
|
||||||
|
|
||||||
|
// podDeleteList models the minimal slice of `kubectl get pods -A -o json`
// output needed for dead-node cleanup: namespace/name to address the pod,
// deletionTimestamp to recognize terminating pods (nil when not terminating),
// and spec.nodeName to correlate with unavailable nodes.
type podDeleteList struct {
	Items []struct {
		Metadata struct {
			Namespace         string     `json:"namespace"`
			Name              string     `json:"name"`
			DeletionTimestamp *time.Time `json:"deletionTimestamp"`
		} `json:"metadata"`
		Spec struct {
			NodeName string `json:"nodeName"`
		} `json:"spec"`
	} `json:"items"`
}
|
||||||
|
|
||||||
|
// RunPostStartAutoHeal is the exported entrypoint for one post-start repair
// pass. It delegates directly to postStartAutoHeal so the long-running daemon
// has a narrow, testable way to repair post-start drift without rerunning the
// full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
	return o.postStartAutoHeal(ctx)
}
|
||||||
|
|
||||||
|
// postStartAutoHeal runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
|
||||||
|
// Why: centralizes bounded post-start repair actions so recurring outage
|
||||||
|
// patterns only trigger the specific remediation they need.
|
||||||
|
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
errs := []string{}
|
||||||
|
requestReconcile := false
|
||||||
|
|
||||||
|
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
|
||||||
|
if err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
|
||||||
|
} else if vaultRecovered {
|
||||||
|
requestReconcile = true
|
||||||
|
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
|
||||||
|
if err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
|
||||||
|
} else if cleaned > 0 {
|
||||||
|
requestReconcile = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if requestReconcile {
|
||||||
|
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
|
||||||
|
return o.requestFluxReconcile(ctx)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(errs) > 0 {
|
||||||
|
return errors.New(strings.Join(errs, "; "))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// autoRecoverSealedVault runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
|
||||||
|
// Why: lets the daemon repair a later Vault reseal without waiting for a new
|
||||||
|
// bootstrap run.
|
||||||
|
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
||||||
|
if err != nil {
|
||||||
|
if isNotFoundErr(err) {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
return false, fmt.Errorf("vault pod phase check failed: %w", err)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(phase) != "Running" {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
sealed, err := o.vaultSealed(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
if !sealed {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
|
||||||
|
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
|
||||||
|
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
|
||||||
|
// downstream secret consumers stop carrying stale failures from the sealed window.
|
||||||
|
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
|
||||||
|
if _, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
25*time.Second,
|
||||||
|
"-n", "vault",
|
||||||
|
"create", "job",
|
||||||
|
"--from=cronjob/vault-k8s-auth-config",
|
||||||
|
jobName,
|
||||||
|
); err != nil {
|
||||||
|
return fmt.Errorf("create job %s: %w", jobName, err)
|
||||||
|
}
|
||||||
|
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
|
||||||
|
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
|
||||||
|
// clear only that narrow failure class instead of leaving garbage behind forever.
|
||||||
|
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
unavailable, err := o.unavailableNodeSet(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
if len(unavailable) == 0 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("query pods: %w", err)
|
||||||
|
}
|
||||||
|
var pods podDeleteList
|
||||||
|
if err := json.Unmarshal([]byte(out), &pods); err != nil {
|
||||||
|
return 0, fmt.Errorf("decode pods: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
|
||||||
|
now := time.Now()
|
||||||
|
count := 0
|
||||||
|
for _, item := range pods.Items {
|
||||||
|
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
|
||||||
|
if _, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
20*time.Second,
|
||||||
|
"-n", item.Metadata.Namespace,
|
||||||
|
"delete", "pod", item.Metadata.Name,
|
||||||
|
"--grace-period=0",
|
||||||
|
"--force",
|
||||||
|
"--wait=false",
|
||||||
|
); err != nil && !isNotFoundErr(err) {
|
||||||
|
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
|
||||||
|
}
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
if count > 0 {
|
||||||
|
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
|
||||||
|
}
|
||||||
|
return count, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// unavailableNodeSet runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
|
||||||
|
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
|
||||||
|
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
|
||||||
|
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("query nodes: %w", err)
|
||||||
|
}
|
||||||
|
var nodes nodeReadyList
|
||||||
|
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
|
||||||
|
return nil, fmt.Errorf("decode nodes: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
unavailable := map[string]struct{}{}
|
||||||
|
for _, item := range nodes.Items {
|
||||||
|
ready := ""
|
||||||
|
for _, cond := range item.Status.Conditions {
|
||||||
|
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
|
||||||
|
ready = strings.TrimSpace(cond.Status)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ready != "True" {
|
||||||
|
unavailable[item.Metadata.Name] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return unavailable, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// requestFluxReconcile runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
|
||||||
|
// Why: post-start repairs need a lightweight way to refresh GitOps health
|
||||||
|
// without reusing the broader startup flux-resume flow.
|
||||||
|
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
|
if _, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
25*time.Second,
|
||||||
|
"-n", "flux-system",
|
||||||
|
"annotate", "gitrepository", "flux-system",
|
||||||
|
"reconcile.fluxcd.io/requestedAt="+now,
|
||||||
|
"--overwrite",
|
||||||
|
); err != nil {
|
||||||
|
return fmt.Errorf("annotate flux source reconcile: %w", err)
|
||||||
|
}
|
||||||
|
if _, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
25*time.Second,
|
||||||
|
"-n", "flux-system",
|
||||||
|
"annotate",
|
||||||
|
"kustomizations.kustomize.toolkit.fluxcd.io",
|
||||||
|
"--all",
|
||||||
|
"reconcile.fluxcd.io/requestedAt="+now,
|
||||||
|
"--overwrite",
|
||||||
|
); err != nil {
|
||||||
|
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
|
||||||
|
}
|
||||||
|
if _, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
25*time.Second,
|
||||||
|
"annotate",
|
||||||
|
"--all-namespaces",
|
||||||
|
"helmreleases.helm.toolkit.fluxcd.io",
|
||||||
|
"--all",
|
||||||
|
"reconcile.fluxcd.io/requestedAt="+now,
|
||||||
|
"--overwrite",
|
||||||
|
); err != nil {
|
||||||
|
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
|
||||||
|
}
|
||||||
|
if o.runOverride == nil && o.runner.CommandExists("flux") {
|
||||||
|
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
|
||||||
|
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
296
internal/cluster/orchestrator_autorepair_cleanup_test.go
Normal file
296
internal/cluster/orchestrator_autorepair_cleanup_test.go
Normal file
@ -0,0 +1,296 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestCleanupTerminatingPodsOnUnavailableNodesBranches covers the dead-node
// pod cleanup branches: dry-run skip, selective force-deletion that only
// targets long-terminating pods on unavailable nodes and tolerates
// already-deleted pods, surfacing of node-query and pod-decode errors, and
// surfacing of hard delete failures.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil || count != 0 {
			t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
		}
	})

	t.Run("selective cleanup tolerates not found", func(t *testing.T) {
		// old-stale exceeds the 300s grace; fresh-stale does not.
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
			{
				// Four pods: only old-stale (stale + on unavailable titan-22) qualifies.
				match: matchContains("kubectl", "get pods -A -o json"),
				out: `{"items":[` +
					`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
					`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
					`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				// Not-found on delete must be tolerated and still counted.
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("pod old-stale not found"),
			},
		})

		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil {
			t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
		}
		if count != 1 {
			t.Fatalf("expected one cleaned pod, got %d", count)
		}
	})

	t.Run("query and decode errors surface", func(t *testing.T) {
		queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				err:   errors.New("nodes failed"),
			},
		})
		if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
			t.Fatalf("expected node query error, got %v", err)
		}

		decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{bad json`,
			},
		})
		if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
			t.Fatalf("expected pod decode error, got %v", err)
		}
	})

	t.Run("delete hard error surfaces", func(t *testing.T) {
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("delete failed"),
			},
		})

		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
			t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
		}
	})
}
|
||||||
|
|
||||||
|
// TestUnavailableNodeSetBranches exercises node Ready parsing directly:
// malformed JSON must surface a decode error, and a node with no Ready
// condition at all must be treated as unavailable while a Ready=True node
// must not be.
func TestUnavailableNodeSetBranches(t *testing.T) {
	t.Run("decode error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
		})
		if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})

	t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
		})
		nodes, err := orch.unavailableNodeSet(context.Background())
		if err != nil {
			t.Fatalf("unavailableNodeSet failed: %v", err)
		}
		if _, ok := nodes["titan-22"]; !ok {
			t.Fatalf("expected titan-22 to be treated as unavailable")
		}
		if _, ok := nodes["titan-07"]; ok {
			t.Fatalf("did not expect titan-07 to be treated as unavailable")
		}
	})
}
|
||||||
|
|
||||||
|
// TestRequestFluxReconcileBranches covers the Flux refresh path: dry-run skip,
// hard failures from the gitrepository and kustomization annotations, the
// best-effort helmrelease annotation (warning only), and tolerance of a
// failing `flux` CLI invocation. The last two subtests drive real fake
// kubectl/flux shell scripts on PATH and assert the recorded call log.
func TestRequestFluxReconcileBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
		}
	})

	t.Run("git source annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
			t.Fatalf("expected gitrepository annotate error, got %v", err)
		}
	})

	t.Run("kustomization annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				out:   "",
			},
			{
				match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
			t.Fatalf("expected kustomization annotate error, got %v", err)
		}
	})

	t.Run("helm annotate warning and flux command path", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")
		kubectlPath := filepath.Join(tmpDir, "kubectl")
		fluxPath := filepath.Join(tmpDir, "flux")

		// Fake kubectl logs every invocation; only the helmreleases call fails.
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"case \"$*\" in\n" +
			"  *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
			"esac\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"

		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))

		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}

		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		logText := string(calls)
		if !strings.Contains(logText, "annotate gitrepository flux-system") {
			t.Fatalf("expected gitrepository annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
			t.Fatalf("expected kustomization annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected flux reconcile command, got %q", logText)
		}
	})

	t.Run("flux command failure is tolerated", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")
		kubectlPath := filepath.Join(tmpDir, "kubectl")
		fluxPath := filepath.Join(tmpDir, "flux")

		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		// flux always fails here; requestFluxReconcile must still return nil.
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 1\n"

		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
			t.Fatalf("write fake kubectl: %v", err)
		}
		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
			t.Fatalf("write fake flux: %v", err)
		}
		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))

		cfg := config.Config{
			State: config.State{
				Dir:            t.TempDir(),
				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
			},
		}
		orch := &Orchestrator{
			cfg:    cfg,
			runner: &execx.Runner{},
			store:  state.New(cfg.State.RunHistoryPath),
			log:    log.New(io.Discard, "", 0),
		}

		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
		}
		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
		}
	})
}
|
||||||
382
internal/cluster/orchestrator_autorepair_test.go
Normal file
382
internal/cluster/orchestrator_autorepair_test.go
Normal file
@ -0,0 +1,382 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/base64"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
|
||||||
|
// Why: covers the new daemon-triggered repair path for late Vault reseals and
|
||||||
|
// stale terminating pods anchored to unavailable nodes.
|
||||||
|
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
|
||||||
|
cfg := config.Config{
|
||||||
|
Startup: config.Startup{
|
||||||
|
DeadNodeCleanupGraceSeconds: 300,
|
||||||
|
RequiredNodeLabels: map[string]map[string]string{
|
||||||
|
"titan-07": {"node-role.kubernetes.io/worker": "true"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
State: config.State{
|
||||||
|
Dir: t.TempDir(),
|
||||||
|
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
||||||
|
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
orch := &Orchestrator{
|
||||||
|
cfg: cfg,
|
||||||
|
runner: &execx.Runner{},
|
||||||
|
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
|
||||||
|
log: log.New(io.Discard, "", 0),
|
||||||
|
}
|
||||||
|
|
||||||
|
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||||
|
unsealCalls := 0
|
||||||
|
jobCreated := false
|
||||||
|
reconciled := false
|
||||||
|
deleted := map[string]bool{}
|
||||||
|
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||||
|
if name != "kubectl" {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
joined := strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||||
|
return "Running", nil
|
||||||
|
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
|
||||||
|
if unsealCalls == 0 {
|
||||||
|
return `{"initialized":true,"sealed":true}`, nil
|
||||||
|
}
|
||||||
|
return `{"initialized":true,"sealed":false}`, nil
|
||||||
|
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
|
||||||
|
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
|
||||||
|
case strings.Contains(joined, "vault operator unseal"):
|
||||||
|
unsealCalls++
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
|
||||||
|
jobCreated = true
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "get nodes -o json"):
|
||||||
|
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
||||||
|
case strings.Contains(joined, "get pods -A -o json"):
|
||||||
|
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
|
||||||
|
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
|
||||||
|
deleted["maintenance/stale-pod"] = true
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
|
||||||
|
reconciled = true
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
|
||||||
|
return "", nil
|
||||||
|
default:
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orch.SetCommandOverrides(dispatch, dispatch)
|
||||||
|
|
||||||
|
if err := orch.postStartAutoHeal(context.Background()); err != nil {
|
||||||
|
t.Fatalf("postStartAutoHeal failed: %v", err)
|
||||||
|
}
|
||||||
|
if unsealCalls != 1 {
|
||||||
|
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
|
||||||
|
}
|
||||||
|
if !jobCreated {
|
||||||
|
t.Fatalf("expected vault k8s auth config job to be created")
|
||||||
|
}
|
||||||
|
if !deleted["maintenance/stale-pod"] {
|
||||||
|
t.Fatalf("expected stale unavailable-node pod to be deleted")
|
||||||
|
}
|
||||||
|
if !reconciled {
|
||||||
|
t.Fatalf("expected flux reconcile request after repairs")
|
||||||
|
}
|
||||||
|
if deleted["logging/healthy-node-pod"] {
|
||||||
|
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
|
||||||
|
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
|
||||||
|
// Why: proves the new post-start repair loop stays quiet when the specific
|
||||||
|
// failure patterns are absent.
|
||||||
|
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
|
||||||
|
cfg := config.Config{
|
||||||
|
Startup: config.Startup{
|
||||||
|
DeadNodeCleanupGraceSeconds: 300,
|
||||||
|
},
|
||||||
|
State: config.State{
|
||||||
|
Dir: t.TempDir(),
|
||||||
|
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
||||||
|
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
orch := &Orchestrator{
|
||||||
|
cfg: cfg,
|
||||||
|
runner: &execx.Runner{},
|
||||||
|
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
|
||||||
|
log: log.New(io.Discard, "", 0),
|
||||||
|
}
|
||||||
|
|
||||||
|
unsealCalls := 0
|
||||||
|
jobCreated := false
|
||||||
|
reconciled := false
|
||||||
|
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||||
|
if name != "kubectl" {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
joined := strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||||
|
return "Running", nil
|
||||||
|
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
|
||||||
|
return `{"initialized":true,"sealed":false}`, nil
|
||||||
|
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
|
||||||
|
jobCreated = true
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "vault operator unseal"):
|
||||||
|
unsealCalls++
|
||||||
|
return "", nil
|
||||||
|
case strings.Contains(joined, "get nodes -o json"):
|
||||||
|
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
||||||
|
case strings.Contains(joined, "get pods -A -o json"):
|
||||||
|
return `{"items":[]}`, nil
|
||||||
|
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
|
||||||
|
reconciled = true
|
||||||
|
return "", nil
|
||||||
|
default:
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orch.SetCommandOverrides(dispatch, dispatch)
|
||||||
|
|
||||||
|
if err := orch.postStartAutoHeal(context.Background()); err != nil {
|
||||||
|
t.Fatalf("postStartAutoHeal failed: %v", err)
|
||||||
|
}
|
||||||
|
if unsealCalls != 0 {
|
||||||
|
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
|
||||||
|
}
|
||||||
|
if jobCreated {
|
||||||
|
t.Fatalf("did not expect vault auth config job creation")
|
||||||
|
}
|
||||||
|
if reconciled {
|
||||||
|
t.Fatalf("did not expect flux reconcile request for healthy cluster")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
|
||||||
|
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
|
||||||
|
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
|
||||||
|
// auto-heal never mutates cluster state during rehearsal runs.
|
||||||
|
func TestRunPostStartAutoHealDryRun(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||||
|
orch.runner.DryRun = true
|
||||||
|
|
||||||
|
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
|
||||||
|
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
|
||||||
|
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
|
||||||
|
// Why: proves the daemon reports each failed sub-repair together instead of
|
||||||
|
// hiding later failures behind the first problem.
|
||||||
|
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
|
||||||
|
cfg := config.Config{
|
||||||
|
Startup: config.Startup{
|
||||||
|
DeadNodeCleanupGraceSeconds: 300,
|
||||||
|
RequiredNodeLabels: map[string]map[string]string{
|
||||||
|
"titan-07": {"node-role.kubernetes.io/worker": "true"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
|
||||||
|
err: errors.New("label failed"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
err: errors.New("vault phase failed"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "get nodes -o json"),
|
||||||
|
err: errors.New("node query failed"),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
err := orch.postStartAutoHeal(context.Background())
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("expected aggregated error")
|
||||||
|
}
|
||||||
|
msg := err.Error()
|
||||||
|
for _, want := range []string{
|
||||||
|
"required node labels:",
|
||||||
|
"vault auto-recovery:",
|
||||||
|
"dead-node terminating pod cleanup:",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(msg, want) {
|
||||||
|
t.Fatalf("expected %q in %q", want, msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
|
||||||
|
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
|
||||||
|
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
|
||||||
|
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
|
||||||
|
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
|
||||||
|
t.Run("dry run skips", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||||
|
orch.runner.DryRun = true
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if err != nil || recovered {
|
||||||
|
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("pod missing is quiet", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
err: errors.New("vault-0 not found"),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if err != nil || recovered {
|
||||||
|
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("phase check error surfaces", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
err: errors.New("phase check failed"),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
|
||||||
|
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("non-running pod defers", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
out: "Pending",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if err != nil || recovered {
|
||||||
|
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("status parse failure surfaces", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
out: "Running",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
||||||
|
out: "garbage",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
|
||||||
|
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("already unsealed stays quiet", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
out: "Running",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
||||||
|
out: `{"sealed":false}`,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if err != nil || recovered {
|
||||||
|
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("unseal failure surfaces", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||||
|
out: "Running",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
||||||
|
out: `{"sealed":true}`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
|
||||||
|
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "vault operator unseal"),
|
||||||
|
err: errors.New("exec boom"),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||||
|
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
|
||||||
|
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
|
||||||
|
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
|
||||||
|
// Why: the post-unseal auth job is part of the production recovery chain, so
|
||||||
|
// dry-run and create-error behavior both need explicit coverage.
|
||||||
|
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
|
||||||
|
t.Run("dry run skips", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||||
|
orch.runner.DryRun = true
|
||||||
|
if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
|
||||||
|
t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("create error surfaces", func(t *testing.T) {
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||||
|
{
|
||||||
|
match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
|
||||||
|
err: errors.New("create failed"),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
err := orch.rerunVaultK8sAuthConfigJob(context.Background())
|
||||||
|
if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
|
||||||
|
t.Fatalf("expected create-job error, got %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
@ -195,6 +195,12 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
c.Startup.StuckPodGraceSeconds = 180
|
c.Startup.StuckPodGraceSeconds = 180
|
||||||
}
|
}
|
||||||
|
if c.Startup.PostStartAutoHealSeconds <= 0 {
|
||||||
|
c.Startup.PostStartAutoHealSeconds = 60
|
||||||
|
}
|
||||||
|
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
||||||
|
c.Startup.DeadNodeCleanupGraceSeconds = 300
|
||||||
|
}
|
||||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||||
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -91,6 +91,8 @@ type Startup struct {
|
|||||||
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
|
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
|
||||||
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
|
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
|
||||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||||
|
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
|
||||||
|
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
|
||||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||||
|
|||||||
@ -272,6 +272,12 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
if c.Startup.PostStartAutoHealSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
|
||||||
|
}
|
||||||
for _, probe := range c.Startup.PostStartProbes {
|
for _, probe := range c.Startup.PostStartProbes {
|
||||||
if strings.TrimSpace(probe) == "" {
|
if strings.TrimSpace(probe) == "" {
|
||||||
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
||||||
|
|||||||
@ -78,6 +78,8 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
|||||||
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
||||||
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
|
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
|
||||||
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
||||||
|
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
|
||||||
|
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
|
||||||
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
||||||
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
||||||
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
||||||
@ -143,6 +145,9 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
|
|||||||
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
||||||
t.Fatalf("expected startup defaults to be set")
|
t.Fatalf("expected startup defaults to be set")
|
||||||
}
|
}
|
||||||
|
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
||||||
|
t.Fatalf("expected post-start auto-heal defaults to be set")
|
||||||
|
}
|
||||||
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
|
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
|
||||||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||||
t.Fatalf("expected startup recovery scope slices to be initialized")
|
t.Fatalf("expected startup recovery scope slices to be initialized")
|
||||||
|
|||||||
@ -32,6 +32,8 @@ type Daemon struct {
|
|||||||
targets []Target
|
targets []Target
|
||||||
log *log.Logger
|
log *log.Logger
|
||||||
exporter *metrics.Exporter
|
exporter *metrics.Exporter
|
||||||
|
|
||||||
|
postStartAutoHealOverride func(context.Context) error
|
||||||
}
|
}
|
||||||
|
|
||||||
var sshConfigCandidates = []string{
|
var sshConfigCandidates = []string{
|
||||||
@ -94,6 +96,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
lastOnBattery := map[string]bool{}
|
lastOnBattery := map[string]bool{}
|
||||||
onBatterySince := map[string]time.Time{}
|
onBatterySince := map[string]time.Time{}
|
||||||
breachCount := map[string]int{}
|
breachCount := map[string]int{}
|
||||||
|
lastAutoHeal := time.Time{}
|
||||||
for _, t := range d.targets {
|
for _, t := range d.targets {
|
||||||
lastGood[t.Name] = time.Now()
|
lastGood[t.Name] = time.Now()
|
||||||
}
|
}
|
||||||
@ -108,12 +111,16 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
case <-t.C:
|
case <-t.C:
|
||||||
budget := d.orch.EstimatedEmergencyShutdownSeconds()
|
budget := d.orch.EstimatedEmergencyShutdownSeconds()
|
||||||
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
||||||
|
anyOnBattery := false
|
||||||
|
|
||||||
d.exporter.UpdateBudget(budget)
|
d.exporter.UpdateBudget(budget)
|
||||||
|
|
||||||
for _, target := range d.targets {
|
for _, target := range d.targets {
|
||||||
sample, err := target.Provider.Read(ctx)
|
sample, err := target.Provider.Read(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if lastOnBattery[target.Name] {
|
||||||
|
anyOnBattery = true
|
||||||
|
}
|
||||||
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
|
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
|
||||||
d.exporter.UpdateSample(metrics.Sample{
|
d.exporter.UpdateSample(metrics.Sample{
|
||||||
Name: target.Name,
|
Name: target.Name,
|
||||||
@ -132,6 +139,9 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
lastGood[target.Name] = time.Now()
|
lastGood[target.Name] = time.Now()
|
||||||
|
if sample.OnBattery {
|
||||||
|
anyOnBattery = true
|
||||||
|
}
|
||||||
wasOnBattery := lastOnBattery[target.Name]
|
wasOnBattery := lastOnBattery[target.Name]
|
||||||
if sample.OnBattery {
|
if sample.OnBattery {
|
||||||
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
|
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
|
||||||
@ -189,10 +199,51 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
return d.triggerShutdown(ctx, triggerReason)
|
return d.triggerShutdown(ctx, triggerReason)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// maybeRunPostStartAutoHeal runs one orchestration or CLI step.
|
||||||
|
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
|
||||||
|
// Why: gives the long-running daemon a bounded path to repair post-start drift
|
||||||
|
// like a later Vault reseal or stale dead-node deletions without waiting for a
|
||||||
|
// fresh bootstrap run.
|
||||||
|
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
|
||||||
|
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
|
||||||
|
if interval <= 0 || anyOnBattery {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if d.orch == nil && d.postStartAutoHealOverride == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if lastRun != nil {
|
||||||
|
*lastRun = now
|
||||||
|
}
|
||||||
|
if err := d.runPostStartAutoHeal(ctx); err != nil {
|
||||||
|
d.log.Printf("warning: post-start auto-heal: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runPostStartAutoHeal runs one orchestration or CLI step.
|
||||||
|
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
|
||||||
|
// Why: keeps the daemon loop readable while allowing unit tests to inject a
|
||||||
|
// deterministic repair hook without a live cluster.
|
||||||
|
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
|
||||||
|
if d.postStartAutoHealOverride != nil {
|
||||||
|
return d.postStartAutoHealOverride(ctx)
|
||||||
|
}
|
||||||
|
if d.orch == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return d.orch.RunPostStartAutoHeal(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
// triggerShutdown runs one orchestration or CLI step.
|
// triggerShutdown runs one orchestration or CLI step.
|
||||||
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
|
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
|
|||||||
51
internal/service/daemon_poststart_autorepair_test.go
Normal file
51
internal/service/daemon_poststart_autorepair_test.go
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestDaemonMaybeRunPostStartAutoHeal runs one orchestration or CLI step.
|
||||||
|
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
|
||||||
|
// Why: covers the daemon-side interval and on-battery guards for the new
|
||||||
|
// post-start repair loop.
|
||||||
|
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
|
||||||
|
calls := 0
|
||||||
|
d := &Daemon{
|
||||||
|
cfg: config.Config{
|
||||||
|
Startup: config.Startup{
|
||||||
|
PostStartAutoHealSeconds: 10,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
postStartAutoHealOverride: func(context.Context) error {
|
||||||
|
calls++
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var last time.Time
|
||||||
|
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
||||||
|
if calls != 1 {
|
||||||
|
t.Fatalf("expected first auto-heal invocation, got %d", calls)
|
||||||
|
}
|
||||||
|
|
||||||
|
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
||||||
|
if calls != 1 {
|
||||||
|
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
|
||||||
|
}
|
||||||
|
|
||||||
|
last = time.Now().Add(-11 * time.Second)
|
||||||
|
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
|
||||||
|
if calls != 1 {
|
||||||
|
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
|
||||||
|
}
|
||||||
|
|
||||||
|
last = time.Now().Add(-11 * time.Second)
|
||||||
|
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
||||||
|
if calls != 2 {
|
||||||
|
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -13,6 +13,8 @@ cmd/ananke/power_safety_test.go
|
|||||||
cmd/ananke/test_helpers_test.go
|
cmd/ananke/test_helpers_test.go
|
||||||
internal/cluster/orchestrator_inventory_test.go
|
internal/cluster/orchestrator_inventory_test.go
|
||||||
internal/cluster/orchestrator_report_test.go
|
internal/cluster/orchestrator_report_test.go
|
||||||
|
internal/cluster/orchestrator_autorepair_test.go
|
||||||
|
internal/cluster/orchestrator_autorepair_cleanup_test.go
|
||||||
internal/cluster/orchestrator_test.go
|
internal/cluster/orchestrator_test.go
|
||||||
internal/cluster/orchestrator_unit_additional_test.go
|
internal/cluster/orchestrator_unit_additional_test.go
|
||||||
internal/cluster/orchestrator_vault_test.go
|
internal/cluster/orchestrator_vault_test.go
|
||||||
@ -21,6 +23,7 @@ internal/config/load_additional_test.go
|
|||||||
internal/config/validate_matrix_test.go
|
internal/config/validate_matrix_test.go
|
||||||
internal/service/daemon_additional_test.go
|
internal/service/daemon_additional_test.go
|
||||||
internal/service/daemon_coverage_closeout_test.go
|
internal/service/daemon_coverage_closeout_test.go
|
||||||
|
internal/service/daemon_poststart_autorepair_test.go
|
||||||
internal/service/daemon_quality_branches_test.go
|
internal/service/daemon_quality_branches_test.go
|
||||||
internal/service/daemon_test.go
|
internal/service/daemon_test.go
|
||||||
internal/sshutil/repair_test.go
|
internal/sshutil/repair_test.go
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user