startup: enforce external service behavior checks
This commit is contained in:
parent
296ca85c78
commit
95fefba244
124
internal/cluster/orchestrator_core.go
Normal file
124
internal/cluster/orchestrator_core.go
Normal file
@ -0,0 +1,124 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"log"
|
||||
"regexp"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
type Orchestrator struct {
|
||||
cfg config.Config
|
||||
runner *execx.Runner
|
||||
store *state.Store
|
||||
log *log.Logger
|
||||
runOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
|
||||
runSensitiveOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
|
||||
startupReportMu sync.Mutex
|
||||
activeStartupReport *startupReport
|
||||
}
|
||||
|
||||
type commandOverrideFunc func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
|
||||
|
||||
// StartupOptions carries per-invocation settings for a startup run.
//
// NOTE(review): the field notes are inferred from the field names only; the
// code that consumes these options is not visible here.
type StartupOptions struct {
	// ForceFluxBranch presumably names a Flux branch to force; confirm.
	ForceFluxBranch string
	// SkipLocalBootstrap presumably disables the local bootstrap step; confirm.
	SkipLocalBootstrap bool
	// Reason is a caller-supplied string describing why startup was requested.
	Reason string
}
|
||||
|
||||
// ShutdownOptions carries per-invocation settings for a shutdown run.
//
// NOTE(review): the field notes are inferred from the field names only; the
// code that consumes these options is not visible here.
type ShutdownOptions struct {
	// SkipEtcdSnapshot and SkipDrain presumably disable the matching
	// shutdown steps; confirm.
	SkipEtcdSnapshot, SkipDrain bool
	// Mode and Reason are caller-supplied strings; the accepted Mode values
	// are not visible here.
	Mode, Reason string
}
|
||||
|
||||
// EtcdRestoreOptions carries the inputs for an etcd restore request.
//
// NOTE(review): presumably ControlPlane selects the node to restore on and
// SnapshotPath the snapshot file to restore from; confirm against the caller.
type EtcdRestoreOptions struct {
	ControlPlane, SnapshotPath string
}
|
||||
|
||||
// startupWorkload identifies a single workload by namespace, kind and name.
type startupWorkload struct {
	Namespace, Kind, Name string
}
|
||||
|
||||
// workloadScaleEntry is the JSON-serializable record of one workload
// (namespace, kind, name) together with a replica count.
type workloadScaleEntry struct {
	Namespace string `json:"namespace"`
	Kind      string `json:"kind"`
	Name      string `json:"name"`
	Replicas  int    `json:"replicas"`
}
|
||||
|
||||
type remotePeerStatus struct {
|
||||
Intent state.Intent
|
||||
BootstrapActive bool
|
||||
}
|
||||
|
||||
type workloadScaleSnapshot struct {
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Entries []workloadScaleEntry `json:"entries"`
|
||||
}
|
||||
|
||||
type startupReport struct {
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
Completed time.Time `json:"completed_at"`
|
||||
Reason string `json:"reason"`
|
||||
Status string `json:"status"`
|
||||
Phase string `json:"phase"`
|
||||
Success bool `json:"success"`
|
||||
Error string `json:"error,omitempty"`
|
||||
Checks map[string]startupCheckRecord `json:"checks"`
|
||||
AutoHeals []string `json:"auto_heals"`
|
||||
SourceHost string `json:"source_host"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
}
|
||||
|
||||
// startupCheckRecord is the JSON-serializable result of a single startup
// check: a status string, a detail string and the time it was last updated.
type startupCheckRecord struct {
	Status    string    `json:"status"`
	Detail    string    `json:"detail"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// datastoreEndpointPattern matches a `--datastore-endpoint` flag followed by
// either `=` or whitespace and then its value. The value is captured in
// group 1 when single-quoted, group 2 when double-quoted, and group 3 when
// bare (a run of characters that are neither whitespace nor backslash).
var datastoreEndpointPattern = regexp.MustCompile(`--datastore-endpoint(?:=|\s+)(?:'([^']+)'|"([^"]+)"|([^\s\\]+))`)
|
||||
|
||||
var criticalStartupWorkloads = []startupWorkload{
|
||||
{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
|
||||
{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
|
||||
{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
|
||||
{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
|
||||
{Namespace: "vault", Kind: "statefulset", Name: "vault"},
|
||||
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
|
||||
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
|
||||
{Namespace: "monitoring", Kind: "deployment", Name: "grafana"},
|
||||
{Namespace: "monitoring", Kind: "statefulset", Name: "victoria-metrics-single-server"},
|
||||
{Namespace: "monitoring", Kind: "deployment", Name: "kube-state-metrics"},
|
||||
{Namespace: "logging", Kind: "deployment", Name: "oauth2-proxy-logs"},
|
||||
{Namespace: "logging", Kind: "deployment", Name: "opensearch-dashboards"},
|
||||
{Namespace: "logging", Kind: "statefulset", Name: "opensearch"},
|
||||
}
|
||||
|
||||
// ErrEtcdRestoreNotApplicable is a sentinel error; compare against it with
// errors.Is.
//
// NOTE(review): presumably returned when an etcd restore is requested on a
// cluster it does not apply to; the producer is not visible here.
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
|
||||
|
||||
// New runs one orchestration or CLI step.
|
||||
// Signature: New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
||||
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
||||
}
|
||||
|
||||
// SetCommandOverrides runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc).
|
||||
// Why: enables deterministic integration testing from the top-level testing module
|
||||
// without requiring package-local test files or live cluster dependencies.
|
||||
func (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc) {
|
||||
o.runOverride = run
|
||||
o.runSensitiveOverride = runSensitive
|
||||
}
|
||||
389
internal/cluster/orchestrator_service_stability.go
Normal file
389
internal/cluster/orchestrator_service_stability.go
Normal file
@ -0,0 +1,389 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
)
|
||||
|
||||
// isLikelyHostname reports whether value, after trimming surrounding
// whitespace, looks like a dotted hostname: it must be non-empty, contain no
// space or slash, and contain at least one dot.
func isLikelyHostname(value string) bool {
	candidate := strings.TrimSpace(value)
	switch {
	case candidate == "":
		return false
	case strings.ContainsAny(candidate, " /"):
		return false
	default:
		return strings.Contains(candidate, ".")
	}
}
|
||||
|
||||
// healIngressHostBackendReplicas runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error) {
|
||||
namespaces, err := o.discoverIngressNamespacesForHost(ctx, host)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(namespaces) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
targetNamespaces := makeStringSet(namespaces)
|
||||
out, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query workloads: %w", err)
|
||||
}
|
||||
var list workloadList
|
||||
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
||||
return nil, fmt.Errorf("decode workloads: %w", err)
|
||||
}
|
||||
healed := []string{}
|
||||
for _, item := range list.Items {
|
||||
kind := strings.ToLower(strings.TrimSpace(item.Kind))
|
||||
ns := strings.TrimSpace(item.Metadata.Namespace)
|
||||
name := strings.TrimSpace(item.Metadata.Name)
|
||||
if kind == "" || ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
if kind != "deployment" && kind != "statefulset" {
|
||||
continue
|
||||
}
|
||||
if _, ok := targetNamespaces[ns]; !ok {
|
||||
continue
|
||||
}
|
||||
desired := int32(1)
|
||||
if item.Spec.Replicas != nil {
|
||||
desired = *item.Spec.Replicas
|
||||
}
|
||||
if desired >= 1 {
|
||||
continue
|
||||
}
|
||||
workload := startupWorkload{Namespace: ns, Kind: kind, Name: name}
|
||||
if err := o.ensureWorkloadReplicas(ctx, workload, 1); err != nil {
|
||||
if isNotFoundErr(err) {
|
||||
continue
|
||||
}
|
||||
return healed, fmt.Errorf("scale %s/%s/%s to 1: %w", ns, kind, name, err)
|
||||
}
|
||||
healed = append(healed, ns+"/"+kind+"/"+name)
|
||||
}
|
||||
return healed, nil
|
||||
}
|
||||
|
||||
// waitForServiceChecklist runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
|
||||
wait := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second
|
||||
if wait <= 0 {
|
||||
wait = 7 * time.Minute
|
||||
}
|
||||
poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
|
||||
if poll <= 0 {
|
||||
poll = 5 * time.Second
|
||||
}
|
||||
deadline := time.Now().Add(wait)
|
||||
lastFailure := "unknown"
|
||||
lastLogged := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
lastReplicaHeal := time.Time{}
|
||||
lastIngressHeal := time.Time{}
|
||||
for {
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
||||
prevFailure := lastFailure
|
||||
ready, detail := o.serviceChecklistReady(ctx)
|
||||
lastFailure = detail
|
||||
if ready {
|
||||
o.log.Printf("external service checklist passed (%s)", detail)
|
||||
return nil
|
||||
}
|
||||
o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure)
|
||||
if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second {
|
||||
remaining := time.Until(deadline).Round(time.Second)
|
||||
if remaining < 0 {
|
||||
remaining = 0
|
||||
}
|
||||
o.log.Printf("waiting for external service checklist (%s remaining): %s", remaining, lastFailure)
|
||||
lastLogged = time.Now()
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return fmt.Errorf("startup blocked: external service checklist not satisfied within %s (%s)", wait, lastFailure)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(poll):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// serviceChecklistReady runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string) {
|
||||
checks := o.cfg.Startup.ServiceChecklist
|
||||
if len(checks) == 0 {
|
||||
return true, "no checklist items configured"
|
||||
}
|
||||
for _, check := range checks {
|
||||
ok, detail := o.serviceCheckReady(ctx, check)
|
||||
if !ok {
|
||||
name := strings.TrimSpace(check.Name)
|
||||
if name == "" {
|
||||
name = strings.TrimSpace(check.URL)
|
||||
}
|
||||
return false, fmt.Sprintf("%s: %s", name, detail)
|
||||
}
|
||||
}
|
||||
return true, fmt.Sprintf("checks=%d", len(checks))
|
||||
}
|
||||
|
||||
// serviceCheckReady runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string) {
|
||||
result, err := o.httpChecklistProbeResult(ctx, check)
|
||||
if err != nil {
|
||||
return false, err.Error()
|
||||
}
|
||||
|
||||
accepted := check.AcceptedStatuses
|
||||
if len(accepted) == 0 {
|
||||
accepted = []int{200, 201, 202, 203, 204, 301, 302, 303, 307, 308, 401, 403}
|
||||
}
|
||||
statusOk := false
|
||||
for _, code := range accepted {
|
||||
if result.Status == code {
|
||||
statusOk = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !statusOk {
|
||||
return false, fmt.Sprintf("unexpected status code=%d", result.Status)
|
||||
}
|
||||
|
||||
locationContains := strings.TrimSpace(check.LocationContains)
|
||||
if locationContains != "" && !checklistContains(result.Location, locationContains) {
|
||||
return false, fmt.Sprintf("location header missing expected marker %q", locationContains)
|
||||
}
|
||||
|
||||
locationNotContains := strings.TrimSpace(check.LocationNotContains)
|
||||
if locationNotContains != "" && checklistContains(result.Location, locationNotContains) {
|
||||
return false, fmt.Sprintf("location header contained forbidden marker %q", locationNotContains)
|
||||
}
|
||||
|
||||
bodyContains := strings.TrimSpace(check.BodyContains)
|
||||
if bodyContains != "" && !checklistContains(result.Body, bodyContains) {
|
||||
return false, fmt.Sprintf("response missing expected marker %q", bodyContains)
|
||||
}
|
||||
|
||||
bodyNotContains := strings.TrimSpace(check.BodyNotContains)
|
||||
if bodyNotContains != "" && checklistContains(result.Body, bodyNotContains) {
|
||||
return false, fmt.Sprintf("response contained forbidden marker %q", bodyNotContains)
|
||||
}
|
||||
|
||||
return true, fmt.Sprintf("status=%d", result.Status)
|
||||
}
|
||||
|
||||
// checklistHTTPProbeResult holds what a checklist HTTP probe observed: the
// response status code, the response body as a string, and the value of the
// Location response header.
type checklistHTTPProbeResult struct {
	Status         int
	Body, Location string
}
|
||||
|
||||
// httpChecklistProbeResult runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error).
|
||||
// Why: checklist checks need response headers (for redirect verification) in
|
||||
// addition to status/body so startup can validate real user-facing behavior.
|
||||
func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) {
|
||||
result := checklistHTTPProbeResult{}
|
||||
status, body, location, err := o.httpChecklistProbeWithLocation(ctx, check)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Status = status
|
||||
result.Body = body
|
||||
result.Location = location
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// httpChecklistProbe runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) {
|
||||
status, body, _, err := o.httpChecklistProbeWithLocation(ctx, check)
|
||||
return status, body, err
|
||||
}
|
||||
|
||||
// httpChecklistProbeWithLocation runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error).
|
||||
// Why: redirects and auth gates require location-header assertions to prevent
|
||||
// startup false-positives on partially healthy protected services.
|
||||
func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) {
|
||||
timeout := time.Duration(check.TimeoutSeconds) * time.Second
|
||||
if timeout <= 0 {
|
||||
timeout = 12 * time.Second
|
||||
}
|
||||
|
||||
transport := &http.Transport{}
|
||||
if check.InsecureSkipTLS {
|
||||
transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
|
||||
}
|
||||
client := &http.Client{
|
||||
Timeout: timeout,
|
||||
Transport: transport,
|
||||
CheckRedirect: func(_ *http.Request, _ []*http.Request) error {
|
||||
return http.ErrUseLastResponse
|
||||
},
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil)
|
||||
if err != nil {
|
||||
return 0, "", "", fmt.Errorf("build request: %w", err)
|
||||
}
|
||||
req.Header.Set("User-Agent", "ananke/startup-checklist")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return 0, "", "", fmt.Errorf("request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
|
||||
if readErr != nil {
|
||||
return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr)
|
||||
}
|
||||
|
||||
return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil
|
||||
}
|
||||
|
||||
// checklistContains runs one orchestration or CLI step.
|
||||
// Signature: checklistContains(body, marker string) bool.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func checklistContains(body, marker string) bool {
|
||||
bodyLower := strings.ToLower(body)
|
||||
markerLower := strings.ToLower(marker)
|
||||
if strings.Contains(bodyLower, markerLower) {
|
||||
return true
|
||||
}
|
||||
bodyCompact := compactLowerNoSpace(bodyLower)
|
||||
markerCompact := compactLowerNoSpace(markerLower)
|
||||
if markerCompact == "" {
|
||||
return true
|
||||
}
|
||||
return strings.Contains(bodyCompact, markerCompact)
|
||||
}
|
||||
|
||||
// compactLowerNoSpace returns s with every Unicode whitespace rune (as
// defined by unicode.IsSpace) removed. Despite its name it does not change
// letter case; callers are expected to pass an already lower-cased string.
func compactLowerNoSpace(s string) string {
	dropSpace := func(r rune) rune {
		if unicode.IsSpace(r) {
			return -1 // a negative result makes strings.Map drop the rune
		}
		return r
	}
	return strings.Map(dropSpace, s)
}
|
||||
|
||||
// waitForStabilityWindow runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
||||
window := time.Duration(o.cfg.Startup.ServiceChecklistStabilitySec) * time.Second
|
||||
if window <= 0 {
|
||||
return nil
|
||||
}
|
||||
poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
|
||||
if poll <= 0 {
|
||||
poll = 5 * time.Second
|
||||
}
|
||||
deadline := time.Now().Add(window)
|
||||
lastStatus := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
lastReplicaHeal := time.Time{}
|
||||
|
||||
for {
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
||||
if err := o.startupStabilityHealthy(ctx); err != nil {
|
||||
return fmt.Errorf("startup stability window failed: %w", err)
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
o.log.Printf("startup stability window passed (%s)", window)
|
||||
return nil
|
||||
}
|
||||
if time.Since(lastStatus) >= 30*time.Second {
|
||||
remaining := time.Until(deadline).Round(time.Second)
|
||||
if remaining < 0 {
|
||||
remaining = 0
|
||||
}
|
||||
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
|
||||
lastStatus = time.Now()
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(poll):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// startupStabilityHealthy runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error {
|
||||
if o.cfg.Startup.RequireFluxHealth {
|
||||
ready, detail, err := o.fluxHealthReady(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("flux check error: %w", err)
|
||||
}
|
||||
if !ready {
|
||||
return fmt.Errorf("flux not ready: %s", detail)
|
||||
}
|
||||
}
|
||||
if o.cfg.Startup.RequireWorkloadConvergence {
|
||||
ready, detail, err := o.workloadConvergenceReady(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("workload check error: %w", err)
|
||||
}
|
||||
if !ready {
|
||||
return fmt.Errorf("workloads not converged: %s", detail)
|
||||
}
|
||||
}
|
||||
if o.cfg.Startup.RequireServiceChecklist {
|
||||
ready, detail := o.serviceChecklistReady(ctx)
|
||||
if !ready {
|
||||
return fmt.Errorf("external services not healthy: %s", detail)
|
||||
}
|
||||
}
|
||||
if o.cfg.Startup.RequireIngressChecklist {
|
||||
ready, detail := o.ingressChecklistReady(ctx)
|
||||
if !ready {
|
||||
return fmt.Errorf("ingress reachability not healthy: %s", detail)
|
||||
}
|
||||
}
|
||||
failures, err := o.startupFailurePods(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("pod failure check error: %w", err)
|
||||
}
|
||||
if len(failures) > 0 {
|
||||
return fmt.Errorf("pods in crash/image-pull failures: %s", joinLimited(failures, 8))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@ -15,6 +15,9 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestParseVaultSealed runs one orchestration or CLI step.
|
||||
// Signature: TestParseVaultSealed(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestParseVaultSealed(t *testing.T) {
|
||||
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
|
||||
if err != nil {
|
||||
@ -33,12 +36,18 @@ func TestParseVaultSealed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseVaultSealedRejectsEmpty runs one orchestration or CLI step.
|
||||
// Signature: TestParseVaultSealedRejectsEmpty(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestParseVaultSealedRejectsEmpty(t *testing.T) {
|
||||
if _, err := parseVaultSealed(" "); err == nil {
|
||||
t.Fatalf("expected parse error for empty status payload")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseVaultSealedWithKubectlPreamble runs one orchestration or CLI step.
|
||||
// Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
|
||||
raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n"
|
||||
sealed, err := parseVaultSealed(raw)
|
||||
@ -50,6 +59,9 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestFallbackWorkersFromInventoryUsesManagedNodes runs one orchestration or CLI step.
|
||||
// Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
@ -70,6 +82,9 @@ func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestFallbackWorkersFromInventoryFallsBackToHosts runs one orchestration or CLI step.
|
||||
// Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
@ -89,12 +104,18 @@ func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntentFreshTreatsZeroTimestampAsFresh runs one orchestration or CLI step.
|
||||
// Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) {
|
||||
if !intentFresh(state.Intent{}, 30*time.Second) {
|
||||
t.Fatalf("zero updated_at intent should be treated as fresh")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntentFreshRespectsAge runs one orchestration or CLI step.
|
||||
// Signature: TestIntentFreshRespectsAge(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestIntentFreshRespectsAge(t *testing.T) {
|
||||
stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)}
|
||||
fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)}
|
||||
@ -106,6 +127,9 @@ func TestIntentFreshRespectsAge(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoordinationPeersDedupesAndIncludesForwardHost runs one orchestration or CLI step.
|
||||
// Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
@ -122,6 +146,9 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkloadTargetsIgnoredNodesByNodeSelector runs one orchestration or CLI step.
|
||||
// Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
|
||||
spec := podSpec{
|
||||
NodeSelector: map[string]string{
|
||||
@ -134,6 +161,9 @@ func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseWorkloadIgnoreRules runs one orchestration or CLI step.
|
||||
// Signature: TestParseWorkloadIgnoreRules(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestParseWorkloadIgnoreRules(t *testing.T) {
|
||||
rules := parseWorkloadIgnoreRules([]string{
|
||||
"maintenance/metis",
|
||||
@ -153,6 +183,9 @@ func TestParseWorkloadIgnoreRules(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestNamespaceCandidatesFromIgnoreKustomizations runs one orchestration or CLI step.
|
||||
// Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
|
||||
got := namespaceCandidatesFromIgnoreKustomizations([]string{
|
||||
"flux-system/jellyfin",
|
||||
@ -166,12 +199,18 @@ func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestProbeStatusAcceptedRejects404 runs one orchestration or CLI step.
|
||||
// Signature: TestProbeStatusAcceptedRejects404(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestProbeStatusAcceptedRejects404(t *testing.T) {
|
||||
if probeStatusAccepted("https://metrics.bstein.dev/login", 404) {
|
||||
t.Fatalf("expected 404 probe status to be rejected")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseFluxKustomizationTimeout runs one orchestration or CLI step.
|
||||
// Signature: TestParseFluxKustomizationTimeout(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestParseFluxKustomizationTimeout(t *testing.T) {
|
||||
if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute {
|
||||
t.Fatalf("expected 30m duration, got %s", got)
|
||||
@ -187,6 +226,9 @@ func TestParseFluxKustomizationTimeout(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step.
|
||||
// Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
@ -209,6 +251,9 @@ func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step.
|
||||
// Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
@ -231,6 +276,62 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestServiceCheckReadyRequiresLocationContains runs one orchestration or CLI step.
|
||||
// Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T).
|
||||
// Why: startup checks must validate redirect targets for OIDC-gated services.
|
||||
func TestServiceCheckReadyRequiresLocationContains(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs")
|
||||
w.WriteHeader(http.StatusFound)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
orch := &Orchestrator{
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
|
||||
Name: "logging-oidc-redirect",
|
||||
URL: srv.URL,
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "client_id=logs",
|
||||
TimeoutSeconds: 5,
|
||||
})
|
||||
if !ok {
|
||||
t.Fatalf("expected location-aware service check to pass, detail=%s", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// TestServiceCheckReadyRejectsMissingLocationMarker runs one orchestration or CLI step.
|
||||
// Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T).
|
||||
// Why: prevents false positives when redirects point somewhere unexpected.
|
||||
func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong")
|
||||
w.WriteHeader(http.StatusFound)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
orch := &Orchestrator{
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
|
||||
Name: "logging-oidc-redirect",
|
||||
URL: srv.URL,
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "client_id=logs",
|
||||
TimeoutSeconds: 5,
|
||||
})
|
||||
if ok {
|
||||
t.Fatalf("expected location-aware service check to fail")
|
||||
}
|
||||
if !strings.Contains(detail, "location header missing expected marker") {
|
||||
t.Fatalf("expected missing location marker detail, got %q", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step.
|
||||
// Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
|
||||
orch := &Orchestrator{}
|
||||
got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500")
|
||||
@ -239,6 +340,9 @@ func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestChecklistFailureHostFromServiceCheckName runs one orchestration or CLI step.
|
||||
// Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
@ -258,6 +362,9 @@ func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestChecklistFailureHostUnknown runs one orchestration or CLI step.
|
||||
// Signature: TestChecklistFailureHostUnknown(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestChecklistFailureHostUnknown(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
@ -279,6 +386,9 @@ func TestChecklistFailureHostUnknown(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
|
||||
// Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
|
||||
var pod podResource
|
||||
pod.Status.Phase = "Pending"
|
||||
@ -302,6 +412,9 @@ func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
|
||||
// Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
|
||||
var pod podResource
|
||||
pod.Status.Phase = "Pending"
|
||||
@ -328,70 +441,3 @@ func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
|
||||
t.Fatalf("expected no reason for non-vault pod, got %q", reason)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateNodeInventoryPassesForStrictMappings(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
SSHUser: "atlas",
|
||||
SSHPort: 2277,
|
||||
SSHNodeHosts: map[string]string{
|
||||
"titan-0a": "192.168.22.11",
|
||||
"titan-0b": "192.168.22.12",
|
||||
"titan-0c": "192.168.22.13",
|
||||
"titan-22": "192.168.22.22",
|
||||
},
|
||||
SSHManagedNodes: []string{"titan-0a", "titan-0b", "titan-0c", "titan-22"},
|
||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||
Workers: []string{"titan-22"},
|
||||
},
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
if err := orch.validateNodeInventory(); err != nil {
|
||||
t.Fatalf("expected inventory to pass, got error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateNodeInventoryFailsWhenNodeMappingMissing(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
SSHUser: "atlas",
|
||||
SSHPort: 2277,
|
||||
SSHNodeHosts: map[string]string{"titan-0a": "192.168.22.11"},
|
||||
SSHManagedNodes: []string{"titan-0a", "titan-0b"},
|
||||
ControlPlanes: []string{"titan-0a"},
|
||||
Workers: []string{"titan-0b"},
|
||||
},
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
err := orch.validateNodeInventory()
|
||||
if err == nil {
|
||||
t.Fatalf("expected inventory error for missing mapping")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "missing ssh_node_hosts entry") {
|
||||
t.Fatalf("expected missing-mapping detail, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateNodeInventoryFailsWhenWorkerNotManaged(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
SSHUser: "atlas",
|
||||
SSHPort: 2277,
|
||||
SSHNodeHosts: map[string]string{
|
||||
"titan-0a": "192.168.22.11",
|
||||
"titan-22": "192.168.22.22",
|
||||
},
|
||||
SSHManagedNodes: []string{"titan-0a"},
|
||||
ControlPlanes: []string{"titan-0a"},
|
||||
Workers: []string{"titan-22"},
|
||||
},
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
err := orch.validateNodeInventory()
|
||||
if err == nil {
|
||||
t.Fatalf("expected inventory error for unmanaged worker")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "missing from ssh_managed_nodes") {
|
||||
t.Fatalf("expected unmanaged-worker detail, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
236
internal/config/apply_defaults.go
Normal file
236
internal/config/apply_defaults.go
Normal file
@ -0,0 +1,236 @@
|
||||
package config
|
||||
|
||||
import "strings"
|
||||
|
||||
// applyDefaults runs one orchestration or CLI step.
|
||||
// Signature: (c *Config) applyDefaults().
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func (c *Config) applyDefaults() {
|
||||
if c.ExpectedFluxBranch == "" {
|
||||
c.ExpectedFluxBranch = "main"
|
||||
}
|
||||
if c.IACRepoPath == "" {
|
||||
c.IACRepoPath = "/opt/titan-iac"
|
||||
}
|
||||
if c.ExpectedFluxSource == "" {
|
||||
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
|
||||
}
|
||||
if c.Startup.APIWaitSeconds <= 0 {
|
||||
c.Startup.APIWaitSeconds = 1200
|
||||
}
|
||||
if c.Startup.APIPollSeconds <= 0 {
|
||||
c.Startup.APIPollSeconds = 2
|
||||
}
|
||||
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
||||
c.Startup.ShutdownCooldownSeconds = 45
|
||||
}
|
||||
if c.Startup.MinimumBatteryPercent <= 0 {
|
||||
c.Startup.MinimumBatteryPercent = 20
|
||||
}
|
||||
if c.Startup.NodeInventoryReachWaitSeconds <= 0 {
|
||||
c.Startup.NodeInventoryReachWaitSeconds = 300
|
||||
}
|
||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||
c.Startup.NodeInventoryReachPollSeconds = 5
|
||||
}
|
||||
if c.Startup.RequiredNodeLabels == nil {
|
||||
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||
},
|
||||
}
|
||||
}
|
||||
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
||||
c.Startup.TimeSyncWaitSeconds = 240
|
||||
}
|
||||
if c.Startup.TimeSyncPollSeconds <= 0 {
|
||||
c.Startup.TimeSyncPollSeconds = 5
|
||||
}
|
||||
if c.Startup.TimeSyncMode == "" {
|
||||
c.Startup.TimeSyncMode = "quorum"
|
||||
}
|
||||
if c.Startup.TimeSyncQuorum <= 0 {
|
||||
c.Startup.TimeSyncQuorum = 2
|
||||
}
|
||||
if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
|
||||
c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
|
||||
}
|
||||
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
|
||||
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
|
||||
}
|
||||
if c.Startup.StorageReadyWaitSeconds <= 0 {
|
||||
c.Startup.StorageReadyWaitSeconds = 420
|
||||
}
|
||||
if c.Startup.StorageReadyPollSeconds <= 0 {
|
||||
c.Startup.StorageReadyPollSeconds = 5
|
||||
}
|
||||
if c.Startup.StorageMinReadyNodes <= 0 {
|
||||
c.Startup.StorageMinReadyNodes = 2
|
||||
}
|
||||
if len(c.Startup.StorageCriticalPVCs) == 0 {
|
||||
c.Startup.StorageCriticalPVCs = []string{
|
||||
"vault/data-vault-0",
|
||||
"postgres/postgres-data-postgres-0",
|
||||
"gitea/gitea-data",
|
||||
"sso/keycloak-data",
|
||||
}
|
||||
}
|
||||
if c.Startup.PostStartProbeWaitSeconds <= 0 {
|
||||
c.Startup.PostStartProbeWaitSeconds = 240
|
||||
}
|
||||
if c.Startup.PostStartProbePollSeconds <= 0 {
|
||||
c.Startup.PostStartProbePollSeconds = 5
|
||||
}
|
||||
if len(c.Startup.PostStartProbes) == 0 {
|
||||
c.Startup.PostStartProbes = []string{
|
||||
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||
"https://scm.bstein.dev/api/healthz",
|
||||
"https://metrics.bstein.dev/api/health",
|
||||
}
|
||||
}
|
||||
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
||||
c.Startup.ServiceChecklistWaitSeconds = 420
|
||||
}
|
||||
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
||||
c.Startup.ServiceChecklistPollSeconds = 5
|
||||
}
|
||||
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
||||
c.Startup.ServiceChecklistStabilitySec = 0
|
||||
}
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||
for i := range c.Startup.ServiceChecklist {
|
||||
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||
}
|
||||
}
|
||||
if c.Startup.CriticalServiceEndpointWaitSec <= 0 {
|
||||
c.Startup.CriticalServiceEndpointWaitSec = 420
|
||||
}
|
||||
if c.Startup.CriticalServiceEndpointPollSec <= 0 {
|
||||
c.Startup.CriticalServiceEndpointPollSec = 5
|
||||
}
|
||||
c.Startup.CriticalServiceEndpoints = mergeStringDefaults(c.Startup.CriticalServiceEndpoints, defaultCriticalServiceEndpoints())
|
||||
if c.Startup.IngressChecklistWaitSeconds <= 0 {
|
||||
c.Startup.IngressChecklistWaitSeconds = 420
|
||||
}
|
||||
if c.Startup.IngressChecklistPollSeconds <= 0 {
|
||||
c.Startup.IngressChecklistPollSeconds = 5
|
||||
}
|
||||
if len(c.Startup.IngressChecklistAccepted) == 0 {
|
||||
c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404}
|
||||
}
|
||||
if c.Startup.IngressChecklistIgnoreHosts == nil {
|
||||
c.Startup.IngressChecklistIgnoreHosts = []string{}
|
||||
}
|
||||
if c.Startup.NodeSSHAuthWaitSeconds <= 0 {
|
||||
c.Startup.NodeSSHAuthWaitSeconds = 240
|
||||
}
|
||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||
c.Startup.NodeSSHAuthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
c.Startup.FluxHealthWaitSeconds = 900
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
c.Startup.FluxHealthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||
c.Startup.WorkloadConvergenceWaitSeconds = 900
|
||||
}
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||
}
|
||||
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreWorkloads == nil {
|
||||
c.Startup.IgnoreWorkloads = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreUnavailableNodes == nil {
|
||||
c.Startup.IgnoreUnavailableNodes = []string{}
|
||||
}
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
c.Startup.StuckPodGraceSeconds = 180
|
||||
}
|
||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
||||
}
|
||||
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
|
||||
c.Startup.VaultUnsealBreakglassTimeout = 15
|
||||
}
|
||||
if c.SSHPort <= 0 {
|
||||
c.SSHPort = 2277
|
||||
}
|
||||
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
||||
c.Shutdown.DefaultBudgetSeconds = 1380
|
||||
}
|
||||
if c.Shutdown.HistoryMinSamples <= 0 {
|
||||
c.Shutdown.HistoryMinSamples = 3
|
||||
}
|
||||
if c.Shutdown.EmergencyBudgetSec <= 0 {
|
||||
c.Shutdown.EmergencyBudgetSec = 420
|
||||
}
|
||||
if c.Shutdown.EmergencyMinSamples <= 0 {
|
||||
c.Shutdown.EmergencyMinSamples = 3
|
||||
}
|
||||
if c.Shutdown.DrainParallelism <= 0 {
|
||||
c.Shutdown.DrainParallelism = 6
|
||||
}
|
||||
if c.Shutdown.ScaleParallelism <= 0 {
|
||||
c.Shutdown.ScaleParallelism = 8
|
||||
}
|
||||
if c.Shutdown.SSHParallelism <= 0 {
|
||||
c.Shutdown.SSHParallelism = 8
|
||||
}
|
||||
if c.UPS.PollSeconds <= 0 {
|
||||
c.UPS.PollSeconds = 5
|
||||
}
|
||||
if c.UPS.RuntimeSafetyFactor <= 0 {
|
||||
c.UPS.RuntimeSafetyFactor = 1.25
|
||||
}
|
||||
if c.UPS.DebounceCount <= 0 {
|
||||
c.UPS.DebounceCount = 3
|
||||
}
|
||||
if c.UPS.TelemetryTimeoutSeconds <= 0 {
|
||||
c.UPS.TelemetryTimeoutSeconds = 90
|
||||
}
|
||||
if c.Coordination.ForwardShutdownConfig == "" {
|
||||
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
||||
}
|
||||
if c.Coordination.PeerHosts == nil {
|
||||
c.Coordination.PeerHosts = []string{}
|
||||
}
|
||||
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
||||
c.Coordination.CommandTimeoutSeconds = 25
|
||||
}
|
||||
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
||||
c.Coordination.StartupGuardMaxAgeSec = 900
|
||||
}
|
||||
if c.Coordination.Role == "" {
|
||||
c.Coordination.Role = "coordinator"
|
||||
}
|
||||
if c.Metrics.BindAddr == "" {
|
||||
c.Metrics.BindAddr = "0.0.0.0:9560"
|
||||
}
|
||||
if c.Metrics.Path == "" {
|
||||
c.Metrics.Path = "/metrics"
|
||||
}
|
||||
if c.State.Dir == "" {
|
||||
c.State.Dir = "/var/lib/ananke"
|
||||
}
|
||||
if c.State.ReportsDir == "" {
|
||||
c.State.ReportsDir = "/var/lib/ananke/reports"
|
||||
}
|
||||
if c.State.RunHistoryPath == "" {
|
||||
c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
|
||||
}
|
||||
if c.State.LockPath == "" {
|
||||
c.State.LockPath = "/var/lib/ananke/ananke.lock"
|
||||
}
|
||||
if c.State.IntentPath == "" {
|
||||
c.State.IntentPath = "/var/lib/ananke/intent.json"
|
||||
}
|
||||
}
|
||||
@ -7,6 +7,9 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestLoadAcceptsUPSTargets runs one orchestration or CLI step.
|
||||
// Signature: TestLoadAcceptsUPSTargets(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestLoadAcceptsUPSTargets(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
cfgPath := filepath.Join(tmp, "ananke.yaml")
|
||||
@ -39,6 +42,9 @@ state:
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateForwardShutdownRequiresConfigPath runs one orchestration or CLI step.
|
||||
// Signature: TestValidateForwardShutdownRequiresConfigPath(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Coordination.ForwardShutdownHost = "titan-db"
|
||||
@ -48,6 +54,9 @@ func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsUnknownRole runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsUnknownRole(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsUnknownRole(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Coordination.Role = "unknown"
|
||||
@ -56,6 +65,9 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsEmptyPeerHostEntry runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsEmptyPeerHostEntry(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Coordination.PeerHosts = []string{"titan-24", " "}
|
||||
@ -64,6 +76,9 @@ func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsUnknownEtcdRestoreControlPlane runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
|
||||
@ -72,6 +87,9 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadSetsCoordinationGuardDefaults runs one orchestration or CLI step.
|
||||
// Signature: TestLoadSetsCoordinationGuardDefaults(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
cfgPath := filepath.Join(tmp, "ananke.yaml")
|
||||
@ -114,6 +132,9 @@ state:
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsInvalidStartupShutdownCooldown runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.ShutdownCooldownSeconds = 0
|
||||
@ -122,6 +143,9 @@ func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsInvalidTimeSyncMode runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsInvalidTimeSyncMode(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.TimeSyncMode = "invalid"
|
||||
@ -130,6 +154,9 @@ func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsBadStoragePVCFormat runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsBadStoragePVCFormat(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"}
|
||||
@ -138,6 +165,9 @@ func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsMissingPostStartProbesWhenRequired runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.RequirePostStartProbes = true
|
||||
@ -147,6 +177,9 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsMissingServiceChecklistWhenRequired runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.RequireServiceChecklist = true
|
||||
@ -156,6 +189,9 @@ func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsBadServiceChecklistURL runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsBadServiceChecklistURL(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
|
||||
@ -171,6 +207,9 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}
|
||||
@ -179,6 +218,9 @@ func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsBadIgnoreWorkloadFormat runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"}
|
||||
@ -187,6 +229,9 @@ func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsInvalidRequiredNodeLabel runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
@ -198,3 +243,85 @@ func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
|
||||
t.Fatalf("expected validation error for invalid required_node_labels entry")
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsInvalidNodeInventoryReachWindow runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.NodeInventoryReachWaitSeconds = 0
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for invalid node_inventory_reachability_wait_seconds")
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateRejectsMissingReportsDir runs one orchestration or CLI step.
|
||||
// Signature: TestValidateRejectsMissingReportsDir(t *testing.T).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func TestValidateRejectsMissingReportsDir(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.State.ReportsDir = ""
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for missing state.reports_dir")
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyDefaultsMergesServiceChecklistDefaults runs one orchestration or CLI step.
|
||||
// Signature: TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T).
|
||||
// Why: host configs may define a partial checklist; startup still needs the
|
||||
// baseline service validations learned from drills.
|
||||
func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) {
|
||||
cfg := Config{
|
||||
Startup: Startup{
|
||||
ServiceChecklist: []ServiceChecklistCheck{
|
||||
{
|
||||
Name: "custom-smoke",
|
||||
URL: "https://example.invalid/healthz",
|
||||
TimeoutSeconds: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg.applyDefaults()
|
||||
|
||||
names := map[string]struct{}{}
|
||||
for _, check := range cfg.Startup.ServiceChecklist {
|
||||
names[check.Name] = struct{}{}
|
||||
}
|
||||
if _, ok := names["custom-smoke"]; !ok {
|
||||
t.Fatalf("expected custom checklist entry to be preserved")
|
||||
}
|
||||
if _, ok := names["logging-oidc-redirect"]; !ok {
|
||||
t.Fatalf("expected default logging redirect check to be merged in")
|
||||
}
|
||||
if _, ok := names["vaultwarden-ui"]; !ok {
|
||||
t.Fatalf("expected default vaultwarden check to be merged in")
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyDefaultsMergesCriticalServiceEndpointDefaults runs one orchestration or CLI step.
|
||||
// Signature: TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T).
|
||||
// Why: startup endpoint gating must keep baseline backend checks even when host
|
||||
// configs only provide a subset.
|
||||
func TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T) {
|
||||
cfg := Config{
|
||||
Startup: Startup{
|
||||
CriticalServiceEndpoints: []string{"customns/customsvc"},
|
||||
},
|
||||
}
|
||||
cfg.applyDefaults()
|
||||
|
||||
seen := map[string]struct{}{}
|
||||
for _, entry := range cfg.Startup.CriticalServiceEndpoints {
|
||||
seen[entry] = struct{}{}
|
||||
}
|
||||
if _, ok := seen["customns/customsvc"]; !ok {
|
||||
t.Fatalf("expected custom critical endpoint to be preserved")
|
||||
}
|
||||
if _, ok := seen["logging/opensearch-dashboards"]; !ok {
|
||||
t.Fatalf("expected logging/opensearch-dashboards critical endpoint default")
|
||||
}
|
||||
if _, ok := seen["monitoring/victoria-metrics-single-server"]; !ok {
|
||||
t.Fatalf("expected monitoring/victoria-metrics-single-server critical endpoint default")
|
||||
}
|
||||
}
|
||||
|
||||
155
internal/config/defaults.go
Normal file
155
internal/config/defaults.go
Normal file
@ -0,0 +1,155 @@
|
||||
package config
|
||||
|
||||
// defaults runs one orchestration or CLI step.
|
||||
// Signature: defaults() Config.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func defaults() Config {
|
||||
c := Config{
|
||||
IACRepoPath: "/opt/titan-iac",
|
||||
ExpectedFluxBranch: "main",
|
||||
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
|
||||
SSHPort: 2277,
|
||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||
LocalBootstrapPaths: []string{
|
||||
"infrastructure/core",
|
||||
"clusters/atlas/flux-system",
|
||||
"infrastructure/sources/helm",
|
||||
"infrastructure/metallb",
|
||||
"infrastructure/traefik",
|
||||
"infrastructure/cert-manager",
|
||||
"infrastructure/vault-csi",
|
||||
"infrastructure/vault-injector",
|
||||
"services/vault",
|
||||
"infrastructure/postgres",
|
||||
"services/gitea",
|
||||
"services/keycloak",
|
||||
"services/oauth2-proxy",
|
||||
},
|
||||
ExcludedNamespaces: []string{
|
||||
"kube-system",
|
||||
"kube-public",
|
||||
"kube-node-lease",
|
||||
"flux-system",
|
||||
"traefik",
|
||||
"metallb-system",
|
||||
"cert-manager",
|
||||
"longhorn-system",
|
||||
"vault",
|
||||
"postgres",
|
||||
"maintenance",
|
||||
},
|
||||
Startup: Startup{
|
||||
APIWaitSeconds: 1200,
|
||||
APIPollSeconds: 2,
|
||||
ShutdownCooldownSeconds: 45,
|
||||
RequireNodeInventoryReach: true,
|
||||
NodeInventoryReachWaitSeconds: 300,
|
||||
NodeInventoryReachPollSeconds: 5,
|
||||
RequireTimeSync: true,
|
||||
TimeSyncWaitSeconds: 240,
|
||||
TimeSyncPollSeconds: 5,
|
||||
TimeSyncMode: "quorum",
|
||||
TimeSyncQuorum: 2,
|
||||
ReconcileAccessOnBoot: true,
|
||||
AutoEtcdRestoreOnAPIFailure: true,
|
||||
EtcdRestoreControlPlane: "titan-0a",
|
||||
RequireStorageReady: true,
|
||||
StorageReadyWaitSeconds: 420,
|
||||
StorageReadyPollSeconds: 5,
|
||||
StorageMinReadyNodes: 2,
|
||||
StorageCriticalPVCs: []string{
|
||||
"vault/data-vault-0",
|
||||
"postgres/postgres-data-postgres-0",
|
||||
"gitea/gitea-data",
|
||||
"sso/keycloak-data",
|
||||
},
|
||||
MinimumBatteryPercent: 20,
|
||||
RequiredNodeLabels: map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||
},
|
||||
},
|
||||
RequirePostStartProbes: true,
|
||||
PostStartProbeWaitSeconds: 240,
|
||||
PostStartProbePollSeconds: 5,
|
||||
PostStartProbes: []string{
|
||||
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||
"https://scm.bstein.dev/api/healthz",
|
||||
"https://metrics.bstein.dev/api/health",
|
||||
},
|
||||
RequireServiceChecklist: true,
|
||||
ServiceChecklistWaitSeconds: 420,
|
||||
ServiceChecklistPollSeconds: 5,
|
||||
ServiceChecklistStabilitySec: 120,
|
||||
ServiceChecklist: defaultServiceChecklist(),
|
||||
RequireCriticalServiceEndpoints: true,
|
||||
CriticalServiceEndpointWaitSec: 420,
|
||||
CriticalServiceEndpointPollSec: 5,
|
||||
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
||||
RequireIngressChecklist: true,
|
||||
IngressChecklistWaitSeconds: 420,
|
||||
IngressChecklistPollSeconds: 5,
|
||||
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
||||
IngressChecklistIgnoreHosts: []string{},
|
||||
RequireNodeSSHAuth: true,
|
||||
NodeSSHAuthWaitSeconds: 240,
|
||||
NodeSSHAuthPollSeconds: 5,
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
FluxHealthPollSeconds: 5,
|
||||
IgnoreFluxKustomizations: []string{},
|
||||
RequireWorkloadConvergence: true,
|
||||
WorkloadConvergenceWaitSeconds: 900,
|
||||
WorkloadConvergencePollSeconds: 5,
|
||||
IgnoreWorkloadNamespaces: []string{},
|
||||
IgnoreWorkloads: []string{},
|
||||
IgnoreUnavailableNodes: []string{},
|
||||
AutoRecycleStuckPods: true,
|
||||
StuckPodGraceSeconds: 180,
|
||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||
VaultUnsealBreakglassTimeout: 15,
|
||||
},
|
||||
Shutdown: Shutdown{
|
||||
DefaultBudgetSeconds: 1380,
|
||||
HistoryMinSamples: 3,
|
||||
EmergencyBudgetSec: 420,
|
||||
EmergencyMinSamples: 3,
|
||||
EmergencySkipEtcd: true,
|
||||
EmergencySkipDrain: true,
|
||||
DrainParallelism: 6,
|
||||
ScaleParallelism: 8,
|
||||
SSHParallelism: 8,
|
||||
},
|
||||
UPS: UPS{
|
||||
Enabled: true,
|
||||
Provider: "nut",
|
||||
PollSeconds: 5,
|
||||
RuntimeSafetyFactor: 1.25,
|
||||
DebounceCount: 3,
|
||||
TelemetryTimeoutSeconds: 90,
|
||||
},
|
||||
Coordination: Coordination{
|
||||
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
|
||||
PeerHosts: []string{},
|
||||
FallbackLocalShutdown: true,
|
||||
CommandTimeoutSeconds: 25,
|
||||
StartupGuardMaxAgeSec: 900,
|
||||
Role: "coordinator",
|
||||
AllowStartupOnBattery: false,
|
||||
},
|
||||
Metrics: Metrics{
|
||||
Enabled: true,
|
||||
BindAddr: "0.0.0.0:9560",
|
||||
Path: "/metrics",
|
||||
},
|
||||
State: State{
|
||||
Dir: "/var/lib/ananke",
|
||||
ReportsDir: "/var/lib/ananke/reports",
|
||||
RunHistoryPath: "/var/lib/ananke/runs.json",
|
||||
LockPath: "/var/lib/ananke/ananke.lock",
|
||||
IntentPath: "/var/lib/ananke/intent.json",
|
||||
},
|
||||
}
|
||||
c.applyDefaults()
|
||||
return c
|
||||
}
|
||||
315
internal/config/startup_service_catalog.go
Normal file
315
internal/config/startup_service_catalog.go
Normal file
@ -0,0 +1,315 @@
|
||||
package config
|
||||
|
||||
import "strings"
|
||||
|
||||
// defaultServiceChecklist runs one orchestration or CLI step.
|
||||
// Signature: defaultServiceChecklist() []ServiceChecklistCheck.
|
||||
// Why: startup must verify real external behavior per service (not only generic
|
||||
// ingress reachability) so false positives do not pass drills.
|
||||
func defaultServiceChecklist() []ServiceChecklistCheck {
|
||||
return []ServiceChecklistCheck{
|
||||
{
|
||||
Name: "gitea-api",
|
||||
URL: "https://scm.bstein.dev/api/healthz",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "pass",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "grafana-api",
|
||||
URL: "https://metrics.bstein.dev/api/health",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "\"database\":\"ok\"",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "keycloak-oidc",
|
||||
URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "harbor-registry-api",
|
||||
URL: "https://registry.bstein.dev/v2/",
|
||||
AcceptedStatuses: []int{401},
|
||||
BodyContains: "unauthorized",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "alerts-ui",
|
||||
URL: "https://alerts.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "Alertmanager",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "auth-gateway-redirect",
|
||||
URL: "https://auth.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "https://sso.bstein.dev/realms/atlas/",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "home-site",
|
||||
URL: "https://bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "Titan Lab",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "actual-budget-ui",
|
||||
URL: "https://budget.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>Actual",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "element-call-ui",
|
||||
URL: "https://call.live.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "Element Call",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "flux-gitops-ui",
|
||||
URL: "https://cd.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "Weave GitOps",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "chat-ai-health",
|
||||
URL: "https://chat.ai.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "\"ok\": true",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "jenkins-auth-gate",
|
||||
URL: "https://ci.bstein.dev/",
|
||||
AcceptedStatuses: []int{403},
|
||||
BodyContains: "commenceLogin",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "nextcloud-login-redirect",
|
||||
URL: "https://cloud.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "/index.php/login",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "wger-redirect",
|
||||
URL: "https://health.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "/en/",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "livekit-edge",
|
||||
URL: "https://kit.live.bstein.dev/",
|
||||
AcceptedStatuses: []int{404},
|
||||
BodyContains: "404 page not found",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "element-web-ui",
|
||||
URL: "https://live.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>Element</title>",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "logging-oidc-redirect",
|
||||
URL: "https://logs.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "client_id=logs",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "longhorn-oidc-redirect",
|
||||
URL: "https://longhorn.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "https://sso.bstein.dev/realms/atlas/",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "matrix-auth-ui",
|
||||
URL: "https://matrix.live.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "matrix-authentication-service",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "monero-edge",
|
||||
URL: "https://monero.bstein.dev/",
|
||||
AcceptedStatuses: []int{404},
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "firefly-login-redirect",
|
||||
URL: "https://money.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "/login",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "outline-ui",
|
||||
URL: "https://notes.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>Outline</title>",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "collabora-probe",
|
||||
URL: "https://office.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "OK",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "pegasus-ui",
|
||||
URL: "https://pegasus.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>Pegasus</title>",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "harbor-ui",
|
||||
URL: "https://registry.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>Harbor</title>",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "vault-ui-redirect",
|
||||
URL: "https://secret.bstein.dev/",
|
||||
AcceptedStatuses: []int{307},
|
||||
LocationContains: "/ui/",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "sentinel-oidc-redirect",
|
||||
URL: "https://sentinel.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "client_id=metis",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "keycloak-admin-redirect",
|
||||
URL: "https://sso.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "https://sso.bstein.dev/admin/",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "jellyfin-edge",
|
||||
URL: "https://stream.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
LocationContains: "web/",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "planka-ui",
|
||||
URL: "https://tasks.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>PLANKA</title>",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "vaultwarden-ui",
|
||||
URL: "https://vault.bstein.dev/",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "<title>Vaultwarden Web</title>",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// defaultCriticalServiceEndpoints returns the built-in list of
// "namespace/name" service references.
// Signature: defaultCriticalServiceEndpoints() []string.
// Why: service edge checks are insufficient for protected stacks; endpoint
// presence verifies that backends are actually routable before startup success.
func defaultCriticalServiceEndpoints() []string {
	monitoring := []string{
		"monitoring/victoria-metrics-single-server",
		"monitoring/grafana",
		"monitoring/kube-state-metrics",
	}
	logging := []string{
		"logging/oauth2-proxy-logs",
		"logging/opensearch-dashboards",
		"logging/opensearch-master",
	}

	// Monitoring entries come first, then logging; each call returns a fresh slice.
	endpoints := make([]string, 0, len(monitoring)+len(logging))
	endpoints = append(endpoints, monitoring...)
	return append(endpoints, logging...)
}
|
||||
|
||||
// mergeServiceChecklistDefaults runs one orchestration or CLI step.
|
||||
// Signature: mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck.
|
||||
// Why: host configs can keep custom checks while still inheriting mandatory
|
||||
// baseline checks introduced after incident learnings.
|
||||
func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck {
|
||||
if len(existing) == 0 {
|
||||
out := make([]ServiceChecklistCheck, 0, len(defaults))
|
||||
out = append(out, defaults...)
|
||||
return out
|
||||
}
|
||||
|
||||
byName := map[string]struct{}{}
|
||||
for _, check := range existing {
|
||||
name := strings.TrimSpace(check.Name)
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
byName[name] = struct{}{}
|
||||
}
|
||||
|
||||
out := make([]ServiceChecklistCheck, 0, len(existing)+len(defaults))
|
||||
out = append(out, existing...)
|
||||
for _, check := range defaults {
|
||||
name := strings.TrimSpace(check.Name)
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
if _, exists := byName[name]; exists {
|
||||
continue
|
||||
}
|
||||
out = append(out, check)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// mergeStringDefaults combines configured strings with baseline defaults.
// Signature: mergeStringDefaults(existing, defaults []string) []string.
// Why: keeps baseline startup guards applied while preserving site-specific
// additions already declared in host configs.
//
// When existing is empty the defaults are copied verbatim (no trimming, no
// de-duplication). Otherwise entries from existing and then defaults are
// whitespace-trimmed, blank entries are dropped, and only the first occurrence
// of each value is kept, in encounter order.
func mergeStringDefaults(existing, defaults []string) []string {
	if len(existing) == 0 {
		merged := make([]string, len(defaults))
		copy(merged, defaults)
		return merged
	}

	merged := make([]string, 0, len(existing)+len(defaults))
	seen := make(map[string]struct{}, len(existing)+len(defaults))
	// Host entries are walked first so they keep their position ahead of defaults.
	for _, source := range [][]string{existing, defaults} {
		for _, raw := range source {
			key := strings.TrimSpace(raw)
			if key == "" {
				continue
			}
			if _, dup := seen[key]; dup {
				continue
			}
			seen[key] = struct{}{}
			merged = append(merged, key)
		}
	}
	return merged
}
|
||||
156
internal/config/types.go
Normal file
156
internal/config/types.go
Normal file
@ -0,0 +1,156 @@
|
||||
package config
|
||||
|
||||
type Config struct {
|
||||
Kubeconfig string `yaml:"kubeconfig"`
|
||||
SSHUser string `yaml:"ssh_user"`
|
||||
SSHPort int `yaml:"ssh_port"`
|
||||
SSHConfigFile string `yaml:"ssh_config_file"`
|
||||
SSHIdentityFile string `yaml:"ssh_identity_file"`
|
||||
SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"`
|
||||
SSHNodeUsers map[string]string `yaml:"ssh_node_users"`
|
||||
SSHManagedNodes []string `yaml:"ssh_managed_nodes"`
|
||||
SSHJumpHost string `yaml:"ssh_jump_host"`
|
||||
SSHJumpUser string `yaml:"ssh_jump_user"`
|
||||
IACRepoPath string `yaml:"iac_repo_path"`
|
||||
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
|
||||
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
|
||||
ControlPlanes []string `yaml:"control_planes"`
|
||||
Workers []string `yaml:"workers"`
|
||||
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
|
||||
ExcludedNamespaces []string `yaml:"excluded_namespaces"`
|
||||
Startup Startup `yaml:"startup"`
|
||||
Shutdown Shutdown `yaml:"shutdown"`
|
||||
UPS UPS `yaml:"ups"`
|
||||
Coordination Coordination `yaml:"coordination"`
|
||||
Metrics Metrics `yaml:"metrics"`
|
||||
State State `yaml:"state"`
|
||||
}
|
||||
|
||||
type Startup struct {
|
||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||
PostStartProbes []string `yaml:"post_start_probes"`
|
||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
||||
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
||||
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
||||
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
||||
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
||||
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
||||
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
||||
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||
}
|
||||
|
||||
// ServiceChecklistCheck describes one entry of the startup service checklist:
// a named URL plus the expectations placed on the response.
// NOTE(review): how the expectations are evaluated is not visible in this
// file; the field comments restate the names and should be confirmed against
// the checker.
type ServiceChecklistCheck struct {
	// Identity and target.
	Name string `yaml:"name"`
	URL  string `yaml:"url"`

	// Expected status codes.
	AcceptedStatuses []int `yaml:"accepted_statuses"`

	// Substring expectations on the Location header and on the body.
	LocationContains    string `yaml:"location_contains"`
	LocationNotContains string `yaml:"location_not_contains"`
	BodyContains        string `yaml:"body_contains"`
	BodyNotContains     string `yaml:"body_not_contains"`

	// Transport settings.
	TimeoutSeconds  int  `yaml:"timeout_seconds"`
	InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
}
|
||||
|
||||
// Shutdown holds the `shutdown` section of the configuration.
// NOTE(review): group labels are derived from field names; the consuming
// logic is outside this file.
type Shutdown struct {
	// Time budget and history sample threshold for a regular shutdown.
	DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
	HistoryMinSamples    int `yaml:"history_min_samples"`

	// Emergency-mode counterparts and skip switches.
	EmergencyBudgetSec  int  `yaml:"emergency_budget_seconds"`
	EmergencyMinSamples int  `yaml:"emergency_history_min_samples"`
	EmergencySkipEtcd   bool `yaml:"emergency_skip_etcd_snapshot"`
	EmergencySkipDrain  bool `yaml:"emergency_skip_drain"`

	// Skip switches without the emergency prefix.
	SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"`
	SkipDrain        bool `yaml:"skip_drain"`

	// Parallelism limits.
	DrainParallelism int `yaml:"drain_parallelism"`
	ScaleParallelism int `yaml:"scale_parallelism"`
	SSHParallelism   int `yaml:"ssh_parallelism"`
}
|
||||
|
||||
type UPS struct {
|
||||
Enabled bool `yaml:"enabled"`
|
||||
Provider string `yaml:"provider"`
|
||||
Target string `yaml:"target"`
|
||||
Targets []UPSTarget `yaml:"targets"`
|
||||
PollSeconds int `yaml:"poll_seconds"`
|
||||
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
||||
DebounceCount int `yaml:"debounce_count"`
|
||||
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
||||
}
|
||||
|
||||
// UPSTarget is one named entry of UPS.Targets: a label paired with a target
// string.
type UPSTarget struct {
	Name   string `yaml:"name"`
	Target string `yaml:"target"`
}
|
||||
|
||||
// Coordination holds the `coordination` section of the configuration.
// NOTE(review): group labels are derived from field names; the consuming
// logic is outside this file.
type Coordination struct {
	// Shutdown forwarding destination.
	ForwardShutdownHost   string `yaml:"forward_shutdown_host"`
	ForwardShutdownUser   string `yaml:"forward_shutdown_user"`
	ForwardShutdownConfig string `yaml:"forward_shutdown_config"`

	// Peers and local fallback.
	PeerHosts             []string `yaml:"peer_hosts"`
	FallbackLocalShutdown bool     `yaml:"fallback_local_shutdown"`

	// Timing limits.
	CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
	StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`

	// Role of this host and the on-battery startup policy.
	Role                  string `yaml:"role"`
	AllowStartupOnBattery bool   `yaml:"allow_startup_on_battery"`
}
|
||||
|
||||
// Metrics holds the `metrics` section of the configuration: an on/off switch,
// a bind address and a path.
type Metrics struct {
	Enabled  bool   `yaml:"enabled"`
	BindAddr string `yaml:"bind_addr"`
	Path     string `yaml:"path"`
}
|
||||
|
||||
// State holds the `state` section of the configuration: filesystem locations
// for the state directory, reports, run history, lock file and intent file.
type State struct {
	// Directories.
	Dir        string `yaml:"dir"`
	ReportsDir string `yaml:"reports_dir"`

	// Individual files.
	RunHistoryPath string `yaml:"run_history_path"`
	LockPath       string `yaml:"lock_path"`
	IntentPath     string `yaml:"intent_path"`
}
|
||||
Loading…
x
Reference in New Issue
Block a user