package config import ( "fmt" neturl "net/url" "os" "strings" "gopkg.in/yaml.v3" ) type Config struct { Kubeconfig string `yaml:"kubeconfig"` SSHUser string `yaml:"ssh_user"` SSHPort int `yaml:"ssh_port"` SSHConfigFile string `yaml:"ssh_config_file"` SSHIdentityFile string `yaml:"ssh_identity_file"` SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"` SSHNodeUsers map[string]string `yaml:"ssh_node_users"` SSHManagedNodes []string `yaml:"ssh_managed_nodes"` SSHJumpHost string `yaml:"ssh_jump_host"` SSHJumpUser string `yaml:"ssh_jump_user"` IACRepoPath string `yaml:"iac_repo_path"` ExpectedFluxBranch string `yaml:"expected_flux_branch"` ExpectedFluxSource string `yaml:"expected_flux_source_url"` ControlPlanes []string `yaml:"control_planes"` Workers []string `yaml:"workers"` LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"` ExcludedNamespaces []string `yaml:"excluded_namespaces"` Startup Startup `yaml:"startup"` Shutdown Shutdown `yaml:"shutdown"` UPS UPS `yaml:"ups"` Coordination Coordination `yaml:"coordination"` Metrics Metrics `yaml:"metrics"` State State `yaml:"state"` } type Startup struct { APIWaitSeconds int `yaml:"api_wait_seconds"` APIPollSeconds int `yaml:"api_poll_seconds"` ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` RequireTimeSync bool `yaml:"require_time_sync"` TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` TimeSyncMode string `yaml:"time_sync_mode"` TimeSyncQuorum int `yaml:"time_sync_quorum"` ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` RequireStorageReady bool `yaml:"require_storage_ready"` StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` RequirePostStartProbes bool `yaml:"require_post_start_probes"` PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` PostStartProbes []string `yaml:"post_start_probes"` RequireServiceChecklist bool `yaml:"require_service_checklist"` ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"` ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` RequireIngressChecklist bool `yaml:"require_ingress_checklist"` IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"` IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"` IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"` IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"` IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"` RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"` NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"` NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"` RequireFluxHealth bool `yaml:"require_flux_health"` FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` IgnoreWorkloads []string `yaml:"ignore_workloads"` IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` } type ServiceChecklistCheck struct { Name string `yaml:"name"` URL string `yaml:"url"` AcceptedStatuses []int `yaml:"accepted_statuses"` BodyContains string `yaml:"body_contains"` BodyNotContains string `yaml:"body_not_contains"` TimeoutSeconds int `yaml:"timeout_seconds"` InsecureSkipTLS bool `yaml:"insecure_skip_tls"` } type Shutdown struct { DefaultBudgetSeconds int `yaml:"default_budget_seconds"` HistoryMinSamples int `yaml:"history_min_samples"` EmergencyBudgetSec int `yaml:"emergency_budget_seconds"` EmergencyMinSamples int `yaml:"emergency_history_min_samples"` EmergencySkipEtcd bool `yaml:"emergency_skip_etcd_snapshot"` EmergencySkipDrain bool `yaml:"emergency_skip_drain"` SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"` SkipDrain bool `yaml:"skip_drain"` DrainParallelism int `yaml:"drain_parallelism"` ScaleParallelism int `yaml:"scale_parallelism"` SSHParallelism int `yaml:"ssh_parallelism"` PoweroffEnabled bool `yaml:"poweroff_enabled"` PoweroffDelaySeconds int `yaml:"poweroff_delay_seconds"` PoweroffLocalHost bool `yaml:"poweroff_local_host"` ExtraPoweroffHosts []string `yaml:"extra_poweroff_hosts"` } type UPS struct { Enabled bool `yaml:"enabled"` Provider string `yaml:"provider"` Target string `yaml:"target"` Targets []UPSTarget `yaml:"targets"` PollSeconds int `yaml:"poll_seconds"` RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` DebounceCount int `yaml:"debounce_count"` TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"` } type UPSTarget struct { Name string `yaml:"name"` Target string `yaml:"target"` } type Coordination struct { ForwardShutdownHost string `yaml:"forward_shutdown_host"` ForwardShutdownUser string `yaml:"forward_shutdown_user"` ForwardShutdownConfig string `yaml:"forward_shutdown_config"` PeerHosts []string `yaml:"peer_hosts"` FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"` CommandTimeoutSeconds int `yaml:"command_timeout_seconds"` StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"` Role string `yaml:"role"` AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"` } type Metrics struct { Enabled bool `yaml:"enabled"` BindAddr string `yaml:"bind_addr"` Path string `yaml:"path"` } type State struct { Dir string `yaml:"dir"` RunHistoryPath string `yaml:"run_history_path"` LockPath string `yaml:"lock_path"` IntentPath string `yaml:"intent_path"` } func Load(path string) (Config, error) { cfg := defaults() b, err := os.ReadFile(path) if err != nil { return Config{}, fmt.Errorf("read config %s: %w", path, err) } if err := yaml.Unmarshal(b, &cfg); err != nil { return Config{}, fmt.Errorf("decode config %s: %w", path, err) } cfg.applyDefaults() if err := cfg.Validate(); err != nil { return Config{}, err } return cfg, nil } func (c Config) Validate() error { if len(c.ControlPlanes) == 0 { return fmt.Errorf("config.control_planes must not be empty") } if c.ExpectedFluxBranch == "" { return fmt.Errorf("config.expected_flux_branch must not be empty") } if c.ExpectedFluxSource == "" { return fmt.Errorf("config.expected_flux_source_url must not be empty") } if c.IACRepoPath == "" { return fmt.Errorf("config.iac_repo_path must not be empty") } if c.Shutdown.DefaultBudgetSeconds <= 0 { return fmt.Errorf("config.shutdown.default_budget_seconds must be > 0") } if c.Shutdown.HistoryMinSamples <= 0 { return fmt.Errorf("config.shutdown.history_min_samples must be > 0") } if c.Shutdown.EmergencyBudgetSec <= 0 { return fmt.Errorf("config.shutdown.emergency_budget_seconds must be > 0") } if c.Shutdown.EmergencyMinSamples <= 0 { return fmt.Errorf("config.shutdown.emergency_history_min_samples must be > 0") } if c.Shutdown.DrainParallelism <= 0 { return fmt.Errorf("config.shutdown.drain_parallelism must be > 0") } if c.Shutdown.ScaleParallelism <= 0 { return fmt.Errorf("config.shutdown.scale_parallelism must be > 0") } if c.Shutdown.SSHParallelism <= 0 { return fmt.Errorf("config.shutdown.ssh_parallelism must be > 0") } if c.Startup.APIWaitSeconds <= 0 { return fmt.Errorf("config.startup.api_wait_seconds must be > 0") } if c.Startup.APIPollSeconds <= 0 { return fmt.Errorf("config.startup.api_poll_seconds must be > 0") } if c.Startup.ShutdownCooldownSeconds <= 0 { return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0") } if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 { return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100") } for node, labels := range c.Startup.RequiredNodeLabels { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.required_node_labels keys must not be empty") } if len(labels) == 0 { return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node) } for key, value := range labels { if strings.TrimSpace(key) == "" { return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node) } if strings.TrimSpace(value) == "" { return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key) } } } if c.Startup.TimeSyncWaitSeconds <= 0 { return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0") } if c.Startup.TimeSyncPollSeconds <= 0 { return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0") } if c.Startup.TimeSyncMode != "strict" && c.Startup.TimeSyncMode != "quorum" { return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum") } if c.Startup.TimeSyncMode == "quorum" && c.Startup.TimeSyncQuorum <= 0 { return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum") } if c.Startup.EtcdRestoreControlPlane != "" { found := false for _, cp := range c.ControlPlanes { if cp == c.Startup.EtcdRestoreControlPlane { found = true break } } if !found { return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set") } } if c.Startup.StorageReadyWaitSeconds <= 0 { return fmt.Errorf("config.startup.storage_ready_wait_seconds must be > 0") } if c.Startup.StorageReadyPollSeconds <= 0 { return fmt.Errorf("config.startup.storage_ready_poll_seconds must be > 0") } if c.Startup.StorageMinReadyNodes <= 0 { return fmt.Errorf("config.startup.storage_min_ready_nodes must be > 0") } for _, pvc := range c.Startup.StorageCriticalPVCs { if strings.Count(strings.TrimSpace(pvc), "/") != 1 { return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc) } } if c.Startup.PostStartProbeWaitSeconds <= 0 { return fmt.Errorf("config.startup.post_start_probe_wait_seconds must be > 0") } if c.Startup.PostStartProbePollSeconds <= 0 { return fmt.Errorf("config.startup.post_start_probe_poll_seconds must be > 0") } if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 { return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true") } if c.Startup.ServiceChecklistWaitSeconds <= 0 { return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0") } if c.Startup.ServiceChecklistPollSeconds <= 0 { return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0") } if c.Startup.ServiceChecklistStabilitySec < 0 { return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0") } if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 { return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true") } for i, check := range c.Startup.ServiceChecklist { if strings.TrimSpace(check.Name) == "" { return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i) } rawURL := strings.TrimSpace(check.URL) if rawURL == "" { return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i) } parsed, err := neturl.Parse(rawURL) if err != nil || parsed.Scheme == "" || parsed.Host == "" { return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL) } if check.TimeoutSeconds <= 0 { return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i) } for _, code := range check.AcceptedStatuses { if code < 100 || code > 599 { return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code) } } } if c.Startup.IngressChecklistWaitSeconds <= 0 { return fmt.Errorf("config.startup.ingress_checklist_wait_seconds must be > 0") } if c.Startup.IngressChecklistPollSeconds <= 0 { return fmt.Errorf("config.startup.ingress_checklist_poll_seconds must be > 0") } for _, code := range c.Startup.IngressChecklistAccepted { if code < 100 || code > 599 { return fmt.Errorf("config.startup.ingress_checklist_accepted_statuses contains invalid HTTP code %d", code) } } for _, host := range c.Startup.IngressChecklistIgnoreHosts { if strings.TrimSpace(host) == "" { return fmt.Errorf("config.startup.ingress_checklist_ignore_hosts entries must not be empty") } } if c.Startup.NodeSSHAuthWaitSeconds <= 0 { return fmt.Errorf("config.startup.node_ssh_auth_wait_seconds must be > 0") } if c.Startup.NodeSSHAuthPollSeconds <= 0 { return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0") } if c.Startup.FluxHealthWaitSeconds <= 0 { return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0") } if c.Startup.FluxHealthPollSeconds <= 0 { return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0") } if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0") } if c.Startup.WorkloadConvergencePollSeconds <= 0 { return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0") } if c.Startup.StuckPodGraceSeconds <= 0 { return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0") } for _, probe := range c.Startup.PostStartProbes { if strings.TrimSpace(probe) == "" { return fmt.Errorf("config.startup.post_start_probes entries must not be empty") } } for _, item := range c.Startup.IgnoreFluxKustomizations { item = strings.TrimSpace(item) if item == "" { return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty") } if strings.Count(item, "/") != 1 { return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item) } } for _, item := range c.Startup.IgnoreWorkloads { item = strings.TrimSpace(item) if item == "" { return fmt.Errorf("config.startup.ignore_workloads entries must not be empty") } parts := strings.Split(item, "/") if len(parts) != 2 && len(parts) != 3 { return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item) } } for _, ns := range c.Startup.IgnoreWorkloadNamespaces { if strings.TrimSpace(ns) == "" { return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty") } } for _, node := range c.Startup.IgnoreUnavailableNodes { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty") } } if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty") } if c.SSHPort <= 0 || c.SSHPort > 65535 { return fmt.Errorf("config.ssh_port must be in range 1-65535") } if c.UPS.Enabled { if c.UPS.Provider == "" { return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") } if c.UPS.Target == "" && len(c.UPS.Targets) == 0 { return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled") } for _, t := range c.UPS.Targets { if t.Target == "" { return fmt.Errorf("config.ups.targets[].target must not be empty") } } } if c.Coordination.ForwardShutdownHost != "" { if c.Coordination.ForwardShutdownConfig == "" { return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set") } } for _, peer := range c.Coordination.PeerHosts { if strings.TrimSpace(peer) == "" { return fmt.Errorf("config.coordination.peer_hosts entries must not be empty") } } if c.Coordination.StartupGuardMaxAgeSec <= 0 { return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0") } if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" { return fmt.Errorf("config.coordination.role must be coordinator or peer") } if c.State.RunHistoryPath == "" || c.State.LockPath == "" { return fmt.Errorf("config.state.run_history_path and config.state.lock_path must not be empty") } if c.State.IntentPath == "" { return fmt.Errorf("config.state.intent_path must not be empty") } return nil } func defaults() Config { c := Config{ IACRepoPath: "/opt/titan-iac", ExpectedFluxBranch: "main", ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git", SSHPort: 2277, ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, LocalBootstrapPaths: []string{ "infrastructure/core", "clusters/atlas/flux-system", "infrastructure/sources/helm", "infrastructure/metallb", "infrastructure/traefik", "infrastructure/cert-manager", "infrastructure/vault-csi", "infrastructure/vault-injector", "services/vault", "infrastructure/postgres", "services/gitea", "services/keycloak", "services/oauth2-proxy", }, ExcludedNamespaces: []string{ "kube-system", "kube-public", "kube-node-lease", "flux-system", "traefik", "metallb-system", "cert-manager", "longhorn-system", "vault", "postgres", "maintenance", }, Startup: Startup{ APIWaitSeconds: 1200, APIPollSeconds: 2, ShutdownCooldownSeconds: 45, RequireTimeSync: true, TimeSyncWaitSeconds: 240, TimeSyncPollSeconds: 5, TimeSyncMode: "quorum", TimeSyncQuorum: 2, ReconcileAccessOnBoot: true, AutoEtcdRestoreOnAPIFailure: true, EtcdRestoreControlPlane: "titan-0a", RequireStorageReady: true, StorageReadyWaitSeconds: 420, StorageReadyPollSeconds: 5, StorageMinReadyNodes: 2, StorageCriticalPVCs: []string{ "vault/data-vault-0", "postgres/postgres-data-postgres-0", "gitea/gitea-data", "sso/keycloak-data", }, MinimumBatteryPercent: 20, RequiredNodeLabels: map[string]map[string]string{ "titan-09": { "ananke.bstein.dev/harbor-bootstrap": "true", }, }, RequirePostStartProbes: true, PostStartProbeWaitSeconds: 240, PostStartProbePollSeconds: 5, PostStartProbes: []string{ "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", "https://scm.bstein.dev/api/healthz", "https://metrics.bstein.dev/api/health", }, RequireServiceChecklist: true, ServiceChecklistWaitSeconds: 420, ServiceChecklistPollSeconds: 5, ServiceChecklistStabilitySec: 120, ServiceChecklist: []ServiceChecklistCheck{ { Name: "gitea-api", URL: "https://scm.bstein.dev/api/healthz", AcceptedStatuses: []int{200}, BodyContains: "pass", TimeoutSeconds: 12, }, { Name: "grafana-api", URL: "https://metrics.bstein.dev/api/health", AcceptedStatuses: []int{200}, BodyContains: "\"database\":\"ok\"", TimeoutSeconds: 12, }, { Name: "keycloak-oidc", URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", AcceptedStatuses: []int{200}, BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"", TimeoutSeconds: 12, }, { Name: "harbor-registry", URL: "https://registry.bstein.dev/v2/", AcceptedStatuses: []int{401}, BodyContains: "unauthorized", TimeoutSeconds: 12, }, { Name: "longhorn-auth", URL: "https://longhorn.bstein.dev/", AcceptedStatuses: []int{200, 302}, TimeoutSeconds: 12, }, }, RequireIngressChecklist: true, IngressChecklistWaitSeconds: 420, IngressChecklistPollSeconds: 5, IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404}, IngressChecklistIgnoreHosts: []string{}, RequireNodeSSHAuth: true, NodeSSHAuthWaitSeconds: 240, NodeSSHAuthPollSeconds: 5, RequireFluxHealth: true, FluxHealthWaitSeconds: 900, FluxHealthPollSeconds: 5, IgnoreFluxKustomizations: []string{}, RequireWorkloadConvergence: true, WorkloadConvergenceWaitSeconds: 900, WorkloadConvergencePollSeconds: 5, IgnoreWorkloadNamespaces: []string{}, IgnoreWorkloads: []string{}, IgnoreUnavailableNodes: []string{}, AutoRecycleStuckPods: true, StuckPodGraceSeconds: 180, VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", VaultUnsealBreakglassTimeout: 15, }, Shutdown: Shutdown{ DefaultBudgetSeconds: 1380, HistoryMinSamples: 3, EmergencyBudgetSec: 420, EmergencyMinSamples: 3, EmergencySkipEtcd: true, EmergencySkipDrain: true, DrainParallelism: 6, ScaleParallelism: 8, SSHParallelism: 8, PoweroffEnabled: false, PoweroffDelaySeconds: 25, PoweroffLocalHost: false, }, UPS: UPS{ Enabled: true, Provider: "nut", PollSeconds: 5, RuntimeSafetyFactor: 1.25, DebounceCount: 3, TelemetryTimeoutSeconds: 90, }, Coordination: Coordination{ ForwardShutdownConfig: "/etc/ananke/ananke.yaml", PeerHosts: []string{}, FallbackLocalShutdown: true, CommandTimeoutSeconds: 25, StartupGuardMaxAgeSec: 900, Role: "coordinator", AllowStartupOnBattery: false, }, Metrics: Metrics{ Enabled: true, BindAddr: "0.0.0.0:9560", Path: "/metrics", }, State: State{ Dir: "/var/lib/ananke", RunHistoryPath: "/var/lib/ananke/runs.json", LockPath: "/var/lib/ananke/ananke.lock", IntentPath: "/var/lib/ananke/intent.json", }, } c.applyDefaults() return c } func (c *Config) applyDefaults() { if c.ExpectedFluxBranch == "" { c.ExpectedFluxBranch = "main" } if c.IACRepoPath == "" { c.IACRepoPath = "/opt/titan-iac" } if c.ExpectedFluxSource == "" { c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git" } if c.Startup.APIWaitSeconds <= 0 { c.Startup.APIWaitSeconds = 1200 } if c.Startup.APIPollSeconds <= 0 { c.Startup.APIPollSeconds = 2 } if c.Startup.ShutdownCooldownSeconds <= 0 { c.Startup.ShutdownCooldownSeconds = 45 } if c.Startup.MinimumBatteryPercent <= 0 { c.Startup.MinimumBatteryPercent = 20 } if c.Startup.RequiredNodeLabels == nil { c.Startup.RequiredNodeLabels = map[string]map[string]string{ "titan-09": { "ananke.bstein.dev/harbor-bootstrap": "true", }, } } if c.Startup.TimeSyncWaitSeconds <= 0 { c.Startup.TimeSyncWaitSeconds = 240 } if c.Startup.TimeSyncPollSeconds <= 0 { c.Startup.TimeSyncPollSeconds = 5 } if c.Startup.TimeSyncMode == "" { c.Startup.TimeSyncMode = "quorum" } if c.Startup.TimeSyncQuorum <= 0 { c.Startup.TimeSyncQuorum = 2 } if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 { c.Startup.TimeSyncQuorum = len(c.ControlPlanes) } if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 { c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0] } if c.Startup.StorageReadyWaitSeconds <= 0 { c.Startup.StorageReadyWaitSeconds = 420 } if c.Startup.StorageReadyPollSeconds <= 0 { c.Startup.StorageReadyPollSeconds = 5 } if c.Startup.StorageMinReadyNodes <= 0 { c.Startup.StorageMinReadyNodes = 2 } if len(c.Startup.StorageCriticalPVCs) == 0 { c.Startup.StorageCriticalPVCs = []string{ "vault/data-vault-0", "postgres/postgres-data-postgres-0", "gitea/gitea-data", "sso/keycloak-data", } } if c.Startup.PostStartProbeWaitSeconds <= 0 { c.Startup.PostStartProbeWaitSeconds = 240 } if c.Startup.PostStartProbePollSeconds <= 0 { c.Startup.PostStartProbePollSeconds = 5 } if len(c.Startup.PostStartProbes) == 0 { c.Startup.PostStartProbes = []string{ "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", "https://scm.bstein.dev/api/healthz", "https://metrics.bstein.dev/api/health", } } if c.Startup.ServiceChecklistWaitSeconds <= 0 { c.Startup.ServiceChecklistWaitSeconds = 420 } if c.Startup.ServiceChecklistPollSeconds <= 0 { c.Startup.ServiceChecklistPollSeconds = 5 } if c.Startup.ServiceChecklistStabilitySec < 0 { c.Startup.ServiceChecklistStabilitySec = 0 } if len(c.Startup.ServiceChecklist) == 0 { c.Startup.ServiceChecklist = []ServiceChecklistCheck{ { Name: "gitea-api", URL: "https://scm.bstein.dev/api/healthz", AcceptedStatuses: []int{200}, BodyContains: "pass", TimeoutSeconds: 12, }, { Name: "grafana-api", URL: "https://metrics.bstein.dev/api/health", AcceptedStatuses: []int{200}, BodyContains: "\"database\":\"ok\"", TimeoutSeconds: 12, }, } } for i := range c.Startup.ServiceChecklist { if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 { c.Startup.ServiceChecklist[i].TimeoutSeconds = 12 } } if c.Startup.IngressChecklistWaitSeconds <= 0 { c.Startup.IngressChecklistWaitSeconds = 420 } if c.Startup.IngressChecklistPollSeconds <= 0 { c.Startup.IngressChecklistPollSeconds = 5 } if len(c.Startup.IngressChecklistAccepted) == 0 { c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404} } if c.Startup.IngressChecklistIgnoreHosts == nil { c.Startup.IngressChecklistIgnoreHosts = []string{} } if c.Startup.NodeSSHAuthWaitSeconds <= 0 { c.Startup.NodeSSHAuthWaitSeconds = 240 } if c.Startup.NodeSSHAuthPollSeconds <= 0 { c.Startup.NodeSSHAuthPollSeconds = 5 } if c.Startup.FluxHealthWaitSeconds <= 0 { c.Startup.FluxHealthWaitSeconds = 900 } if c.Startup.FluxHealthPollSeconds <= 0 { c.Startup.FluxHealthPollSeconds = 5 } if c.Startup.IgnoreFluxKustomizations == nil { c.Startup.IgnoreFluxKustomizations = []string{} } if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { c.Startup.WorkloadConvergenceWaitSeconds = 900 } if c.Startup.WorkloadConvergencePollSeconds <= 0 { c.Startup.WorkloadConvergencePollSeconds = 5 } if c.Startup.IgnoreWorkloadNamespaces == nil { c.Startup.IgnoreWorkloadNamespaces = []string{} } if c.Startup.IgnoreWorkloads == nil { c.Startup.IgnoreWorkloads = []string{} } if c.Startup.IgnoreUnavailableNodes == nil { c.Startup.IgnoreUnavailableNodes = []string{} } if c.Startup.StuckPodGraceSeconds <= 0 { c.Startup.StuckPodGraceSeconds = 180 } if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key" } if c.Startup.VaultUnsealBreakglassTimeout <= 0 { c.Startup.VaultUnsealBreakglassTimeout = 15 } if c.SSHPort <= 0 { c.SSHPort = 2277 } if c.Shutdown.DefaultBudgetSeconds <= 0 { c.Shutdown.DefaultBudgetSeconds = 1380 } if c.Shutdown.HistoryMinSamples <= 0 { c.Shutdown.HistoryMinSamples = 3 } if c.Shutdown.EmergencyBudgetSec <= 0 { c.Shutdown.EmergencyBudgetSec = 420 } if c.Shutdown.EmergencyMinSamples <= 0 { c.Shutdown.EmergencyMinSamples = 3 } if c.Shutdown.DrainParallelism <= 0 { c.Shutdown.DrainParallelism = 6 } if c.Shutdown.ScaleParallelism <= 0 { c.Shutdown.ScaleParallelism = 8 } if c.Shutdown.SSHParallelism <= 0 { c.Shutdown.SSHParallelism = 8 } if c.Shutdown.PoweroffDelaySeconds <= 0 { c.Shutdown.PoweroffDelaySeconds = 25 } if c.UPS.PollSeconds <= 0 { c.UPS.PollSeconds = 5 } if c.UPS.RuntimeSafetyFactor <= 0 { c.UPS.RuntimeSafetyFactor = 1.25 } if c.UPS.DebounceCount <= 0 { c.UPS.DebounceCount = 3 } if c.UPS.TelemetryTimeoutSeconds <= 0 { c.UPS.TelemetryTimeoutSeconds = 90 } if c.Coordination.ForwardShutdownConfig == "" { c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml" } if c.Coordination.PeerHosts == nil { c.Coordination.PeerHosts = []string{} } if c.Coordination.CommandTimeoutSeconds <= 0 { c.Coordination.CommandTimeoutSeconds = 25 } if c.Coordination.StartupGuardMaxAgeSec <= 0 { c.Coordination.StartupGuardMaxAgeSec = 900 } if c.Coordination.Role == "" { c.Coordination.Role = "coordinator" } if c.Metrics.BindAddr == "" { c.Metrics.BindAddr = "0.0.0.0:9560" } if c.Metrics.Path == "" { c.Metrics.Path = "/metrics" } if c.State.Dir == "" { c.State.Dir = "/var/lib/ananke" } if c.State.RunHistoryPath == "" { c.State.RunHistoryPath = "/var/lib/ananke/runs.json" } if c.State.LockPath == "" { c.State.LockPath = "/var/lib/ananke/ananke.lock" } if c.State.IntentPath == "" { c.State.IntentPath = "/var/lib/ananke/intent.json" } }