850 lines
32 KiB
Go
850 lines
32 KiB
Go
package config
|
|
|
|
import (
|
|
"fmt"
|
|
neturl "net/url"
|
|
"os"
|
|
"strings"
|
|
|
|
"gopkg.in/yaml.v3"
|
|
)
|
|
|
|
type Config struct {
|
|
Kubeconfig string `yaml:"kubeconfig"`
|
|
SSHUser string `yaml:"ssh_user"`
|
|
SSHPort int `yaml:"ssh_port"`
|
|
SSHConfigFile string `yaml:"ssh_config_file"`
|
|
SSHIdentityFile string `yaml:"ssh_identity_file"`
|
|
SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"`
|
|
SSHNodeUsers map[string]string `yaml:"ssh_node_users"`
|
|
SSHManagedNodes []string `yaml:"ssh_managed_nodes"`
|
|
SSHJumpHost string `yaml:"ssh_jump_host"`
|
|
SSHJumpUser string `yaml:"ssh_jump_user"`
|
|
IACRepoPath string `yaml:"iac_repo_path"`
|
|
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
|
|
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
|
|
ControlPlanes []string `yaml:"control_planes"`
|
|
Workers []string `yaml:"workers"`
|
|
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
|
|
ExcludedNamespaces []string `yaml:"excluded_namespaces"`
|
|
Startup Startup `yaml:"startup"`
|
|
Shutdown Shutdown `yaml:"shutdown"`
|
|
UPS UPS `yaml:"ups"`
|
|
Coordination Coordination `yaml:"coordination"`
|
|
Metrics Metrics `yaml:"metrics"`
|
|
State State `yaml:"state"`
|
|
}
|
|
|
|
type Startup struct {
|
|
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
|
APIPollSeconds int `yaml:"api_poll_seconds"`
|
|
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
|
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
|
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
|
RequireTimeSync bool `yaml:"require_time_sync"`
|
|
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
|
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
|
TimeSyncMode string `yaml:"time_sync_mode"`
|
|
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
|
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
|
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
|
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
|
RequireStorageReady bool `yaml:"require_storage_ready"`
|
|
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
|
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
|
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
|
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
|
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
|
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
|
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
|
PostStartProbes []string `yaml:"post_start_probes"`
|
|
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
|
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
|
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
|
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
|
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
|
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
|
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
|
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
|
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
|
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
|
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
|
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
|
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
|
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
|
RequireFluxHealth bool `yaml:"require_flux_health"`
|
|
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
|
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
|
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
|
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
|
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
|
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
|
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
|
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
|
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
|
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
|
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
|
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
|
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
|
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
|
}
|
|
|
|
// ServiceChecklistCheck is one entry of startup.service_checklist.
//
// Validate requires a non-blank Name, a URL that parses with both a scheme
// and a host, a TimeoutSeconds greater than zero (applyDefaults substitutes
// 12 for non-positive values) and AcceptedStatuses within 100-599.
type ServiceChecklistCheck struct {
	Name             string `yaml:"name"`
	URL              string `yaml:"url"`
	AcceptedStatuses []int  `yaml:"accepted_statuses"`
	BodyContains     string `yaml:"body_contains"`
	BodyNotContains  string `yaml:"body_not_contains"`
	TimeoutSeconds   int    `yaml:"timeout_seconds"`
	InsecureSkipTLS  bool   `yaml:"insecure_skip_tls"`
}
|
|
|
|
// Shutdown holds the settings found under the shutdown key.
//
// Validate requires the budget, sample-count and parallelism settings to be
// greater than zero; applyDefaults fills non-positive values with the
// defaults noted below.
type Shutdown struct {
	// Defaults: 1380, 3, 420 and 3 respectively.
	DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
	HistoryMinSamples    int `yaml:"history_min_samples"`
	EmergencyBudgetSec   int `yaml:"emergency_budget_seconds"`
	EmergencyMinSamples  int `yaml:"emergency_history_min_samples"`

	EmergencySkipEtcd  bool `yaml:"emergency_skip_etcd_snapshot"`
	EmergencySkipDrain bool `yaml:"emergency_skip_drain"`
	SkipEtcdSnapshot   bool `yaml:"skip_etcd_snapshot"`
	SkipDrain          bool `yaml:"skip_drain"`

	// Defaults: 6, 8 and 8 respectively.
	DrainParallelism int `yaml:"drain_parallelism"`
	ScaleParallelism int `yaml:"scale_parallelism"`
	SSHParallelism   int `yaml:"ssh_parallelism"`

	// PoweroffDelaySeconds defaults to 25 and is not checked by Validate.
	PoweroffEnabled      bool     `yaml:"poweroff_enabled"`
	PoweroffDelaySeconds int      `yaml:"poweroff_delay_seconds"`
	PoweroffLocalHost    bool     `yaml:"poweroff_local_host"`
	ExtraPoweroffHosts   []string `yaml:"extra_poweroff_hosts"`
}
|
|
|
|
type UPS struct {
|
|
Enabled bool `yaml:"enabled"`
|
|
Provider string `yaml:"provider"`
|
|
Target string `yaml:"target"`
|
|
Targets []UPSTarget `yaml:"targets"`
|
|
PollSeconds int `yaml:"poll_seconds"`
|
|
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
|
DebounceCount int `yaml:"debounce_count"`
|
|
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
|
}
|
|
|
|
// UPSTarget is one entry of ups.targets. Validate rejects entries whose
// Target is empty while the UPS section is enabled.
type UPSTarget struct {
	Name   string `yaml:"name"`
	Target string `yaml:"target"`
}
|
|
|
|
// Coordination holds the settings found under the coordination key.
//
// Validate requires Role to be "coordinator" or "peer", StartupGuardMaxAgeSec
// to be greater than zero, PeerHosts to contain no blank entries, and
// ForwardShutdownConfig to be non-empty whenever ForwardShutdownHost is set.
// applyDefaults supplies "/etc/ananke/ananke.yaml", 25, 900 and "coordinator"
// for ForwardShutdownConfig, CommandTimeoutSeconds, StartupGuardMaxAgeSec and
// Role when they are unset.
type Coordination struct {
	ForwardShutdownHost   string   `yaml:"forward_shutdown_host"`
	ForwardShutdownUser   string   `yaml:"forward_shutdown_user"`
	ForwardShutdownConfig string   `yaml:"forward_shutdown_config"`
	PeerHosts             []string `yaml:"peer_hosts"`
	FallbackLocalShutdown bool     `yaml:"fallback_local_shutdown"`
	CommandTimeoutSeconds int      `yaml:"command_timeout_seconds"`
	StartupGuardMaxAgeSec int      `yaml:"startup_guard_max_age_seconds"`
	Role                  string   `yaml:"role"`
	AllowStartupOnBattery bool     `yaml:"allow_startup_on_battery"`
}
|
|
|
|
// Metrics holds the settings found under the metrics key. applyDefaults sets
// BindAddr to "0.0.0.0:9560" and Path to "/metrics" when they are empty.
type Metrics struct {
	Enabled  bool   `yaml:"enabled"`
	BindAddr string `yaml:"bind_addr"`
	Path     string `yaml:"path"`
}
|
|
|
|
// State holds the settings found under the state key. Validate requires
// RunHistoryPath, LockPath and IntentPath to be non-empty; applyDefaults
// points all four fields below /var/lib/ananke when they are empty.
type State struct {
	Dir            string `yaml:"dir"`
	RunHistoryPath string `yaml:"run_history_path"`
	LockPath       string `yaml:"lock_path"`
	IntentPath     string `yaml:"intent_path"`
}
|
|
|
|
func Load(path string) (Config, error) {
|
|
cfg := defaults()
|
|
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return Config{}, fmt.Errorf("read config %s: %w", path, err)
|
|
}
|
|
if err := yaml.Unmarshal(b, &cfg); err != nil {
|
|
return Config{}, fmt.Errorf("decode config %s: %w", path, err)
|
|
}
|
|
|
|
cfg.applyDefaults()
|
|
if err := cfg.Validate(); err != nil {
|
|
return Config{}, err
|
|
}
|
|
return cfg, nil
|
|
}
|
|
|
|
func (c Config) Validate() error {
|
|
if len(c.ControlPlanes) == 0 {
|
|
return fmt.Errorf("config.control_planes must not be empty")
|
|
}
|
|
if c.ExpectedFluxBranch == "" {
|
|
return fmt.Errorf("config.expected_flux_branch must not be empty")
|
|
}
|
|
if c.ExpectedFluxSource == "" {
|
|
return fmt.Errorf("config.expected_flux_source_url must not be empty")
|
|
}
|
|
if c.IACRepoPath == "" {
|
|
return fmt.Errorf("config.iac_repo_path must not be empty")
|
|
}
|
|
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
|
return fmt.Errorf("config.shutdown.default_budget_seconds must be > 0")
|
|
}
|
|
if c.Shutdown.HistoryMinSamples <= 0 {
|
|
return fmt.Errorf("config.shutdown.history_min_samples must be > 0")
|
|
}
|
|
if c.Shutdown.EmergencyBudgetSec <= 0 {
|
|
return fmt.Errorf("config.shutdown.emergency_budget_seconds must be > 0")
|
|
}
|
|
if c.Shutdown.EmergencyMinSamples <= 0 {
|
|
return fmt.Errorf("config.shutdown.emergency_history_min_samples must be > 0")
|
|
}
|
|
if c.Shutdown.DrainParallelism <= 0 {
|
|
return fmt.Errorf("config.shutdown.drain_parallelism must be > 0")
|
|
}
|
|
if c.Shutdown.ScaleParallelism <= 0 {
|
|
return fmt.Errorf("config.shutdown.scale_parallelism must be > 0")
|
|
}
|
|
if c.Shutdown.SSHParallelism <= 0 {
|
|
return fmt.Errorf("config.shutdown.ssh_parallelism must be > 0")
|
|
}
|
|
if c.Startup.APIWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.api_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.APIPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
|
|
}
|
|
if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 {
|
|
return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100")
|
|
}
|
|
for node, labels := range c.Startup.RequiredNodeLabels {
|
|
if strings.TrimSpace(node) == "" {
|
|
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
|
}
|
|
if len(labels) == 0 {
|
|
return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node)
|
|
}
|
|
for key, value := range labels {
|
|
if strings.TrimSpace(key) == "" {
|
|
return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node)
|
|
}
|
|
if strings.TrimSpace(value) == "" {
|
|
return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key)
|
|
}
|
|
}
|
|
}
|
|
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.TimeSyncPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.time_sync_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.TimeSyncMode != "strict" && c.Startup.TimeSyncMode != "quorum" {
|
|
return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum")
|
|
}
|
|
if c.Startup.TimeSyncMode == "quorum" && c.Startup.TimeSyncQuorum <= 0 {
|
|
return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum")
|
|
}
|
|
if c.Startup.EtcdRestoreControlPlane != "" {
|
|
found := false
|
|
for _, cp := range c.ControlPlanes {
|
|
if cp == c.Startup.EtcdRestoreControlPlane {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
|
|
}
|
|
}
|
|
if c.Startup.StorageReadyWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.storage_ready_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.StorageReadyPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.storage_ready_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.StorageMinReadyNodes <= 0 {
|
|
return fmt.Errorf("config.startup.storage_min_ready_nodes must be > 0")
|
|
}
|
|
for _, pvc := range c.Startup.StorageCriticalPVCs {
|
|
if strings.Count(strings.TrimSpace(pvc), "/") != 1 {
|
|
return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc)
|
|
}
|
|
}
|
|
if c.Startup.PostStartProbeWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.post_start_probe_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.PostStartProbePollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.post_start_probe_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
|
|
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
|
|
}
|
|
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
|
return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0")
|
|
}
|
|
if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
|
|
return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
|
|
}
|
|
for i, check := range c.Startup.ServiceChecklist {
|
|
if strings.TrimSpace(check.Name) == "" {
|
|
return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
|
|
}
|
|
rawURL := strings.TrimSpace(check.URL)
|
|
if rawURL == "" {
|
|
return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i)
|
|
}
|
|
parsed, err := neturl.Parse(rawURL)
|
|
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
|
|
return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL)
|
|
}
|
|
if check.TimeoutSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
|
|
}
|
|
for _, code := range check.AcceptedStatuses {
|
|
if code < 100 || code > 599 {
|
|
return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)
|
|
}
|
|
}
|
|
}
|
|
if c.Startup.IngressChecklistWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.ingress_checklist_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.IngressChecklistPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.ingress_checklist_poll_seconds must be > 0")
|
|
}
|
|
for _, code := range c.Startup.IngressChecklistAccepted {
|
|
if code < 100 || code > 599 {
|
|
return fmt.Errorf("config.startup.ingress_checklist_accepted_statuses contains invalid HTTP code %d", code)
|
|
}
|
|
}
|
|
for _, host := range c.Startup.IngressChecklistIgnoreHosts {
|
|
if strings.TrimSpace(host) == "" {
|
|
return fmt.Errorf("config.startup.ingress_checklist_ignore_hosts entries must not be empty")
|
|
}
|
|
}
|
|
if c.Startup.NodeSSHAuthWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.node_ssh_auth_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
|
}
|
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
|
}
|
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
|
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
|
}
|
|
for _, probe := range c.Startup.PostStartProbes {
|
|
if strings.TrimSpace(probe) == "" {
|
|
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
|
}
|
|
}
|
|
for _, item := range c.Startup.IgnoreFluxKustomizations {
|
|
item = strings.TrimSpace(item)
|
|
if item == "" {
|
|
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty")
|
|
}
|
|
if strings.Count(item, "/") != 1 {
|
|
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item)
|
|
}
|
|
}
|
|
for _, item := range c.Startup.IgnoreWorkloads {
|
|
item = strings.TrimSpace(item)
|
|
if item == "" {
|
|
return fmt.Errorf("config.startup.ignore_workloads entries must not be empty")
|
|
}
|
|
parts := strings.Split(item, "/")
|
|
if len(parts) != 2 && len(parts) != 3 {
|
|
return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item)
|
|
}
|
|
}
|
|
for _, ns := range c.Startup.IgnoreWorkloadNamespaces {
|
|
if strings.TrimSpace(ns) == "" {
|
|
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
|
}
|
|
}
|
|
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
|
if strings.TrimSpace(node) == "" {
|
|
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
|
}
|
|
}
|
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
|
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
|
|
}
|
|
if c.SSHPort <= 0 || c.SSHPort > 65535 {
|
|
return fmt.Errorf("config.ssh_port must be in range 1-65535")
|
|
}
|
|
if c.UPS.Enabled {
|
|
if c.UPS.Provider == "" {
|
|
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
|
}
|
|
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
|
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
|
}
|
|
for _, t := range c.UPS.Targets {
|
|
if t.Target == "" {
|
|
return fmt.Errorf("config.ups.targets[].target must not be empty")
|
|
}
|
|
}
|
|
}
|
|
if c.Coordination.ForwardShutdownHost != "" {
|
|
if c.Coordination.ForwardShutdownConfig == "" {
|
|
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
|
}
|
|
}
|
|
for _, peer := range c.Coordination.PeerHosts {
|
|
if strings.TrimSpace(peer) == "" {
|
|
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
|
|
}
|
|
}
|
|
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
|
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
|
|
}
|
|
if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
|
|
return fmt.Errorf("config.coordination.role must be coordinator or peer")
|
|
}
|
|
if c.State.RunHistoryPath == "" || c.State.LockPath == "" {
|
|
return fmt.Errorf("config.state.run_history_path and config.state.lock_path must not be empty")
|
|
}
|
|
if c.State.IntentPath == "" {
|
|
return fmt.Errorf("config.state.intent_path must not be empty")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func defaults() Config {
|
|
c := Config{
|
|
IACRepoPath: "/opt/titan-iac",
|
|
ExpectedFluxBranch: "main",
|
|
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
|
|
SSHPort: 2277,
|
|
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
|
LocalBootstrapPaths: []string{
|
|
"infrastructure/core",
|
|
"clusters/atlas/flux-system",
|
|
"infrastructure/sources/helm",
|
|
"infrastructure/metallb",
|
|
"infrastructure/traefik",
|
|
"infrastructure/cert-manager",
|
|
"infrastructure/vault-csi",
|
|
"infrastructure/vault-injector",
|
|
"services/vault",
|
|
"infrastructure/postgres",
|
|
"services/gitea",
|
|
"services/keycloak",
|
|
"services/oauth2-proxy",
|
|
},
|
|
ExcludedNamespaces: []string{
|
|
"kube-system",
|
|
"kube-public",
|
|
"kube-node-lease",
|
|
"flux-system",
|
|
"traefik",
|
|
"metallb-system",
|
|
"cert-manager",
|
|
"longhorn-system",
|
|
"vault",
|
|
"postgres",
|
|
"maintenance",
|
|
},
|
|
Startup: Startup{
|
|
APIWaitSeconds: 1200,
|
|
APIPollSeconds: 2,
|
|
ShutdownCooldownSeconds: 45,
|
|
RequireTimeSync: true,
|
|
TimeSyncWaitSeconds: 240,
|
|
TimeSyncPollSeconds: 5,
|
|
TimeSyncMode: "quorum",
|
|
TimeSyncQuorum: 2,
|
|
ReconcileAccessOnBoot: true,
|
|
AutoEtcdRestoreOnAPIFailure: true,
|
|
EtcdRestoreControlPlane: "titan-0a",
|
|
RequireStorageReady: true,
|
|
StorageReadyWaitSeconds: 420,
|
|
StorageReadyPollSeconds: 5,
|
|
StorageMinReadyNodes: 2,
|
|
StorageCriticalPVCs: []string{
|
|
"vault/data-vault-0",
|
|
"postgres/postgres-data-postgres-0",
|
|
"gitea/gitea-data",
|
|
"sso/keycloak-data",
|
|
},
|
|
MinimumBatteryPercent: 20,
|
|
RequiredNodeLabels: map[string]map[string]string{
|
|
"titan-09": {
|
|
"ananke.bstein.dev/harbor-bootstrap": "true",
|
|
},
|
|
},
|
|
RequirePostStartProbes: true,
|
|
PostStartProbeWaitSeconds: 240,
|
|
PostStartProbePollSeconds: 5,
|
|
PostStartProbes: []string{
|
|
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
|
"https://scm.bstein.dev/api/healthz",
|
|
"https://metrics.bstein.dev/api/health",
|
|
},
|
|
RequireServiceChecklist: true,
|
|
ServiceChecklistWaitSeconds: 420,
|
|
ServiceChecklistPollSeconds: 5,
|
|
ServiceChecklistStabilitySec: 120,
|
|
ServiceChecklist: []ServiceChecklistCheck{
|
|
{
|
|
Name: "gitea-api",
|
|
URL: "https://scm.bstein.dev/api/healthz",
|
|
AcceptedStatuses: []int{200},
|
|
BodyContains: "pass",
|
|
TimeoutSeconds: 12,
|
|
},
|
|
{
|
|
Name: "grafana-api",
|
|
URL: "https://metrics.bstein.dev/api/health",
|
|
AcceptedStatuses: []int{200},
|
|
BodyContains: "\"database\":\"ok\"",
|
|
TimeoutSeconds: 12,
|
|
},
|
|
{
|
|
Name: "keycloak-oidc",
|
|
URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
|
AcceptedStatuses: []int{200},
|
|
BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
|
|
TimeoutSeconds: 12,
|
|
},
|
|
{
|
|
Name: "harbor-registry",
|
|
URL: "https://registry.bstein.dev/v2/",
|
|
AcceptedStatuses: []int{401},
|
|
BodyContains: "unauthorized",
|
|
TimeoutSeconds: 12,
|
|
},
|
|
{
|
|
Name: "longhorn-auth",
|
|
URL: "https://longhorn.bstein.dev/",
|
|
AcceptedStatuses: []int{200, 302},
|
|
TimeoutSeconds: 12,
|
|
},
|
|
},
|
|
RequireIngressChecklist: true,
|
|
IngressChecklistWaitSeconds: 420,
|
|
IngressChecklistPollSeconds: 5,
|
|
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
|
IngressChecklistIgnoreHosts: []string{},
|
|
RequireNodeSSHAuth: true,
|
|
NodeSSHAuthWaitSeconds: 240,
|
|
NodeSSHAuthPollSeconds: 5,
|
|
RequireFluxHealth: true,
|
|
FluxHealthWaitSeconds: 900,
|
|
FluxHealthPollSeconds: 5,
|
|
IgnoreFluxKustomizations: []string{},
|
|
RequireWorkloadConvergence: true,
|
|
WorkloadConvergenceWaitSeconds: 900,
|
|
WorkloadConvergencePollSeconds: 5,
|
|
IgnoreWorkloadNamespaces: []string{},
|
|
IgnoreWorkloads: []string{},
|
|
IgnoreUnavailableNodes: []string{},
|
|
AutoRecycleStuckPods: true,
|
|
StuckPodGraceSeconds: 180,
|
|
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
|
VaultUnsealBreakglassTimeout: 15,
|
|
},
|
|
Shutdown: Shutdown{
|
|
DefaultBudgetSeconds: 1380,
|
|
HistoryMinSamples: 3,
|
|
EmergencyBudgetSec: 420,
|
|
EmergencyMinSamples: 3,
|
|
EmergencySkipEtcd: true,
|
|
EmergencySkipDrain: true,
|
|
DrainParallelism: 6,
|
|
ScaleParallelism: 8,
|
|
SSHParallelism: 8,
|
|
PoweroffEnabled: false,
|
|
PoweroffDelaySeconds: 25,
|
|
PoweroffLocalHost: false,
|
|
},
|
|
UPS: UPS{
|
|
Enabled: true,
|
|
Provider: "nut",
|
|
PollSeconds: 5,
|
|
RuntimeSafetyFactor: 1.25,
|
|
DebounceCount: 3,
|
|
TelemetryTimeoutSeconds: 90,
|
|
},
|
|
Coordination: Coordination{
|
|
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
|
|
PeerHosts: []string{},
|
|
FallbackLocalShutdown: true,
|
|
CommandTimeoutSeconds: 25,
|
|
StartupGuardMaxAgeSec: 900,
|
|
Role: "coordinator",
|
|
AllowStartupOnBattery: false,
|
|
},
|
|
Metrics: Metrics{
|
|
Enabled: true,
|
|
BindAddr: "0.0.0.0:9560",
|
|
Path: "/metrics",
|
|
},
|
|
State: State{
|
|
Dir: "/var/lib/ananke",
|
|
RunHistoryPath: "/var/lib/ananke/runs.json",
|
|
LockPath: "/var/lib/ananke/ananke.lock",
|
|
IntentPath: "/var/lib/ananke/intent.json",
|
|
},
|
|
}
|
|
c.applyDefaults()
|
|
return c
|
|
}
|
|
|
|
func (c *Config) applyDefaults() {
|
|
if c.ExpectedFluxBranch == "" {
|
|
c.ExpectedFluxBranch = "main"
|
|
}
|
|
if c.IACRepoPath == "" {
|
|
c.IACRepoPath = "/opt/titan-iac"
|
|
}
|
|
if c.ExpectedFluxSource == "" {
|
|
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
|
|
}
|
|
if c.Startup.APIWaitSeconds <= 0 {
|
|
c.Startup.APIWaitSeconds = 1200
|
|
}
|
|
if c.Startup.APIPollSeconds <= 0 {
|
|
c.Startup.APIPollSeconds = 2
|
|
}
|
|
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
|
c.Startup.ShutdownCooldownSeconds = 45
|
|
}
|
|
if c.Startup.MinimumBatteryPercent <= 0 {
|
|
c.Startup.MinimumBatteryPercent = 20
|
|
}
|
|
if c.Startup.RequiredNodeLabels == nil {
|
|
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
|
"titan-09": {
|
|
"ananke.bstein.dev/harbor-bootstrap": "true",
|
|
},
|
|
}
|
|
}
|
|
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
|
c.Startup.TimeSyncWaitSeconds = 240
|
|
}
|
|
if c.Startup.TimeSyncPollSeconds <= 0 {
|
|
c.Startup.TimeSyncPollSeconds = 5
|
|
}
|
|
if c.Startup.TimeSyncMode == "" {
|
|
c.Startup.TimeSyncMode = "quorum"
|
|
}
|
|
if c.Startup.TimeSyncQuorum <= 0 {
|
|
c.Startup.TimeSyncQuorum = 2
|
|
}
|
|
if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
|
|
c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
|
|
}
|
|
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
|
|
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
|
|
}
|
|
if c.Startup.StorageReadyWaitSeconds <= 0 {
|
|
c.Startup.StorageReadyWaitSeconds = 420
|
|
}
|
|
if c.Startup.StorageReadyPollSeconds <= 0 {
|
|
c.Startup.StorageReadyPollSeconds = 5
|
|
}
|
|
if c.Startup.StorageMinReadyNodes <= 0 {
|
|
c.Startup.StorageMinReadyNodes = 2
|
|
}
|
|
if len(c.Startup.StorageCriticalPVCs) == 0 {
|
|
c.Startup.StorageCriticalPVCs = []string{
|
|
"vault/data-vault-0",
|
|
"postgres/postgres-data-postgres-0",
|
|
"gitea/gitea-data",
|
|
"sso/keycloak-data",
|
|
}
|
|
}
|
|
if c.Startup.PostStartProbeWaitSeconds <= 0 {
|
|
c.Startup.PostStartProbeWaitSeconds = 240
|
|
}
|
|
if c.Startup.PostStartProbePollSeconds <= 0 {
|
|
c.Startup.PostStartProbePollSeconds = 5
|
|
}
|
|
if len(c.Startup.PostStartProbes) == 0 {
|
|
c.Startup.PostStartProbes = []string{
|
|
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
|
"https://scm.bstein.dev/api/healthz",
|
|
"https://metrics.bstein.dev/api/health",
|
|
}
|
|
}
|
|
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
|
c.Startup.ServiceChecklistWaitSeconds = 420
|
|
}
|
|
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
|
c.Startup.ServiceChecklistPollSeconds = 5
|
|
}
|
|
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
|
c.Startup.ServiceChecklistStabilitySec = 0
|
|
}
|
|
if len(c.Startup.ServiceChecklist) == 0 {
|
|
c.Startup.ServiceChecklist = []ServiceChecklistCheck{
|
|
{
|
|
Name: "gitea-api",
|
|
URL: "https://scm.bstein.dev/api/healthz",
|
|
AcceptedStatuses: []int{200},
|
|
BodyContains: "pass",
|
|
TimeoutSeconds: 12,
|
|
},
|
|
{
|
|
Name: "grafana-api",
|
|
URL: "https://metrics.bstein.dev/api/health",
|
|
AcceptedStatuses: []int{200},
|
|
BodyContains: "\"database\":\"ok\"",
|
|
TimeoutSeconds: 12,
|
|
},
|
|
}
|
|
}
|
|
for i := range c.Startup.ServiceChecklist {
|
|
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
|
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
|
}
|
|
}
|
|
if c.Startup.IngressChecklistWaitSeconds <= 0 {
|
|
c.Startup.IngressChecklistWaitSeconds = 420
|
|
}
|
|
if c.Startup.IngressChecklistPollSeconds <= 0 {
|
|
c.Startup.IngressChecklistPollSeconds = 5
|
|
}
|
|
if len(c.Startup.IngressChecklistAccepted) == 0 {
|
|
c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404}
|
|
}
|
|
if c.Startup.IngressChecklistIgnoreHosts == nil {
|
|
c.Startup.IngressChecklistIgnoreHosts = []string{}
|
|
}
|
|
if c.Startup.NodeSSHAuthWaitSeconds <= 0 {
|
|
c.Startup.NodeSSHAuthWaitSeconds = 240
|
|
}
|
|
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
|
c.Startup.NodeSSHAuthPollSeconds = 5
|
|
}
|
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
|
c.Startup.FluxHealthWaitSeconds = 900
|
|
}
|
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
|
c.Startup.FluxHealthPollSeconds = 5
|
|
}
|
|
if c.Startup.IgnoreFluxKustomizations == nil {
|
|
c.Startup.IgnoreFluxKustomizations = []string{}
|
|
}
|
|
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
|
c.Startup.WorkloadConvergenceWaitSeconds = 900
|
|
}
|
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
|
c.Startup.WorkloadConvergencePollSeconds = 5
|
|
}
|
|
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
|
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
|
}
|
|
if c.Startup.IgnoreWorkloads == nil {
|
|
c.Startup.IgnoreWorkloads = []string{}
|
|
}
|
|
if c.Startup.IgnoreUnavailableNodes == nil {
|
|
c.Startup.IgnoreUnavailableNodes = []string{}
|
|
}
|
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
|
c.Startup.StuckPodGraceSeconds = 180
|
|
}
|
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
|
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
|
}
|
|
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
|
|
c.Startup.VaultUnsealBreakglassTimeout = 15
|
|
}
|
|
if c.SSHPort <= 0 {
|
|
c.SSHPort = 2277
|
|
}
|
|
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
|
c.Shutdown.DefaultBudgetSeconds = 1380
|
|
}
|
|
if c.Shutdown.HistoryMinSamples <= 0 {
|
|
c.Shutdown.HistoryMinSamples = 3
|
|
}
|
|
if c.Shutdown.EmergencyBudgetSec <= 0 {
|
|
c.Shutdown.EmergencyBudgetSec = 420
|
|
}
|
|
if c.Shutdown.EmergencyMinSamples <= 0 {
|
|
c.Shutdown.EmergencyMinSamples = 3
|
|
}
|
|
if c.Shutdown.DrainParallelism <= 0 {
|
|
c.Shutdown.DrainParallelism = 6
|
|
}
|
|
if c.Shutdown.ScaleParallelism <= 0 {
|
|
c.Shutdown.ScaleParallelism = 8
|
|
}
|
|
if c.Shutdown.SSHParallelism <= 0 {
|
|
c.Shutdown.SSHParallelism = 8
|
|
}
|
|
if c.Shutdown.PoweroffDelaySeconds <= 0 {
|
|
c.Shutdown.PoweroffDelaySeconds = 25
|
|
}
|
|
if c.UPS.PollSeconds <= 0 {
|
|
c.UPS.PollSeconds = 5
|
|
}
|
|
if c.UPS.RuntimeSafetyFactor <= 0 {
|
|
c.UPS.RuntimeSafetyFactor = 1.25
|
|
}
|
|
if c.UPS.DebounceCount <= 0 {
|
|
c.UPS.DebounceCount = 3
|
|
}
|
|
if c.UPS.TelemetryTimeoutSeconds <= 0 {
|
|
c.UPS.TelemetryTimeoutSeconds = 90
|
|
}
|
|
if c.Coordination.ForwardShutdownConfig == "" {
|
|
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
|
}
|
|
if c.Coordination.PeerHosts == nil {
|
|
c.Coordination.PeerHosts = []string{}
|
|
}
|
|
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
|
c.Coordination.CommandTimeoutSeconds = 25
|
|
}
|
|
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
|
c.Coordination.StartupGuardMaxAgeSec = 900
|
|
}
|
|
if c.Coordination.Role == "" {
|
|
c.Coordination.Role = "coordinator"
|
|
}
|
|
if c.Metrics.BindAddr == "" {
|
|
c.Metrics.BindAddr = "0.0.0.0:9560"
|
|
}
|
|
if c.Metrics.Path == "" {
|
|
c.Metrics.Path = "/metrics"
|
|
}
|
|
if c.State.Dir == "" {
|
|
c.State.Dir = "/var/lib/ananke"
|
|
}
|
|
if c.State.RunHistoryPath == "" {
|
|
c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
|
|
}
|
|
if c.State.LockPath == "" {
|
|
c.State.LockPath = "/var/lib/ananke/ananke.lock"
|
|
}
|
|
if c.State.IntentPath == "" {
|
|
c.State.IntentPath = "/var/lib/ananke/intent.json"
|
|
}
|
|
}
|