ananke/internal/config/config.go

514 lines
17 KiB
Go

package config
import (
"fmt"
"os"
"strings"
"gopkg.in/yaml.v3"
)
// Config is the root configuration document. Load decodes it from a YAML file
// on top of the values returned by defaults(); the struct tags give the YAML
// key for every field.
type Config struct {
Kubeconfig string `yaml:"kubeconfig"`
SSHUser string `yaml:"ssh_user"`
// SSHPort must be within 1-65535 (Validate); zero or negative values are
// replaced by 2277 in applyDefaults.
SSHPort int `yaml:"ssh_port"`
SSHConfigFile string `yaml:"ssh_config_file"`
SSHIdentityFile string `yaml:"ssh_identity_file"`
SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"`
SSHNodeUsers map[string]string `yaml:"ssh_node_users"`
SSHManagedNodes []string `yaml:"ssh_managed_nodes"`
SSHJumpHost string `yaml:"ssh_jump_host"`
SSHJumpUser string `yaml:"ssh_jump_user"`
// IACRepoPath must not be empty (Validate); defaults to "/opt/titan-iac".
IACRepoPath string `yaml:"iac_repo_path"`
// ExpectedFluxBranch must not be empty (Validate); defaults to "main".
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
// ControlPlanes must contain at least one entry (Validate). Its length caps
// Startup.TimeSyncQuorum and its first entry is the fallback for
// Startup.EtcdRestoreControlPlane (applyDefaults).
ControlPlanes []string `yaml:"control_planes"`
Workers []string `yaml:"workers"`
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
ExcludedNamespaces []string `yaml:"excluded_namespaces"`
Startup Startup `yaml:"startup"`
Shutdown Shutdown `yaml:"shutdown"`
UPS UPS `yaml:"ups"`
Coordination Coordination `yaml:"coordination"`
Metrics Metrics `yaml:"metrics"`
State State `yaml:"state"`
}
// Startup holds the settings stored under the "startup" YAML key. Every
// wait/poll *Seconds field, TimeSyncQuorum, StorageMinReadyNodes and
// VaultUnsealBreakglassTimeout is replaced by a built-in default in
// applyDefaults when it is zero or negative.
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
// TimeSyncMode must be "strict" or "quorum" (Validate); an empty value
// becomes "quorum".
TimeSyncMode string `yaml:"time_sync_mode"`
// TimeSyncQuorum defaults to 2 and is capped at len(Config.ControlPlanes)
// when that list is non-empty.
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
// EtcdRestoreControlPlane, when set, must name an entry of
// Config.ControlPlanes (Validate). An empty value falls back to the first
// control plane in applyDefaults.
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
// StorageCriticalPVCs entries are validated as "namespace/name". An empty
// list is replaced by a built-in list in applyDefaults.
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
// PostStartProbes entries must not be blank (Validate). An empty list is
// replaced by a built-in list in applyDefaults.
PostStartProbes []string `yaml:"post_start_probes"`
// VaultUnsealKeyFile must not be blank (Validate); a blank value becomes
// "/var/lib/hecate/vault-unseal.key".
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
// VaultUnsealBreakglassTimeout is read from the
// "vault_unseal_breakglass_timeout_seconds" key and defaults to 15.
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
// Shutdown holds the settings stored under the "shutdown" YAML key. The budget,
// sample-count and parallelism fields must be > 0 (Validate); those fields and
// PoweroffDelaySeconds are replaced by built-in defaults in applyDefaults when
// they are zero or negative.
type Shutdown struct {
DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
HistoryMinSamples int `yaml:"history_min_samples"`
// EmergencyBudgetSec is read from the "emergency_budget_seconds" key.
EmergencyBudgetSec int `yaml:"emergency_budget_seconds"`
// EmergencyMinSamples is read from the "emergency_history_min_samples" key.
EmergencyMinSamples int `yaml:"emergency_history_min_samples"`
// EmergencySkipEtcd is read from the "emergency_skip_etcd_snapshot" key.
EmergencySkipEtcd bool `yaml:"emergency_skip_etcd_snapshot"`
EmergencySkipDrain bool `yaml:"emergency_skip_drain"`
SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"`
SkipDrain bool `yaml:"skip_drain"`
DrainParallelism int `yaml:"drain_parallelism"`
ScaleParallelism int `yaml:"scale_parallelism"`
SSHParallelism int `yaml:"ssh_parallelism"`
PoweroffEnabled bool `yaml:"poweroff_enabled"`
PoweroffDelaySeconds int `yaml:"poweroff_delay_seconds"`
PoweroffLocalHost bool `yaml:"poweroff_local_host"`
ExtraPoweroffHosts []string `yaml:"extra_poweroff_hosts"`
}
// UPS holds the settings stored under the "ups" YAML key. When Enabled is true,
// Validate requires a non-empty Provider and at least one of Target or Targets.
// PollSeconds, RuntimeSafetyFactor, DebounceCount and TelemetryTimeoutSeconds
// fall back to 5, 1.25, 3 and 90 in applyDefaults when zero or negative.
type UPS struct {
Enabled bool `yaml:"enabled"`
Provider string `yaml:"provider"`
Target string `yaml:"target"`
Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
}
// UPSTarget is one entry of UPS.Targets. When the UPS section is enabled,
// Validate requires Target to be non-empty; Name is not validated.
type UPSTarget struct {
Name string `yaml:"name"`
Target string `yaml:"target"`
}
// Coordination holds the settings stored under the "coordination" YAML key.
type Coordination struct {
ForwardShutdownHost string `yaml:"forward_shutdown_host"`
ForwardShutdownUser string `yaml:"forward_shutdown_user"`
// ForwardShutdownConfig must be non-empty when ForwardShutdownHost is set
// (Validate); an empty value becomes "/etc/hecate/hecate.yaml".
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
// CommandTimeoutSeconds defaults to 25 when zero or negative.
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
// StartupGuardMaxAgeSec is read from "startup_guard_max_age_seconds", must
// be > 0 (Validate) and defaults to 900.
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
// Role must be "coordinator" or "peer" (Validate); an empty value becomes
// "coordinator".
Role string `yaml:"role"`
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
}
// Metrics holds the settings stored under the "metrics" YAML key. Empty
// BindAddr and Path values become "0.0.0.0:9560" and "/metrics" in
// applyDefaults.
type Metrics struct {
Enabled bool `yaml:"enabled"`
BindAddr string `yaml:"bind_addr"`
Path string `yaml:"path"`
}
// State holds the filesystem locations stored under the "state" YAML key.
// Empty values are replaced in applyDefaults by paths under "/var/lib/hecate";
// Validate requires RunHistoryPath, LockPath and IntentPath to be non-empty.
type State struct {
Dir string `yaml:"dir"`
RunHistoryPath string `yaml:"run_history_path"`
LockPath string `yaml:"lock_path"`
IntentPath string `yaml:"intent_path"`
}
// Load reads the YAML file at path and returns the resulting configuration.
//
// The file is decoded on top of the built-in defaults, so keys that are absent
// from the file keep their default values. applyDefaults then fills anything
// that is still unset, and Validate must pass before the value is returned.
//
// On any failure (read, decode or validation) the zero Config is returned
// together with the error; read and decode errors wrap the underlying cause.
func Load(path string) (Config, error) {
	cfg := defaults()
	// defaults() pins the etcd restore node to the first built-in control
	// plane. Clear it before decoding so that a file which overrides
	// control_planes without naming etcd_restore_control_plane has the value
	// re-derived from its own list by applyDefaults, instead of failing
	// validation against a node the file never mentioned.
	cfg.Startup.EtcdRestoreControlPlane = ""

	b, err := os.ReadFile(path)
	if err != nil {
		return Config{}, fmt.Errorf("read config %s: %w", path, err)
	}
	if err := yaml.Unmarshal(b, &cfg); err != nil {
		return Config{}, fmt.Errorf("decode config %s: %w", path, err)
	}

	cfg.applyDefaults()
	if err := cfg.Validate(); err != nil {
		return Config{}, err
	}
	return cfg, nil
}
// Validate checks the configuration for values the rest of the program cannot
// work with. It returns an error describing the first problem found, or nil
// when every check passes. Checks run in a fixed order, so the reported error
// is deterministic for a given configuration.
//
// Validate has a value receiver and does not fill in defaults; Load calls
// applyDefaults before validating.
func (c Config) Validate() error {
	// positive reports an error for the given key when v is not strictly positive.
	positive := func(key string, v int) error {
		if v <= 0 {
			return fmt.Errorf("config.%s must be > 0", key)
		}
		return nil
	}
	// firstOf returns the first non-nil error so grouped checks keep their order.
	firstOf := func(errs ...error) error {
		for _, err := range errs {
			if err != nil {
				return err
			}
		}
		return nil
	}

	if len(c.ControlPlanes) == 0 {
		return fmt.Errorf("config.control_planes must not be empty")
	}
	if c.ExpectedFluxBranch == "" {
		return fmt.Errorf("config.expected_flux_branch must not be empty")
	}
	if c.IACRepoPath == "" {
		return fmt.Errorf("config.iac_repo_path must not be empty")
	}

	if err := firstOf(
		positive("shutdown.default_budget_seconds", c.Shutdown.DefaultBudgetSeconds),
		positive("shutdown.history_min_samples", c.Shutdown.HistoryMinSamples),
		positive("shutdown.emergency_budget_seconds", c.Shutdown.EmergencyBudgetSec),
		positive("shutdown.emergency_history_min_samples", c.Shutdown.EmergencyMinSamples),
		positive("shutdown.drain_parallelism", c.Shutdown.DrainParallelism),
		positive("shutdown.scale_parallelism", c.Shutdown.ScaleParallelism),
		positive("shutdown.ssh_parallelism", c.Shutdown.SSHParallelism),
		positive("startup.api_wait_seconds", c.Startup.APIWaitSeconds),
		positive("startup.api_poll_seconds", c.Startup.APIPollSeconds),
		positive("startup.time_sync_wait_seconds", c.Startup.TimeSyncWaitSeconds),
		positive("startup.time_sync_poll_seconds", c.Startup.TimeSyncPollSeconds),
	); err != nil {
		return err
	}

	switch c.Startup.TimeSyncMode {
	case "strict":
		// No quorum size is needed in strict mode.
	case "quorum":
		if c.Startup.TimeSyncQuorum <= 0 {
			return fmt.Errorf("config.startup.time_sync_quorum must be > 0 when time_sync_mode=quorum")
		}
	default:
		return fmt.Errorf("config.startup.time_sync_mode must be strict or quorum")
	}

	// An explicit restore node has to be one of the configured control planes.
	if c.Startup.EtcdRestoreControlPlane != "" {
		found := false
		for _, cp := range c.ControlPlanes {
			if cp == c.Startup.EtcdRestoreControlPlane {
				found = true
				break
			}
		}
		if !found {
			return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
		}
	}

	if err := firstOf(
		positive("startup.storage_ready_wait_seconds", c.Startup.StorageReadyWaitSeconds),
		positive("startup.storage_ready_poll_seconds", c.Startup.StorageReadyPollSeconds),
		positive("startup.storage_min_ready_nodes", c.Startup.StorageMinReadyNodes),
	); err != nil {
		return err
	}
	for _, pvc := range c.Startup.StorageCriticalPVCs {
		// Counting separators alone would accept "/", "ns/" and "/name";
		// require exactly two parts and that both of them carry text.
		parts := strings.Split(strings.TrimSpace(pvc), "/")
		if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
			return fmt.Errorf("config.startup.storage_critical_pvcs entries must be namespace/name, got %q", pvc)
		}
	}

	if err := firstOf(
		positive("startup.post_start_probe_wait_seconds", c.Startup.PostStartProbeWaitSeconds),
		positive("startup.post_start_probe_poll_seconds", c.Startup.PostStartProbePollSeconds),
	); err != nil {
		return err
	}
	if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
		return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
	}
	for _, probe := range c.Startup.PostStartProbes {
		if strings.TrimSpace(probe) == "" {
			return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
		}
	}
	if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
		return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
	}

	if c.SSHPort <= 0 || c.SSHPort > 65535 {
		return fmt.Errorf("config.ssh_port must be in range 1-65535")
	}

	// The UPS section is only checked when it is switched on.
	if c.UPS.Enabled {
		if c.UPS.Provider == "" {
			return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
		}
		if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
			return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
		}
		for _, t := range c.UPS.Targets {
			if t.Target == "" {
				return fmt.Errorf("config.ups.targets[].target must not be empty")
			}
		}
	}

	if c.Coordination.ForwardShutdownHost != "" && c.Coordination.ForwardShutdownConfig == "" {
		return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
	}
	if err := positive("coordination.startup_guard_max_age_seconds", c.Coordination.StartupGuardMaxAgeSec); err != nil {
		return err
	}
	if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
		return fmt.Errorf("config.coordination.role must be coordinator or peer")
	}

	if c.State.RunHistoryPath == "" || c.State.LockPath == "" {
		return fmt.Errorf("config.state.run_history_path and config.state.lock_path must not be empty")
	}
	if c.State.IntentPath == "" {
		return fmt.Errorf("config.state.intent_path must not be empty")
	}
	return nil
}
// defaults returns the built-in configuration that Load uses as the base
// before decoding a file. Each section is assembled separately and the result
// is passed through applyDefaults before being returned.
func defaults() Config {
	startup := Startup{
		APIWaitSeconds:              1200,
		APIPollSeconds:              2,
		RequireTimeSync:             true,
		TimeSyncWaitSeconds:         240,
		TimeSyncPollSeconds:         5,
		TimeSyncMode:                "quorum",
		TimeSyncQuorum:              2,
		ReconcileAccessOnBoot:       true,
		AutoEtcdRestoreOnAPIFailure: true,
		EtcdRestoreControlPlane:     "titan-0a",
		RequireStorageReady:         true,
		StorageReadyWaitSeconds:     420,
		StorageReadyPollSeconds:     5,
		StorageMinReadyNodes:        2,
		StorageCriticalPVCs: []string{
			"vault/data-vault-0",
			"postgres/postgres-data-postgres-0",
			"gitea/gitea-data",
			"sso/keycloak-data",
		},
		RequirePostStartProbes:    true,
		PostStartProbeWaitSeconds: 240,
		PostStartProbePollSeconds: 5,
		PostStartProbes: []string{
			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
			"https://scm.bstein.dev/user/login",
			"https://metrics.bstein.dev/login",
		},
		VaultUnsealKeyFile:           "/var/lib/hecate/vault-unseal.key",
		VaultUnsealBreakglassTimeout: 15,
	}

	shutdown := Shutdown{
		DefaultBudgetSeconds: 1380,
		HistoryMinSamples:    3,
		EmergencyBudgetSec:   420,
		EmergencyMinSamples:  3,
		EmergencySkipEtcd:    true,
		EmergencySkipDrain:   true,
		DrainParallelism:     6,
		ScaleParallelism:     8,
		SSHParallelism:       8,
		PoweroffEnabled:      true,
		PoweroffDelaySeconds: 25,
		PoweroffLocalHost:    true,
	}

	ups := UPS{
		Enabled:                 true,
		Provider:                "nut",
		PollSeconds:             5,
		RuntimeSafetyFactor:     1.25,
		DebounceCount:           3,
		TelemetryTimeoutSeconds: 90,
	}

	coordination := Coordination{
		ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
		FallbackLocalShutdown: true,
		CommandTimeoutSeconds: 25,
		StartupGuardMaxAgeSec: 900,
		Role:                  "coordinator",
		AllowStartupOnBattery: false,
	}

	metrics := Metrics{
		Enabled:  true,
		BindAddr: "0.0.0.0:9560",
		Path:     "/metrics",
	}

	state := State{
		Dir:            "/var/lib/hecate",
		RunHistoryPath: "/var/lib/hecate/runs.json",
		LockPath:       "/var/lib/hecate/hecate.lock",
		IntentPath:     "/var/lib/hecate/intent.json",
	}

	cfg := Config{
		IACRepoPath:        "/opt/titan-iac",
		ExpectedFluxBranch: "main",
		SSHPort:            2277,
		ControlPlanes:      []string{"titan-0a", "titan-0b", "titan-0c"},
		LocalBootstrapPaths: []string{
			"infrastructure/core",
			"clusters/atlas/flux-system",
			"infrastructure/sources/helm",
			"infrastructure/metallb",
			"infrastructure/traefik",
			"infrastructure/cert-manager",
			"infrastructure/vault-csi",
			"infrastructure/vault-injector",
			"services/vault",
			"infrastructure/postgres",
			"services/gitea",
			"services/keycloak",
			"services/oauth2-proxy",
		},
		ExcludedNamespaces: []string{
			"kube-system",
			"kube-public",
			"kube-node-lease",
			"flux-system",
			"traefik",
			"metallb-system",
			"cert-manager",
			"longhorn-system",
			"vault",
			"postgres",
			"maintenance",
		},
		Startup:      startup,
		Shutdown:     shutdown,
		UPS:          ups,
		Coordination: coordination,
		Metrics:      metrics,
		State:        state,
	}

	cfg.applyDefaults()
	return cfg
}
// applyDefaults replaces unset values in c with the built-in fallbacks.
// Integer settings count as unset when they are zero or negative, strings when
// they are empty (the Vault unseal key path also when it is only whitespace),
// and the two startup lists when they have no entries. Values that are already
// set are left alone, apart from the time-sync quorum, which is capped at the
// number of configured control planes.
func (c *Config) applyDefaults() {
	// intOr stores fallback in *dst when the current value is not positive.
	intOr := func(dst *int, fallback int) {
		if *dst <= 0 {
			*dst = fallback
		}
	}
	// strOr stores fallback in *dst when the current value is empty.
	strOr := func(dst *string, fallback string) {
		if *dst == "" {
			*dst = fallback
		}
	}

	strOr(&c.ExpectedFluxBranch, "main")
	strOr(&c.IACRepoPath, "/opt/titan-iac")
	intOr(&c.SSHPort, 2277)

	st := &c.Startup
	intOr(&st.APIWaitSeconds, 1200)
	intOr(&st.APIPollSeconds, 2)
	intOr(&st.TimeSyncWaitSeconds, 240)
	intOr(&st.TimeSyncPollSeconds, 5)
	strOr(&st.TimeSyncMode, "quorum")
	intOr(&st.TimeSyncQuorum, 2)
	// Both adjustments below depend on the control-plane list and are skipped
	// when it is empty.
	if n := len(c.ControlPlanes); n > 0 {
		if st.TimeSyncQuorum > n {
			st.TimeSyncQuorum = n
		}
		if st.EtcdRestoreControlPlane == "" {
			st.EtcdRestoreControlPlane = c.ControlPlanes[0]
		}
	}
	intOr(&st.StorageReadyWaitSeconds, 420)
	intOr(&st.StorageReadyPollSeconds, 5)
	intOr(&st.StorageMinReadyNodes, 2)
	if len(st.StorageCriticalPVCs) == 0 {
		st.StorageCriticalPVCs = []string{
			"vault/data-vault-0",
			"postgres/postgres-data-postgres-0",
			"gitea/gitea-data",
			"sso/keycloak-data",
		}
	}
	intOr(&st.PostStartProbeWaitSeconds, 240)
	intOr(&st.PostStartProbePollSeconds, 5)
	if len(st.PostStartProbes) == 0 {
		st.PostStartProbes = []string{
			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
			"https://scm.bstein.dev/user/login",
			"https://metrics.bstein.dev/login",
		}
	}
	if strings.TrimSpace(st.VaultUnsealKeyFile) == "" {
		st.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
	}
	intOr(&st.VaultUnsealBreakglassTimeout, 15)

	sd := &c.Shutdown
	intOr(&sd.DefaultBudgetSeconds, 1380)
	intOr(&sd.HistoryMinSamples, 3)
	intOr(&sd.EmergencyBudgetSec, 420)
	intOr(&sd.EmergencyMinSamples, 3)
	intOr(&sd.DrainParallelism, 6)
	intOr(&sd.ScaleParallelism, 8)
	intOr(&sd.SSHParallelism, 8)
	intOr(&sd.PoweroffDelaySeconds, 25)

	ups := &c.UPS
	intOr(&ups.PollSeconds, 5)
	if ups.RuntimeSafetyFactor <= 0 {
		ups.RuntimeSafetyFactor = 1.25
	}
	intOr(&ups.DebounceCount, 3)
	intOr(&ups.TelemetryTimeoutSeconds, 90)

	co := &c.Coordination
	strOr(&co.ForwardShutdownConfig, "/etc/hecate/hecate.yaml")
	intOr(&co.CommandTimeoutSeconds, 25)
	intOr(&co.StartupGuardMaxAgeSec, 900)
	strOr(&co.Role, "coordinator")

	strOr(&c.Metrics.BindAddr, "0.0.0.0:9560")
	strOr(&c.Metrics.Path, "/metrics")

	strOr(&c.State.Dir, "/var/lib/hecate")
	strOr(&c.State.RunHistoryPath, "/var/lib/hecate/runs.json")
	strOr(&c.State.LockPath, "/var/lib/hecate/hecate.lock")
	strOr(&c.State.IntentPath, "/var/lib/hecate/intent.json")
}