diff --git a/Makefile b/Makefile index 4fb47f9..5fc1890 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: build test fmt tidy install drill-list drill-run build: - go build -o dist/hecate ./cmd/hecate + go build -o dist/ananke ./cmd/ananke test: go test ./... @@ -16,7 +16,7 @@ install: sudo ./scripts/install.sh drill-list: - ./scripts/hecate-drills.sh list + ./scripts/ananke-drills.sh list drill-run: - ./scripts/hecate-drills.sh run $(DRILL) --execute + ./scripts/ananke-drills.sh run $(DRILL) --execute diff --git a/cmd/hecate/main.go b/cmd/ananke/main.go similarity index 88% rename from cmd/hecate/main.go rename to cmd/ananke/main.go index 61933cf..fc079a3 100644 --- a/cmd/hecate/main.go +++ b/cmd/ananke/main.go @@ -6,6 +6,7 @@ import ( "flag" "fmt" "log" + "math" "os" "os/exec" "os/signal" @@ -14,17 +15,17 @@ import ( "syscall" "time" - "scm.bstein.dev/bstein/hecate/internal/cluster" - "scm.bstein.dev/bstein/hecate/internal/config" - "scm.bstein.dev/bstein/hecate/internal/execx" - "scm.bstein.dev/bstein/hecate/internal/service" - "scm.bstein.dev/bstein/hecate/internal/sshutil" - "scm.bstein.dev/bstein/hecate/internal/state" - "scm.bstein.dev/bstein/hecate/internal/ups" + "scm.bstein.dev/bstein/ananke/internal/cluster" + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/execx" + "scm.bstein.dev/bstein/ananke/internal/service" + "scm.bstein.dev/bstein/ananke/internal/sshutil" + "scm.bstein.dev/bstein/ananke/internal/state" + "scm.bstein.dev/bstein/ananke/internal/ups" ) func main() { - logger := log.New(os.Stdout, "[hecate] ", log.LstdFlags) + logger := log.New(os.Stdout, "[ananke] ", log.LstdFlags) if len(os.Args) < 2 { usage() os.Exit(2) @@ -73,7 +74,7 @@ func main() { func runStartup(logger *log.Logger, args []string) error { fs := flag.NewFlagSet("startup", flag.ExitOnError) - configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") + configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file") execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)") forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume") skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies") @@ -124,7 +125,7 @@ func runStartup(logger *log.Logger, args []string) error { } checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - if err := ensureStartupPowerSafe(checkCtx, targets); err != nil { + if err := ensureStartupPowerSafe(checkCtx, targets, cfg.Startup.MinimumBatteryPercent); err != nil { return err } } @@ -141,10 +142,11 @@ func runStartup(logger *log.Logger, args []string) error { func runShutdown(logger *log.Logger, args []string) error { fs := flag.NewFlagSet("shutdown", flag.ExitOnError) - configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") + configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file") execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)") skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot") skipDrain := fs.Bool("skip-drain", false, "Skip worker drain") + mode := fs.String("mode", "config", "Shutdown mode: config|cluster-only|poweroff") reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history") _ = fs.Parse(args) @@ -158,13 +160,14 @@ func runShutdown(logger *log.Logger, args []string) error { return orch.Shutdown(ctx, 
cluster.ShutdownOptions{ SkipEtcdSnapshot: *skipEtcd, SkipDrain: *skipDrain, + Mode: *mode, Reason: *reason, }) } func runDaemon(logger *log.Logger, args []string) error { fs := flag.NewFlagSet("daemon", flag.ExitOnError) - configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") + configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file") dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing") _ = fs.Parse(args) @@ -191,7 +194,7 @@ func runDaemon(logger *log.Logger, args []string) error { func runEtcdRestore(logger *log.Logger, args []string) error { fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError) - configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") + configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file") execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)") controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)") snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)") @@ -211,7 +214,7 @@ func runEtcdRestore(logger *log.Logger, args []string) error { func runStatus(logger *log.Logger, args []string) error { fs := flag.NewFlagSet("status", flag.ExitOnError) - configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") + configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file") _ = fs.Parse(args) cfg, orch, err := buildOrchestrator(logger, *configPath, true) @@ -246,7 +249,7 @@ func runStatus(logger *log.Logger, args []string) error { func runIntent(logger *log.Logger, args []string) error { fs := flag.NewFlagSet("intent", flag.ExitOnError) - configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") + configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file") setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)") reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)") source := fs.String("source", "manual", "Intent source (used with --set)") @@ -314,7 +317,7 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) { return targets, nil } -func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error { +func ensureStartupPowerSafe(ctx context.Context, targets []service.Target, minimumBatteryPercent float64) error { type targetState struct { seenGood bool lastErr error @@ -327,6 +330,7 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error const pollInterval = 3 * time.Second for { onBatteryTargets := []string{} + lowChargeTargets := []string{} allSeen := true for _, t := range targets { key := t.Name + "|" + t.Target @@ -344,10 +348,25 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error if sample.OnBattery { onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds)) } + if minimumBatteryPercent > 0 && sample.BatteryCharge > 0 && sample.BatteryCharge < minimumBatteryPercent { + lowChargeTargets = append( + lowChargeTargets, + fmt.Sprintf( + "%s(charge=%.1f%%<%.1f%% status=%s)", + t.Name, + sample.BatteryCharge, + minimumBatteryPercent, + sample.RawStatus, + ), + ) + } } if len(onBatteryTargets) > 0 { return fmt.Errorf("startup 
blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", ")) } + if len(lowChargeTargets) > 0 { + return fmt.Errorf("startup blocked: UPS battery charge below minimum for %s", strings.Join(lowChargeTargets, ", ")) + } if allSeen { return nil } @@ -366,7 +385,8 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target)) } } - return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | ")) + roundedMin := math.Round(minimumBatteryPercent*10) / 10 + return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout (minimum_battery_percent=%.1f): %s", roundedMin, strings.Join(unverified, " | ")) case <-time.After(pollInterval): } } @@ -391,26 +411,26 @@ func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config. } func usage() { - fmt.Print(`hecate: staged startup/shutdown + UPS-triggered protection + fmt.Print(`ananke: staged startup/shutdown + UPS-triggered protection Usage: - hecate [flags] + ananke [flags] Commands: startup Perform staged cluster startup shutdown Perform graceful cluster shutdown etcd-restore Restore etcd from snapshot on a control plane daemon Monitor UPS and auto-trigger shutdown - status Print current hecate status and estimates + status Print current ananke status and estimates intent Read or manually set intent state Examples: - hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main - hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance" - hecate etcd-restore --config /etc/hecate/hecate.yaml --execute - hecate daemon --config /etc/hecate/hecate.yaml - hecate status --config /etc/hecate/hecate.yaml - hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute + ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main + ananke shutdown --config /etc/ananke/ananke.yaml --execute --reason "manual-maintenance" + ananke etcd-restore --config /etc/ananke/ananke.yaml --execute + ananke daemon --config /etc/ananke/ananke.yaml + ananke status --config /etc/ananke/ananke.yaml + ananke intent --config /etc/ananke/ananke.yaml --set normal --reason "manual-clear" --execute `) } @@ -439,7 +459,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log args := buildSSHBaseArgs(cfg) - remote := "sudo -n systemctl start hecate-bootstrap.service" + remote := "sudo -n systemctl start ananke-bootstrap.service" attempt := 1 for { cmdArgs := append(append([]string{}, args...), target, remote) @@ -480,7 +500,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config if user != "" { target = user + "@" + host } - remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/hecate intent --config /etc/hecate/hecate.yaml" + remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml" args := append(buildSSHBaseArgs(cfg), target, remoteCmd) out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost}) if err != nil { @@ -488,7 +508,7 @@ func 
coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config return true, "coordinator unreachable", nil } trimmed := strings.TrimSpace(out) - if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") { + if strings.Contains(trimmed, "__ANANKE_BOOTSTRAP_ACTIVE__") { return false, "coordinator bootstrap service is active", nil } remoteIntent, parseErr := state.ParseIntentOutput(trimmed) diff --git a/configs/hecate.example.yaml b/configs/ananke.example.yaml similarity index 64% rename from configs/hecate.example.yaml rename to configs/ananke.example.yaml index 54920d0..9d0f4ae 100644 --- a/configs/hecate.example.yaml +++ b/configs/ananke.example.yaml @@ -1,5 +1,5 @@ -# /etc/hecate/hecate.yaml -kubeconfig: /etc/hecate/kubeconfig +# /etc/ananke/ananke.yaml +kubeconfig: /etc/ananke/kubeconfig ssh_user: atlas ssh_port: 2277 ssh_config_file: "" @@ -11,6 +11,7 @@ ssh_jump_host: "" ssh_jump_user: "" iac_repo_path: /opt/titan-iac expected_flux_branch: main +expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git control_planes: - titan-0a - titan-0b @@ -46,6 +47,10 @@ startup: api_wait_seconds: 1200 api_poll_seconds: 2 shutdown_cooldown_seconds: 45 + minimum_battery_percent: 20 + required_node_labels: + titan-09: + ananke.bstein.dev/harbor-bootstrap: "true" require_time_sync: true time_sync_wait_seconds: 240 time_sync_poll_seconds: 5 @@ -67,9 +72,36 @@ startup: post_start_probe_wait_seconds: 240 post_start_probe_poll_seconds: 5 post_start_probes: - - https://scm.bstein.dev/user/login - - https://metrics.bstein.dev/login - vault_unseal_key_file: /var/lib/hecate/vault-unseal.key + - https://scm.bstein.dev/api/healthz + - https://metrics.bstein.dev/api/health + require_service_checklist: true + service_checklist_wait_seconds: 420 + service_checklist_poll_seconds: 5 + service_checklist_stability_seconds: 120 + service_checklist: + - name: gitea-api + url: https://scm.bstein.dev/api/healthz + accepted_statuses: [200] + body_contains: pass + timeout_seconds: 12 + - name: grafana-api + url: https://metrics.bstein.dev/api/health + accepted_statuses: [200] + body_contains: '"database":"ok"' + timeout_seconds: 12 + require_flux_health: true + flux_health_wait_seconds: 900 + flux_health_poll_seconds: 5 + ignore_flux_kustomizations: [] + require_workload_convergence: true + workload_convergence_wait_seconds: 900 + workload_convergence_poll_seconds: 5 + ignore_workload_namespaces: [] + ignore_workloads: [] + ignore_unavailable_nodes: [] + auto_recycle_stuck_pods: true + stuck_pod_grace_seconds: 180 + vault_unseal_key_file: /var/lib/ananke/vault-unseal.key vault_unseal_breakglass_command: "" vault_unseal_breakglass_timeout_seconds: 15 shutdown: @@ -103,7 +135,7 @@ ups: coordination: forward_shutdown_host: "" forward_shutdown_user: atlas - forward_shutdown_config: /etc/hecate/hecate.yaml + forward_shutdown_config: /etc/ananke/ananke.yaml peer_hosts: [] fallback_local_shutdown: true command_timeout_seconds: 25 @@ -115,7 +147,7 @@ metrics: bind_addr: 0.0.0.0:9560 path: /metrics state: - dir: /var/lib/hecate - run_history_path: /var/lib/hecate/runs.json - lock_path: /var/lib/hecate/hecate.lock - intent_path: /var/lib/hecate/intent.json + dir: /var/lib/ananke + run_history_path: /var/lib/ananke/runs.json + lock_path: /var/lib/ananke/ananke.lock + intent_path: /var/lib/ananke/intent.json diff --git a/configs/hecate.tethys.yaml b/configs/ananke.tethys.yaml similarity index 71% rename from configs/hecate.tethys.yaml rename to configs/ananke.tethys.yaml index 0ee8952..3eb52bf 100644 
--- a/configs/hecate.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -1,5 +1,5 @@ -# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder) -kubeconfig: /etc/hecate/kubeconfig +# /etc/ananke/ananke.yaml for titan-24 (tethys forwarder) +kubeconfig: /etc/ananke/kubeconfig ssh_user: atlas ssh_port: 2277 ssh_config_file: /home/tethys/.ssh/config @@ -58,6 +58,7 @@ ssh_jump_host: "" ssh_jump_user: "" iac_repo_path: /opt/titan-iac expected_flux_branch: main +expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git control_planes: - titan-0a - titan-0b @@ -112,6 +113,10 @@ startup: api_wait_seconds: 1200 api_poll_seconds: 2 shutdown_cooldown_seconds: 45 + minimum_battery_percent: 20 + required_node_labels: + titan-09: + ananke.bstein.dev/harbor-bootstrap: "true" require_time_sync: true time_sync_wait_seconds: 240 time_sync_poll_seconds: 5 @@ -133,10 +138,37 @@ startup: post_start_probe_wait_seconds: 240 post_start_probe_poll_seconds: 5 post_start_probes: - - https://scm.bstein.dev/user/login - - https://metrics.bstein.dev/login - vault_unseal_key_file: /var/lib/hecate/vault-unseal.key - vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'" + - https://scm.bstein.dev/api/healthz + - https://metrics.bstein.dev/api/health + require_service_checklist: true + service_checklist_wait_seconds: 420 + service_checklist_poll_seconds: 5 + service_checklist_stability_seconds: 120 + service_checklist: + - name: gitea-api + url: https://scm.bstein.dev/api/healthz + accepted_statuses: [200] + body_contains: pass + timeout_seconds: 12 + - name: grafana-api + url: https://metrics.bstein.dev/api/health + accepted_statuses: [200] + body_contains: '"database":"ok"' + timeout_seconds: 12 + require_flux_health: true + flux_health_wait_seconds: 900 + flux_health_poll_seconds: 5 + ignore_flux_kustomizations: [] + require_workload_convergence: true + workload_convergence_wait_seconds: 900 + workload_convergence_poll_seconds: 5 + ignore_workload_namespaces: [] + ignore_workloads: [] + ignore_unavailable_nodes: [] + auto_recycle_stuck_pods: true + stuck_pod_grace_seconds: 180 + vault_unseal_key_file: /var/lib/ananke/vault-unseal.key + vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'" vault_unseal_breakglass_timeout_seconds: 15 shutdown: default_budget_seconds: 1380 @@ -167,7 +199,7 @@ ups: coordination: forward_shutdown_host: titan-db forward_shutdown_user: atlas - forward_shutdown_config: /etc/hecate/hecate.yaml + forward_shutdown_config: /etc/ananke/ananke.yaml peer_hosts: - titan-db fallback_local_shutdown: false @@ -180,7 +212,7 @@ metrics: bind_addr: 0.0.0.0:9560 path: /metrics state: - dir: /var/lib/hecate - run_history_path: /var/lib/hecate/runs.json - lock_path: /var/lib/hecate/hecate.lock - intent_path: /var/lib/hecate/intent.json + dir: /var/lib/ananke + run_history_path: /var/lib/ananke/runs.json + lock_path: /var/lib/ananke/ananke.lock + intent_path: /var/lib/ananke/intent.json diff --git a/configs/hecate.titan-db.yaml b/configs/ananke.titan-db.yaml similarity index 71% rename from configs/hecate.titan-db.yaml rename to configs/ananke.titan-db.yaml index 2124347..c3d447c 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -1,5 +1,5 @@ -# /etc/hecate/hecate.yaml for titan-db 
(coordinator) -kubeconfig: /etc/hecate/kubeconfig +# /etc/ananke/ananke.yaml for titan-db (coordinator) +kubeconfig: /etc/ananke/kubeconfig ssh_user: atlas ssh_port: 2277 ssh_config_file: /home/atlas/.ssh/config @@ -58,6 +58,7 @@ ssh_jump_host: "" ssh_jump_user: "" iac_repo_path: /opt/titan-iac expected_flux_branch: main +expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git control_planes: - titan-0a - titan-0b @@ -112,6 +113,10 @@ startup: api_wait_seconds: 1200 api_poll_seconds: 2 shutdown_cooldown_seconds: 45 + minimum_battery_percent: 20 + required_node_labels: + titan-09: + ananke.bstein.dev/harbor-bootstrap: "true" require_time_sync: true time_sync_wait_seconds: 240 time_sync_poll_seconds: 5 @@ -133,10 +138,37 @@ startup: post_start_probe_wait_seconds: 240 post_start_probe_poll_seconds: 5 post_start_probes: - - https://scm.bstein.dev/user/login - - https://metrics.bstein.dev/login - vault_unseal_key_file: /var/lib/hecate/vault-unseal.key - vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'" + - https://scm.bstein.dev/api/healthz + - https://metrics.bstein.dev/api/health + require_service_checklist: true + service_checklist_wait_seconds: 420 + service_checklist_poll_seconds: 5 + service_checklist_stability_seconds: 120 + service_checklist: + - name: gitea-api + url: https://scm.bstein.dev/api/healthz + accepted_statuses: [200] + body_contains: pass + timeout_seconds: 12 + - name: grafana-api + url: https://metrics.bstein.dev/api/health + accepted_statuses: [200] + body_contains: '"database":"ok"' + timeout_seconds: 12 + require_flux_health: true + flux_health_wait_seconds: 900 + flux_health_poll_seconds: 5 + ignore_flux_kustomizations: [] + require_workload_convergence: true + workload_convergence_wait_seconds: 900 + workload_convergence_poll_seconds: 5 + ignore_workload_namespaces: [] + ignore_workloads: [] + ignore_unavailable_nodes: [] + auto_recycle_stuck_pods: true + stuck_pod_grace_seconds: 180 + vault_unseal_key_file: /var/lib/ananke/vault-unseal.key + vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'" vault_unseal_breakglass_timeout_seconds: 15 shutdown: default_budget_seconds: 1380 @@ -168,7 +200,7 @@ ups: coordination: forward_shutdown_host: "" forward_shutdown_user: atlas - forward_shutdown_config: /etc/hecate/hecate.yaml + forward_shutdown_config: /etc/ananke/ananke.yaml peer_hosts: - titan-24 fallback_local_shutdown: true @@ -181,7 +213,7 @@ metrics: bind_addr: 0.0.0.0:9560 path: /metrics state: - dir: /var/lib/hecate - run_history_path: /var/lib/hecate/runs.json - lock_path: /var/lib/hecate/hecate.lock - intent_path: /var/lib/hecate/intent.json + dir: /var/lib/ananke + run_history_path: /var/lib/ananke/runs.json + lock_path: /var/lib/ananke/ananke.lock + intent_path: /var/lib/ananke/intent.json diff --git a/deploy/systemd/hecate-bootstrap.service b/deploy/systemd/ananke-bootstrap.service similarity index 53% rename from deploy/systemd/hecate-bootstrap.service rename to deploy/systemd/ananke-bootstrap.service index 4aeb962..9dff702 100644 --- a/deploy/systemd/hecate-bootstrap.service +++ b/deploy/systemd/ananke-bootstrap.service @@ -1,15 +1,15 @@ [Unit] -Description=Hecate Staged Cluster Bootstrap +Description=Ananke Staged Cluster Bootstrap 
Wants=network-online.target After=network-online.target -ConditionPathExists=/etc/hecate/hecate.yaml +ConditionPathExists=/etc/ananke/ananke.yaml StartLimitIntervalSec=0 [Service] Type=oneshot User=root Group=root -ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180 +ExecStart=/usr/local/bin/ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180 Restart=on-failure RestartSec=30 TimeoutStartSec=1800 diff --git a/deploy/systemd/hecate-update.service b/deploy/systemd/ananke-update.service similarity index 54% rename from deploy/systemd/hecate-update.service rename to deploy/systemd/ananke-update.service index 2d42896..f088b20 100644 --- a/deploy/systemd/hecate-update.service +++ b/deploy/systemd/ananke-update.service @@ -1,5 +1,5 @@ [Unit] -Description=Hecate Self-Update and Reinstall +Description=Ananke Self-Update and Reinstall Wants=network-online.target After=network-online.target @@ -7,6 +7,7 @@ After=network-online.target Type=oneshot User=root Group=root -ExecStart=/usr/local/lib/hecate/hecate-self-update.sh +ExecStart=/usr/local/lib/ananke/ananke-self-update.sh TimeoutStartSec=1800 +[Install] diff --git a/deploy/systemd/hecate-update.timer b/deploy/systemd/ananke-update.timer similarity index 56% rename from deploy/systemd/hecate-update.timer rename to deploy/systemd/ananke-update.timer index ce7d193..8960ee2 100644 --- a/deploy/systemd/hecate-update.timer +++ b/deploy/systemd/ananke-update.timer @@ -1,12 +1,11 @@ [Unit] -Description=Periodic Hecate Self-Update Timer +Description=Periodic Ananke Self-Update Timer [Timer] OnBootSec=2m OnUnitActiveSec=6h -Unit=hecate-update.service +Unit=ananke-update.service Persistent=true [Install] WantedBy=timers.target - diff --git a/deploy/systemd/hecate.service b/deploy/systemd/ananke.service similarity index 52% rename from deploy/systemd/hecate.service rename to deploy/systemd/ananke.service index c19e55d..388222d 100644 --- a/deploy/systemd/hecate.service +++ b/deploy/systemd/ananke.service @@ -1,14 +1,14 @@ [Unit] -Description=Hecate UPS Monitor and Auto Shutdown Orchestrator +Description=Ananke UPS Monitor and Auto Shutdown Orchestrator Wants=network-online.target After=network-online.target -ConditionPathExists=/etc/hecate/hecate.yaml +ConditionPathExists=/etc/ananke/ananke.yaml [Service] Type=simple User=root Group=root -ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml +ExecStart=/usr/local/bin/ananke daemon --config /etc/ananke/ananke.yaml Restart=on-failure RestartSec=5 NoNewPrivileges=true diff --git a/go.mod b/go.mod index ceb106a..487c73c 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module scm.bstein.dev/bstein/hecate +module scm.bstein.dev/bstein/ananke go 1.25 diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index cf438a0..ff5ba72 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -2,12 +2,15 @@ package cluster import ( "context" + "crypto/tls" "encoding/base64" "encoding/json" "errors" "fmt" + "io" "log" "net" + "net/http" neturl "net/url" "os" "os/exec" @@ -19,10 +22,10 @@ import ( "sync" "time" - "scm.bstein.dev/bstein/hecate/internal/config" - "scm.bstein.dev/bstein/hecate/internal/execx" - "scm.bstein.dev/bstein/hecate/internal/sshutil" - "scm.bstein.dev/bstein/hecate/internal/state" + "scm.bstein.dev/bstein/ananke/internal/config" + 
"scm.bstein.dev/bstein/ananke/internal/execx" + "scm.bstein.dev/bstein/ananke/internal/sshutil" + "scm.bstein.dev/bstein/ananke/internal/state" ) type Orchestrator struct { @@ -41,6 +44,7 @@ type StartupOptions struct { type ShutdownOptions struct { SkipEtcdSnapshot bool SkipDrain bool + Mode string Reason string } @@ -62,6 +66,11 @@ type workloadScaleEntry struct { Replicas int `json:"replicas"` } +type remotePeerStatus struct { + Intent state.Intent + BootstrapActive bool +} + type workloadScaleSnapshot struct { GeneratedAt time.Time `json:"generated_at"` Entries []workloadScaleEntry `json:"entries"` @@ -222,12 +231,19 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err) } } + if err := o.ensureRequiredNodeLabels(ctx); err != nil { + return err + } desiredFluxBranch := strings.TrimSpace(opts.ForceFluxBranch) if desiredFluxBranch == "" { desiredFluxBranch = strings.TrimSpace(o.cfg.ExpectedFluxBranch) } - if err := o.ensureFluxBranch(ctx, desiredFluxBranch); err != nil { + allowFluxBranchPatch := strings.TrimSpace(opts.ForceFluxBranch) != "" + if err := o.guardFluxSourceDrift(ctx, desiredFluxBranch, allowFluxBranchPatch); err != nil { + return err + } + if err := o.ensureFluxBranch(ctx, desiredFluxBranch, allowFluxBranchPatch); err != nil { return err } @@ -310,6 +326,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er return err } } + if err := o.waitForStartupConvergence(ctx); err != nil { + return err + } o.log.Printf("startup flow complete") return nil } @@ -446,9 +465,27 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err o.bestEffort("drain workers", func() error { return o.drainWorkers(ctx, workers) }) } + shutdownMode := strings.TrimSpace(opts.Mode) + poweroffEnabled := o.cfg.Shutdown.PoweroffEnabled + switch shutdownMode { + case "", "config": + // honor configured behavior + case "cluster-only": + poweroffEnabled = false + case "poweroff": + poweroffEnabled = true + default: + return fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only|poweroff)", shutdownMode) + } + modeLabel := "cluster-only" + if poweroffEnabled { + modeLabel = "cluster-and-host-poweroff" + } + o.log.Printf("shutdown execution mode=%s (requested=%q config_poweroff_enabled=%t)", modeLabel, shutdownMode, o.cfg.Shutdown.PoweroffEnabled) + o.stopWorkers(ctx, workers) o.stopControlPlanes(ctx, o.cfg.ControlPlanes) - if o.cfg.Shutdown.PoweroffEnabled { + if poweroffEnabled { o.bestEffort("poweroff hosts", func() error { return o.poweroffHosts(ctx, workers) }) } o.log.Printf("shutdown flow complete") @@ -1093,12 +1130,14 @@ func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error { return nil } guardAge := o.startupGuardAge() + localRole := strings.ToLower(strings.TrimSpace(o.cfg.Coordination.Role)) for _, peer := range peers { - intent, err := o.readRemoteIntent(ctx, peer) + peerStatus, err := o.readRemotePeerStatus(ctx, peer) if err != nil { o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err) continue } + intent := peerStatus.Intent switch intent.State { case "", state.IntentNormal: continue @@ -1108,10 +1147,26 @@ func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error { } o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer) case state.IntentStartupInProgress: + if !peerStatus.BootstrapActive { + 
o.log.Printf("warning: peer %s reports startup_in_progress but bootstrap service is inactive (reason=%q age=%s); auto-clearing stale peer intent", + peer, intent.Reason, intentAge(intent).Round(time.Second)) + o.bestEffort(fmt.Sprintf("clear stale peer startup intent on %s", peer), func() error { + return o.clearRemotePeerIntent(ctx, peer, "auto-clear stale peer startup intent") + }) + continue + } + if localRole == "coordinator" && strings.EqualFold(strings.TrimSpace(intent.Reason), "manual-startup") { + o.log.Printf("warning: peer %s has manual startup in progress (age=%s); allowing coordinator startup to continue", + peer, intentAge(intent).Round(time.Second)) + continue + } if intentFresh(intent, guardAge) { return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second)) } - o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer) + o.log.Printf("warning: peer %s startup intent appears stale; auto-clearing and allowing startup", peer) + o.bestEffort(fmt.Sprintf("clear stale peer startup intent on %s", peer), func() error { + return o.clearRemotePeerIntent(ctx, peer, "auto-clear stale peer startup intent") + }) case state.IntentShutdownComplete: if intentFresh(intent, o.startupShutdownCooldown()) { return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second)) @@ -1124,18 +1179,39 @@ func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error { } func (o *Orchestrator) readRemoteIntent(ctx context.Context, node string) (state.Intent, error) { - if !o.sshManaged(node) { - return state.Intent{}, fmt.Errorf("%s is not in ssh_managed_nodes", node) - } - out, err := o.ssh(ctx, node, "sudo -n /usr/local/bin/hecate intent --config /etc/hecate/hecate.yaml") + peer, err := o.readRemotePeerStatus(ctx, node) if err != nil { return state.Intent{}, err } + return peer.Intent, nil +} + +func (o *Orchestrator) readRemotePeerStatus(ctx context.Context, node string) (remotePeerStatus, error) { + if !o.sshManaged(node) { + return remotePeerStatus{}, fmt.Errorf("%s is not in ssh_managed_nodes", node) + } + out, err := o.ssh(ctx, node, "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml") + if err != nil { + return remotePeerStatus{}, err + } + status := remotePeerStatus{ + BootstrapActive: strings.Contains(out, "__ANANKE_BOOTSTRAP_ACTIVE__") || strings.Contains(out, "__ANANKE_BOOTSTRAP_ACTIVE__"), + } in, err := state.ParseIntentOutput(out) if err != nil { - return state.Intent{}, fmt.Errorf("parse remote intent output: %w", err) + return remotePeerStatus{}, fmt.Errorf("parse remote intent output: %w", err) } - return in, nil + status.Intent = in + return status, nil +} + +func (o *Orchestrator) clearRemotePeerIntent(ctx context.Context, node string, reason string) error { + cmd := fmt.Sprintf( + "sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml --set normal --reason %s --source startup --execute", + shellQuote(reason), + ) + _, err := o.ssh(ctx, node, cmd) + return err } func shellQuote(v string) string { @@ -1501,7 +1577,12 @@ func (o *Orchestrator) fluxSourceReady(ctx context.Context) (bool, error) { func (o *Orchestrator) reportFluxSource(ctx context.Context, forceBranch string) { urlOut, urlErr := o.kubectl(ctx, 
10*time.Second, "-n", "flux-system", "get", "gitrepository", "flux-system", "-o", "jsonpath={.spec.url}") if urlErr == nil { - o.log.Printf("flux-source-url=%s", strings.TrimSpace(urlOut)) + currentURL := strings.TrimSpace(urlOut) + o.log.Printf("flux-source-url=%s", currentURL) + expectedURL := strings.TrimSpace(o.cfg.ExpectedFluxSource) + if expectedURL != "" && normalizeGitURL(currentURL) != normalizeGitURL(expectedURL) { + o.log.Printf("warning: flux source URL is %q, expected %q", currentURL, expectedURL) + } } branchOut, branchErr := o.kubectl(ctx, 10*time.Second, "-n", "flux-system", "get", "gitrepository", "flux-system", "-o", "jsonpath={.spec.ref.branch}") if branchErr == nil { @@ -1513,7 +1594,45 @@ func (o *Orchestrator) reportFluxSource(ctx context.Context, forceBranch string) } } -func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) error { +func (o *Orchestrator) guardFluxSourceDrift(ctx context.Context, expectedBranch string, allowBranchPatch bool) error { + urlOut, err := o.kubectl( + ctx, + 10*time.Second, + "-n", "flux-system", + "get", "gitrepository", "flux-system", + "-o", "jsonpath={.spec.url}", + ) + if err != nil { + if isNotFoundErr(err) { + o.log.Printf("warning: flux gitrepository/flux-system not found while checking source drift") + return nil + } + return fmt.Errorf("read flux source url: %w", err) + } + currentURL := strings.TrimSpace(urlOut) + expectedURL := strings.TrimSpace(o.cfg.ExpectedFluxSource) + if expectedURL != "" && normalizeGitURL(currentURL) != normalizeGitURL(expectedURL) { + return fmt.Errorf("startup blocked: flux source url drift detected (current=%q expected=%q)", currentURL, expectedURL) + } + + branchOut, err := o.kubectl( + ctx, + 10*time.Second, + "-n", "flux-system", + "get", "gitrepository", "flux-system", + "-o", "jsonpath={.spec.ref.branch}", + ) + if err != nil { + return fmt.Errorf("read flux source branch: %w", err) + } + currentBranch := strings.TrimSpace(branchOut) + if expectedBranch == "" || currentBranch == expectedBranch || allowBranchPatch { + return nil + } + return fmt.Errorf("startup blocked: flux source branch drift detected (current=%q expected=%q)", currentBranch, expectedBranch) +} + +func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string, allowPatch bool) error { branch = strings.TrimSpace(branch) if branch == "" { return nil @@ -1537,6 +1656,9 @@ func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) erro if current == branch { return nil } + if !allowPatch { + return fmt.Errorf("startup blocked: flux source branch is %q but expected %q (use --force-flux-branch to patch intentionally)", current, branch) + } patch := fmt.Sprintf(`{"spec":{"ref":{"branch":"%s"}}}`, branch) if _, err := o.kubectl( ctx, @@ -1552,6 +1674,13 @@ func (o *Orchestrator) ensureFluxBranch(ctx context.Context, branch string) erro return nil } +func normalizeGitURL(raw string) string { + raw = strings.TrimSpace(strings.ToLower(raw)) + raw = strings.TrimSuffix(raw, "/") + raw = strings.TrimSuffix(raw, ".git") + return raw +} + func (o *Orchestrator) bootstrapLocal(ctx context.Context) error { failures := 0 successes := 0 @@ -1804,6 +1933,986 @@ func (o *Orchestrator) storageReady(ctx context.Context) (bool, string, error) { return true, fmt.Sprintf("longhorn ready+sched nodes=%d critical pvcs bound=%d", readyNodes, len(o.cfg.Startup.StorageCriticalPVCs)), nil } +type fluxCondition struct { + Type string `json:"type"` + Status string `json:"status"` + Reason string `json:"reason"` + 
Message string `json:"message"` +} + +type fluxKustomizationList struct { + Items []fluxKustomization `json:"items"` +} + +type fluxKustomization struct { + Metadata struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + } `json:"metadata"` + Spec struct { + Suspend bool `json:"suspend"` + } `json:"spec"` + Status struct { + Conditions []fluxCondition `json:"conditions"` + } `json:"status"` +} + +type workloadList struct { + Items []workloadResource `json:"items"` +} + +type jobList struct { + Items []jobResource `json:"items"` +} + +type jobResource struct { + Metadata struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + Labels map[string]string `json:"labels"` + OwnerReferences []ownerReference `json:"ownerReferences"` + } `json:"metadata"` + Status struct { + Failed int32 `json:"failed"` + Succeeded int32 `json:"succeeded"` + Conditions []jobConditionRef `json:"conditions"` + } `json:"status"` +} + +type jobConditionRef struct { + Type string `json:"type"` + Status string `json:"status"` +} + +type workloadResource struct { + Kind string `json:"kind"` + Metadata struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + } `json:"metadata"` + Spec struct { + Replicas *int32 `json:"replicas"` + Template struct { + Spec podSpec `json:"spec"` + } `json:"template"` + } `json:"spec"` + Status struct { + ReadyReplicas int32 `json:"readyReplicas"` + DesiredNumberScheduled int32 `json:"desiredNumberScheduled"` + NumberReady int32 `json:"numberReady"` + } `json:"status"` +} + +type podList struct { + Items []podResource `json:"items"` +} + +type podResource struct { + Metadata struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + CreationTimestamp time.Time `json:"creationTimestamp"` + OwnerReferences []ownerReference `json:"ownerReferences"` + } `json:"metadata"` + Spec struct { + NodeName string `json:"nodeName"` + podSpec + } `json:"spec"` + Status struct { + Phase string `json:"phase"` + InitContainerStatuses []podContainerStatus `json:"initContainerStatuses"` + ContainerStatuses []podContainerStatus `json:"containerStatuses"` + } `json:"status"` +} + +type ownerReference struct { + Kind string `json:"kind"` +} + +type podContainerStatus struct { + State podContainerState `json:"state"` +} + +type podContainerState struct { + Waiting *podContainerWaitingState `json:"waiting"` +} + +type podContainerWaitingState struct { + Reason string `json:"reason"` +} + +type podSpec struct { + NodeSelector map[string]string `json:"nodeSelector"` + Affinity *podAffinity `json:"affinity"` +} + +type podAffinity struct { + NodeAffinity *nodeAffinity `json:"nodeAffinity"` +} + +type nodeAffinity struct { + RequiredDuringSchedulingIgnoredDuringExecution *nodeSelector `json:"requiredDuringSchedulingIgnoredDuringExecution"` +} + +type nodeSelector struct { + NodeSelectorTerms []nodeSelectorTerm `json:"nodeSelectorTerms"` +} + +type nodeSelectorTerm struct { + MatchExpressions []nodeSelectorRequirement `json:"matchExpressions"` +} + +type nodeSelectorRequirement struct { + Key string `json:"key"` + Operator string `json:"operator"` + Values []string `json:"values"` +} + +type workloadIgnoreRule struct { + Namespace string + Kind string + Name string +} + +func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { + if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 { + return nil + } + nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels)) + for node := range 
o.cfg.Startup.RequiredNodeLabels { + node = strings.TrimSpace(node) + if node != "" { + nodes = append(nodes, node) + } + } + sort.Strings(nodes) + for _, node := range nodes { + labels := o.cfg.Startup.RequiredNodeLabels[node] + if len(labels) == 0 { + continue + } + keys := make([]string, 0, len(labels)) + for key := range labels { + key = strings.TrimSpace(key) + if key != "" { + keys = append(keys, key) + } + } + sort.Strings(keys) + args := []string{"label", "node", node, "--overwrite"} + pairs := make([]string, 0, len(keys)) + for _, key := range keys { + value := strings.TrimSpace(labels[key]) + if value == "" { + continue + } + pair := fmt.Sprintf("%s=%s", key, value) + args = append(args, pair) + pairs = append(pairs, pair) + } + if len(pairs) == 0 { + continue + } + if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil { + return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err) + } + o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", ")) + } + return nil +} + +func (o *Orchestrator) waitForStartupConvergence(ctx context.Context) error { + if o.runner.DryRun { + return nil + } + if o.cfg.Startup.RequireServiceChecklist { + if err := o.waitForServiceChecklist(ctx); err != nil { + return err + } + } + if o.cfg.Startup.RequireFluxHealth { + if err := o.waitForFluxHealth(ctx); err != nil { + return err + } + } + if o.cfg.Startup.RequireWorkloadConvergence { + if err := o.waitForWorkloadConvergence(ctx); err != nil { + return err + } + } + if err := o.waitForStabilityWindow(ctx); err != nil { + return err + } + return nil +} + +func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error { + wait := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second + if wait <= 0 { + wait = 7 * time.Minute + } + poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(wait) + lastFailure := "unknown" + lastLogged := time.Time{} + for { + prevFailure := lastFailure + ready, detail := o.serviceChecklistReady(ctx) + lastFailure = detail + if ready { + o.log.Printf("external service checklist passed (%s)", detail) + return nil + } + if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("waiting for external service checklist (%s remaining): %s", remaining, lastFailure) + lastLogged = time.Now() + } + if time.Now().After(deadline) { + return fmt.Errorf("startup blocked: external service checklist not satisfied within %s (%s)", wait, lastFailure) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string) { + checks := o.cfg.Startup.ServiceChecklist + if len(checks) == 0 { + return true, "no checklist items configured" + } + for _, check := range checks { + ok, detail := o.serviceCheckReady(ctx, check) + if !ok { + name := strings.TrimSpace(check.Name) + if name == "" { + name = strings.TrimSpace(check.URL) + } + return false, fmt.Sprintf("%s: %s", name, detail) + } + } + return true, fmt.Sprintf("checks=%d", len(checks)) +} + +func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string) { + status, body, err := o.httpChecklistProbe(ctx, check) + if err != nil { + return false, err.Error() + } + 
+ accepted := check.AcceptedStatuses + if len(accepted) == 0 { + accepted = []int{200, 201, 202, 203, 204, 301, 302, 303, 307, 308, 401, 403} + } + statusOk := false + for _, code := range accepted { + if status == code { + statusOk = true + break + } + } + if !statusOk { + return false, fmt.Sprintf("unexpected status code=%d", status) + } + + bodyContains := strings.TrimSpace(check.BodyContains) + if bodyContains != "" && !strings.Contains(strings.ToLower(body), strings.ToLower(bodyContains)) { + return false, fmt.Sprintf("response missing expected marker %q", bodyContains) + } + + bodyNotContains := strings.TrimSpace(check.BodyNotContains) + if bodyNotContains != "" && strings.Contains(strings.ToLower(body), strings.ToLower(bodyNotContains)) { + return false, fmt.Sprintf("response contained forbidden marker %q", bodyNotContains) + } + + return true, fmt.Sprintf("status=%d", status) +} + +func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) { + timeout := time.Duration(check.TimeoutSeconds) * time.Second + if timeout <= 0 { + timeout = 12 * time.Second + } + + transport := &http.Transport{} + if check.InsecureSkipTLS { + transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + } + client := &http.Client{ + Timeout: timeout, + Transport: transport, + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil) + if err != nil { + return 0, "", fmt.Errorf("build request: %w", err) + } + req.Header.Set("User-Agent", "ananke/startup-checklist") + + resp, err := client.Do(req) + if err != nil { + return 0, "", fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) + if readErr != nil { + return resp.StatusCode, "", fmt.Errorf("read response body: %w", readErr) + } + + return resp.StatusCode, string(body), nil +} + +func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error { + window := time.Duration(o.cfg.Startup.ServiceChecklistStabilitySec) * time.Second + if window <= 0 { + return nil + } + poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(window) + lastStatus := time.Time{} + + for { + if err := o.startupStabilityHealthy(ctx); err != nil { + return fmt.Errorf("startup stability window failed: %w", err) + } + if time.Now().After(deadline) { + o.log.Printf("startup stability window passed (%s)", window) + return nil + } + if time.Since(lastStatus) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("startup stability soak in progress (%s remaining)", remaining) + lastStatus = time.Now() + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error { + if o.cfg.Startup.RequireFluxHealth { + ready, detail, err := o.fluxHealthReady(ctx) + if err != nil { + return fmt.Errorf("flux check error: %w", err) + } + if !ready { + return fmt.Errorf("flux not ready: %s", detail) + } + } + if o.cfg.Startup.RequireWorkloadConvergence { + ready, detail, err := o.workloadConvergenceReady(ctx) + if err != nil { + return fmt.Errorf("workload check error: %w", err) + } + if !ready { + return fmt.Errorf("workloads not converged: %s", detail) + } + } + if o.cfg.Startup.RequireServiceChecklist { + ready, detail := 
o.serviceChecklistReady(ctx) + if !ready { + return fmt.Errorf("external services not healthy: %s", detail) + } + } + failures, err := o.startupFailurePods(ctx) + if err != nil { + return fmt.Errorf("pod failure check error: %w", err) + } + if len(failures) > 0 { + return fmt.Errorf("pods in crash/image-pull failures: %s", joinLimited(failures, 8)) + } + return nil +} + +func (o *Orchestrator) waitForFluxHealth(ctx context.Context) error { + wait := time.Duration(o.cfg.Startup.FluxHealthWaitSeconds) * time.Second + if wait <= 0 { + wait = 15 * time.Minute + } + poll := time.Duration(o.cfg.Startup.FluxHealthPollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(wait) + lastFailure := "unknown" + lastLogged := time.Time{} + lastImmutableHealAttempt := time.Time{} + for { + prevFailure := lastFailure + ready, detail, err := o.fluxHealthReady(ctx) + if err != nil { + lastFailure = err.Error() + } else { + lastFailure = detail + } + if ready { + o.log.Printf("flux convergence check passed (%s)", detail) + return nil + } + if !o.runner.DryRun && looksLikeImmutableJobError(lastFailure) && time.Since(lastImmutableHealAttempt) >= 30*time.Second { + lastImmutableHealAttempt = time.Now() + healed, healErr := o.healImmutableFluxJobs(ctx) + if healErr != nil { + o.log.Printf("warning: immutable-job self-heal attempt failed: %v", healErr) + } else if healed { + o.log.Printf("detected immutable-job failure and removed stale failed job(s); re-requesting reconcile") + o.bestEffort("reconcile flux after immutable-job cleanup", func() error { return o.resumeFluxAndReconcile(ctx) }) + } + } + if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("waiting for Flux convergence (%s remaining): %s", remaining, lastFailure) + lastLogged = time.Now() + } + if time.Now().After(deadline) { + return fmt.Errorf("startup blocked: flux convergence not satisfied within %s (%s)", wait, lastFailure) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error) { + out, err := o.kubectl(ctx, 20*time.Second, "get", "kustomizations.kustomize.toolkit.fluxcd.io", "-A", "-o", "json") + if err != nil { + return false, "", fmt.Errorf("query flux kustomizations: %w", err) + } + var list fluxKustomizationList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return false, "", fmt.Errorf("decode flux kustomizations: %w", err) + } + ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations) + notReady := []string{} + for _, ks := range list.Items { + ns := strings.TrimSpace(ks.Metadata.Namespace) + name := strings.TrimSpace(ks.Metadata.Name) + if ns == "" || name == "" { + continue + } + full := ns + "/" + name + if ks.Spec.Suspend { + continue + } + if _, ok := ignored[full]; ok { + continue + } + cond := readyCondition(ks.Status.Conditions) + if cond != nil && strings.EqualFold(strings.TrimSpace(cond.Status), "True") { + continue + } + reason := "ready condition missing" + if cond != nil { + reason = strings.TrimSpace(cond.Message) + if reason == "" { + reason = strings.TrimSpace(cond.Reason) + } + if reason == "" { + reason = "ready=false" + } + } + notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason)) + } + if len(notReady) > 0 { + sort.Strings(notReady) + return false, "not ready: " + joinLimited(notReady, 6), 
nil + } + return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil +} + +func looksLikeImmutableJobError(detail string) bool { + d := strings.ToLower(strings.TrimSpace(detail)) + if d == "" { + return false + } + return strings.Contains(d, "field is immutable") && strings.Contains(d, "job") +} + +func (o *Orchestrator) healImmutableFluxJobs(ctx context.Context) (bool, error) { + out, err := o.kubectl(ctx, 25*time.Second, "get", "jobs", "-A", "-o", "json") + if err != nil { + return false, fmt.Errorf("query jobs: %w", err) + } + var list jobList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return false, fmt.Errorf("decode jobs: %w", err) + } + deleted := []string{} + for _, job := range list.Items { + ns := strings.TrimSpace(job.Metadata.Namespace) + name := strings.TrimSpace(job.Metadata.Name) + if ns == "" || name == "" { + continue + } + if !jobLooksFluxManaged(job) { + continue + } + if !jobFailed(job) { + continue + } + o.log.Printf("warning: deleting stale failed flux-managed job %s/%s to recover immutable template drift", ns, name) + if _, err := o.kubectl(ctx, 20*time.Second, "-n", ns, "delete", "job", name, "--wait=false"); err != nil && !isNotFoundErr(err) { + o.log.Printf("warning: delete failed for stale job %s/%s: %v", ns, name, err) + continue + } + deleted = append(deleted, ns+"/"+name) + } + if len(deleted) == 0 { + return false, nil + } + sort.Strings(deleted) + o.log.Printf("immutable-job cleanup removed %d job(s): %s", len(deleted), joinLimited(deleted, 8)) + return true, nil +} + +func jobLooksFluxManaged(job jobResource) bool { + if strings.TrimSpace(job.Metadata.Labels["kustomize.toolkit.fluxcd.io/name"]) != "" { + return true + } + for _, owner := range job.Metadata.OwnerReferences { + if strings.EqualFold(strings.TrimSpace(owner.Kind), "CronJob") { + return false + } + } + return false +} + +func jobFailed(job jobResource) bool { + if job.Status.Succeeded > 0 { + return false + } + if job.Status.Failed <= 0 { + return false + } + for _, cond := range job.Status.Conditions { + if strings.EqualFold(strings.TrimSpace(cond.Type), "Failed") && + strings.EqualFold(strings.TrimSpace(cond.Status), "True") { + return true + } + } + return false +} + +func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error { + wait := time.Duration(o.cfg.Startup.WorkloadConvergenceWaitSeconds) * time.Second + if wait <= 0 { + wait = 15 * time.Minute + } + poll := time.Duration(o.cfg.Startup.WorkloadConvergencePollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(wait) + lastFailure := "unknown" + lastLogged := time.Time{} + for { + prevFailure := lastFailure + if o.cfg.Startup.AutoRecycleStuckPods { + o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) }) + } + ready, detail, err := o.workloadConvergenceReady(ctx) + if err != nil { + lastFailure = err.Error() + } else { + lastFailure = detail + } + if ready { + o.log.Printf("workload convergence check passed (%s)", detail) + return nil + } + if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("waiting for workload convergence (%s remaining): %s", remaining, lastFailure) + lastLogged = time.Now() + } + if time.Now().After(deadline) { + return fmt.Errorf("startup blocked: workload convergence not satisfied within %s (%s)", wait, lastFailure) + } + select { + 
case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, string, error) { + out, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset,daemonset", "-A", "-o", "json") + if err != nil { + return false, "", fmt.Errorf("query controllers: %w", err) + } + var list workloadList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return false, "", fmt.Errorf("decode controllers: %w", err) + } + ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) + ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads) + ignoredByFlux := namespaceCandidatesFromIgnoreKustomizations(o.cfg.Startup.IgnoreFluxKustomizations) + pending := []string{} + checked := 0 + for _, item := range list.Items { + kind := strings.ToLower(strings.TrimSpace(item.Kind)) + ns := strings.TrimSpace(item.Metadata.Namespace) + name := strings.TrimSpace(item.Metadata.Name) + if kind == "" || ns == "" || name == "" { + continue + } + if _, ok := ignoredNamespaces[ns]; ok { + continue + } + if _, ok := ignoredByFlux[ns]; ok { + continue + } + if workloadIgnored(ignoreRules, ns, kind, name) { + continue + } + if workloadTargetsIgnoredNodes(item.Spec.Template.Spec, ignoredNodes) { + continue + } + desired, ready, ok := desiredReady(item) + if !ok || desired <= 0 { + continue + } + if kind == "daemonset" && desired > ready && len(ignoredNodes) > 0 { + missing := desired - ready + if missing <= int32(len(ignoredNodes)) { + ready = desired + } + } + checked++ + if ready < desired { + pending = append(pending, fmt.Sprintf("%s/%s/%s ready=%d desired=%d", ns, kind, name, ready, desired)) + } + } + if len(pending) > 0 { + sort.Strings(pending) + return false, "not ready: " + joinLimited(pending, 8), nil + } + return true, fmt.Sprintf("controllers ready=%d", checked), nil +} + +func desiredReady(item workloadResource) (int32, int32, bool) { + switch strings.ToLower(strings.TrimSpace(item.Kind)) { + case "deployment", "statefulset": + desired := int32(1) + if item.Spec.Replicas != nil { + desired = *item.Spec.Replicas + } + return desired, item.Status.ReadyReplicas, true + case "daemonset": + return item.Status.DesiredNumberScheduled, item.Status.NumberReady, true + default: + return 0, 0, false + } +} + +func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { + out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json") + if err != nil { + return fmt.Errorf("query pods: %w", err) + } + var list podList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return fmt.Errorf("decode pods: %w", err) + } + ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) + ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads) + grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second + if grace <= 0 { + grace = 180 * time.Second + } + stuckReasons := map[string]struct{}{ + "ImagePullBackOff": {}, + "ErrImagePull": {}, + "CrashLoopBackOff": {}, + "CreateContainerConfigError": {}, + "CreateContainerError": {}, + } + recycled := []string{} + for _, pod := range list.Items { + ns := strings.TrimSpace(pod.Metadata.Namespace) + name := strings.TrimSpace(pod.Metadata.Name) + if ns == "" || name == "" { + continue + } + if _, ok := ignoredNamespaces[ns]; ok { + continue + } + if 
workloadIgnored(ignoreRules, ns, "", name) { + continue + } + if podTargetsIgnoredNode(pod, ignoredNodes) { + continue + } + if !podControllerOwned(pod) { + continue + } + age := time.Since(pod.Metadata.CreationTimestamp) + if !pod.Metadata.CreationTimestamp.IsZero() && age < grace { + continue + } + reason := stuckContainerReason(pod, stuckReasons) + if reason == "" { + continue + } + o.log.Printf("warning: recycling stuck pod %s/%s reason=%s age=%s", ns, name, reason, age.Round(time.Second)) + if _, err := o.kubectl(ctx, 30*time.Second, "-n", ns, "delete", "pod", name, "--wait=false"); err != nil && !isNotFoundErr(err) { + o.log.Printf("warning: recycle pod failed for %s/%s: %v", ns, name, err) + continue + } + recycled = append(recycled, ns+"/"+name) + } + if len(recycled) > 0 { + sort.Strings(recycled) + o.log.Printf("recycled stuck controller pods (%d): %s", len(recycled), joinLimited(recycled, 10)) + } + return nil +} + +func podControllerOwned(p podResource) bool { + for _, owner := range p.Metadata.OwnerReferences { + switch strings.TrimSpace(owner.Kind) { + case "ReplicaSet", "StatefulSet", "DaemonSet": + return true + } + } + return false +} + +func stuckContainerReason(p podResource, reasons map[string]struct{}) string { + check := func(statuses []podContainerStatus) string { + for _, st := range statuses { + if st.State.Waiting == nil { + continue + } + reason := strings.TrimSpace(st.State.Waiting.Reason) + if reason == "" { + continue + } + if _, ok := reasons[reason]; ok { + return reason + } + } + return "" + } + if reason := check(p.Status.InitContainerStatuses); reason != "" { + return reason + } + return check(p.Status.ContainerStatuses) +} + +func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) { + out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json") + if err != nil { + return nil, fmt.Errorf("query pods: %w", err) + } + var list podList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return nil, fmt.Errorf("decode pods: %w", err) + } + + ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) + ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + stuckReasons := map[string]struct{}{ + "ImagePullBackOff": {}, + "ErrImagePull": {}, + "CrashLoopBackOff": {}, + "CreateContainerConfigError": {}, + "CreateContainerError": {}, + "RunContainerError": {}, + } + + failures := []string{} + for _, pod := range list.Items { + ns := strings.TrimSpace(pod.Metadata.Namespace) + name := strings.TrimSpace(pod.Metadata.Name) + if ns == "" || name == "" { + continue + } + if _, ok := ignoredNamespaces[ns]; ok { + continue + } + if podTargetsIgnoredNode(pod, ignoredNodes) { + continue + } + reason := stuckContainerReason(pod, stuckReasons) + if reason == "" { + continue + } + failures = append(failures, fmt.Sprintf("%s/%s(%s)", ns, name, reason)) + } + sort.Strings(failures) + return failures, nil +} + +func podTargetsIgnoredNode(p podResource, ignored map[string]struct{}) bool { + if len(ignored) == 0 { + return false + } + node := strings.TrimSpace(p.Spec.NodeName) + if node != "" { + _, ok := ignored[node] + return ok + } + return workloadTargetsIgnoredNodes(p.Spec.podSpec, ignored) +} + +func workloadTargetsIgnoredNodes(spec podSpec, ignored map[string]struct{}) bool { + if len(ignored) == 0 { + return false + } + if hostname, ok := spec.NodeSelector["kubernetes.io/hostname"]; ok { + _, ignoredHost := ignored[strings.TrimSpace(hostname)] + if ignoredHost { + return true + } + } + if 
spec.Affinity == nil || spec.Affinity.NodeAffinity == nil || spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { + return false + } + terms := spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms + if len(terms) != 1 { + return false + } + for _, expr := range terms[0].MatchExpressions { + if strings.TrimSpace(expr.Key) != "kubernetes.io/hostname" { + continue + } + if !strings.EqualFold(strings.TrimSpace(expr.Operator), "In") { + return false + } + if len(expr.Values) == 0 { + return false + } + for _, value := range expr.Values { + if _, ok := ignored[strings.TrimSpace(value)]; !ok { + return false + } + } + return true + } + return false +} + +func parseWorkloadIgnoreRules(entries []string) []workloadIgnoreRule { + out := []workloadIgnoreRule{} + for _, entry := range entries { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + parts := strings.Split(entry, "/") + switch len(parts) { + case 2: + out = append(out, workloadIgnoreRule{ + Namespace: strings.TrimSpace(parts[0]), + Name: strings.TrimSpace(parts[1]), + }) + case 3: + out = append(out, workloadIgnoreRule{ + Namespace: strings.TrimSpace(parts[0]), + Kind: strings.ToLower(strings.TrimSpace(parts[1])), + Name: strings.TrimSpace(parts[2]), + }) + } + } + return out +} + +func workloadIgnored(rules []workloadIgnoreRule, namespace, kind, name string) bool { + ns := strings.TrimSpace(namespace) + k := strings.ToLower(strings.TrimSpace(kind)) + n := strings.TrimSpace(name) + for _, rule := range rules { + if rule.Namespace != ns { + continue + } + if rule.Kind != "" && rule.Kind != k { + continue + } + if rule.Name == n { + return true + } + } + return false +} + +func makeStringSet(entries []string) map[string]struct{} { + out := make(map[string]struct{}, len(entries)) + for _, entry := range entries { + entry = strings.TrimSpace(entry) + if entry != "" { + out[entry] = struct{}{} + } + } + return out +} + +func readyCondition(conditions []fluxCondition) *fluxCondition { + for i := range conditions { + cond := &conditions[i] + if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") { + return cond + } + } + return nil +} + +func joinLimited(items []string, limit int) string { + if len(items) <= limit || limit <= 0 { + return strings.Join(items, "; ") + } + return strings.Join(items[:limit], "; ") + fmt.Sprintf("; ... 
(+%d more)", len(items)-limit) +} + +func namespaceCandidatesFromIgnoreKustomizations(entries []string) map[string]struct{} { + out := map[string]struct{}{} + for _, entry := range entries { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + parts := strings.SplitN(entry, "/", 2) + if len(parts) != 2 { + continue + } + name := strings.TrimSpace(parts[1]) + if name != "" { + out[name] = struct{}{} + } + } + return out +} + func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error { if o.runner.DryRun { return nil @@ -1818,12 +2927,21 @@ func (o *Orchestrator) waitForPostStartProbes(ctx context.Context) error { } deadline := time.Now().Add(wait) lastFailure := "unknown" + lastLogged := time.Time{} for { ok, failure := o.postStartProbesReady(ctx) if ok { o.log.Printf("post-start probes passed") return nil } + if failure != lastFailure || time.Since(lastLogged) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("waiting for post-start probes (%s remaining): %s", remaining, failure) + lastLogged = time.Now() + } lastFailure = failure if time.Now().After(deadline) { return fmt.Errorf("startup blocked: post-start probes did not pass within %s (%s)", wait, lastFailure) @@ -1853,13 +2971,24 @@ func (o *Orchestrator) postStartProbesReady(ctx context.Context) (bool, string) if err != nil { return false, fmt.Sprintf("%s: %v", probe, err) } - if code < 200 || code >= 400 { + if !probeStatusAccepted(probe, code) { return false, fmt.Sprintf("%s: unexpected status code=%d", probe, code) } } return true, "all probes successful" } +func probeStatusAccepted(_ string, code int) bool { + if code >= 200 && code < 400 { + return true + } + // Auth fronts often return unauthorized/forbidden while still proving the service is up. 
+ if code == 401 || code == 403 { + return true + } + return false +} + func (o *Orchestrator) httpProbe(ctx context.Context, probeURL string) (int, error) { out, err := o.run( ctx, @@ -2133,6 +3262,13 @@ func (o *Orchestrator) ensureCriticalStartupWorkloads(ctx context.Context) error } return fmt.Errorf("scale %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err) } + if err := o.cleanupStaleCriticalWorkloadPods(ctx, w); err != nil { + if isNotFoundErr(err) { + o.log.Printf("warning: startup workload missing during stale-pod cleanup: %s/%s/%s", w.Namespace, w.Kind, w.Name) + continue + } + return fmt.Errorf("cleanup stale pods %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err) + } if err := o.waitWorkloadReady(ctx, w); err != nil { if isNotFoundErr(err) { o.log.Printf("warning: startup workload missing during readiness wait: %s/%s/%s", w.Namespace, w.Kind, w.Name) @@ -2144,6 +3280,68 @@ func (o *Orchestrator) ensureCriticalStartupWorkloads(ctx context.Context) error return nil } +func (o *Orchestrator) cleanupStaleCriticalWorkloadPods(ctx context.Context, w startupWorkload) error { + if o.runner.DryRun { + return nil + } + if w.Kind != "statefulset" { + return nil + } + + out, err := o.kubectl( + ctx, + 20*time.Second, + "-n", + w.Namespace, + "get", + "pods", + "-o", + "custom-columns=NAME:.metadata.name,PHASE:.status.phase,OWNER_KIND:.metadata.ownerReferences[0].kind,OWNER_NAME:.metadata.ownerReferences[0].name", + "--no-headers", + ) + if err != nil { + return err + } + + prefix := w.Name + "-" + for _, line := range lines(out) { + fields := strings.Fields(line) + if len(fields) < 4 { + continue + } + podName := fields[0] + phase := strings.ToLower(strings.TrimSpace(fields[1])) + ownerKind := strings.TrimSpace(fields[2]) + ownerName := strings.TrimSpace(fields[3]) + if !strings.EqualFold(ownerKind, "StatefulSet") || ownerName != w.Name { + continue + } + if !strings.HasPrefix(podName, prefix) { + continue + } + if phase != "unknown" && phase != "failed" { + continue + } + + o.log.Printf("warning: deleting stale critical pod %s/%s phase=%s before readiness wait", w.Namespace, podName, phase) + if _, delErr := o.kubectl( + ctx, + 40*time.Second, + "-n", + w.Namespace, + "delete", + "pod", + podName, + "--grace-period=0", + "--force", + "--wait=false", + ); delErr != nil { + return fmt.Errorf("delete stale pod %s/%s: %w", w.Namespace, podName, delErr) + } + } + return nil +} + func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWorkload, replicas int) error { _, err := o.kubectl( ctx, diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go index fcf321b..8f35eaf 100644 --- a/internal/cluster/orchestrator_test.go +++ b/internal/cluster/orchestrator_test.go @@ -1,14 +1,17 @@ package cluster import ( + "context" "log" + "net/http" + "net/http/httptest" "os" "reflect" "testing" "time" - "scm.bstein.dev/bstein/hecate/internal/config" - "scm.bstein.dev/bstein/hecate/internal/state" + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/state" ) func TestParseVaultSealed(t *testing.T) { @@ -117,3 +120,75 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) { t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want) } } + +func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) { + spec := podSpec{ + NodeSelector: map[string]string{ + "kubernetes.io/hostname": "titan-22", + }, + } + ignored := map[string]struct{}{"titan-22": {}} + if !workloadTargetsIgnoredNodes(spec, 
ignored) { + t.Fatalf("expected workload to target ignored node via nodeSelector") + } +} + +func TestParseWorkloadIgnoreRules(t *testing.T) { + rules := parseWorkloadIgnoreRules([]string{ + "maintenance/metis", + "crypto/statefulset/monerod", + }) + if len(rules) != 2 { + t.Fatalf("expected 2 ignore rules, got %d", len(rules)) + } + if !workloadIgnored(rules, "maintenance", "deployment", "metis") { + t.Fatalf("expected namespace/name rule to match") + } + if !workloadIgnored(rules, "crypto", "statefulset", "monerod") { + t.Fatalf("expected namespace/kind/name rule to match") + } + if workloadIgnored(rules, "crypto", "deployment", "monerod") { + t.Fatalf("did not expect mismatched kind to match") + } +} + +func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) { + got := namespaceCandidatesFromIgnoreKustomizations([]string{ + "flux-system/jellyfin", + "flux-system/outline", + }) + if _, ok := got["jellyfin"]; !ok { + t.Fatalf("expected jellyfin namespace candidate") + } + if _, ok := got["outline"]; !ok { + t.Fatalf("expected outline namespace candidate") + } +} + +func TestProbeStatusAcceptedRejects404(t *testing.T) { + if probeStatusAccepted("https://metrics.bstein.dev/login", 404) { + t.Fatalf("expected 404 probe status to be rejected") + } +} + +func TestServiceCheckReadyRequiresBodyContains(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"database":"ok"}`)) + })) + defer srv.Close() + + orch := &Orchestrator{ + log: log.New(os.Stdout, "", 0), + } + ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ + Name: "grafana-api", + URL: srv.URL, + AcceptedStatuses: []int{200}, + BodyContains: `"database":"ok"`, + TimeoutSeconds: 5, + }) + if !ok { + t.Fatalf("expected service check to pass, detail=%s", detail) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 024be5c..d452fe1 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -2,6 +2,7 @@ package config import ( "fmt" + neturl "net/url" "os" "strings" @@ -21,6 +22,7 @@ type Config struct { SSHJumpUser string `yaml:"ssh_jump_user"` IACRepoPath string `yaml:"iac_repo_path"` ExpectedFluxBranch string `yaml:"expected_flux_branch"` + ExpectedFluxSource string `yaml:"expected_flux_source_url"` ControlPlanes []string `yaml:"control_planes"` Workers []string `yaml:"workers"` LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"` @@ -34,29 +36,58 @@ type Config struct { } type Startup struct { - APIWaitSeconds int `yaml:"api_wait_seconds"` - APIPollSeconds int `yaml:"api_poll_seconds"` - ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` - RequireTimeSync bool `yaml:"require_time_sync"` - TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` - TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` - TimeSyncMode string `yaml:"time_sync_mode"` - TimeSyncQuorum int `yaml:"time_sync_quorum"` - ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` - AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` - EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` - RequireStorageReady bool `yaml:"require_storage_ready"` - StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` - StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` - StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` - StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` - 
RequirePostStartProbes bool `yaml:"require_post_start_probes"` - PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` - PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` - PostStartProbes []string `yaml:"post_start_probes"` - VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` - VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` - VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` + APIWaitSeconds int `yaml:"api_wait_seconds"` + APIPollSeconds int `yaml:"api_poll_seconds"` + ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` + MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` + RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` + RequireTimeSync bool `yaml:"require_time_sync"` + TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` + TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` + TimeSyncMode string `yaml:"time_sync_mode"` + TimeSyncQuorum int `yaml:"time_sync_quorum"` + ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` + AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` + EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + RequireStorageReady bool `yaml:"require_storage_ready"` + StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` + StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` + StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` + StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` + RequirePostStartProbes bool `yaml:"require_post_start_probes"` + PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` + PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` + PostStartProbes []string `yaml:"post_start_probes"` + RequireServiceChecklist bool `yaml:"require_service_checklist"` + ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"` + ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` + ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` + ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` + RequireFluxHealth bool `yaml:"require_flux_health"` + FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` + FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` + IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` + RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` + WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` + WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` + IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` + IgnoreWorkloads []string `yaml:"ignore_workloads"` + IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` + AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` + StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` + VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` + VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` + VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` +} + +type ServiceChecklistCheck struct { + Name string `yaml:"name"` + URL string `yaml:"url"` + AcceptedStatuses []int `yaml:"accepted_statuses"` + BodyContains string `yaml:"body_contains"` + BodyNotContains string `yaml:"body_not_contains"` + TimeoutSeconds int `yaml:"timeout_seconds"` + InsecureSkipTLS bool `yaml:"insecure_skip_tls"` } 
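+
+// Illustrative only: one way the new startup checks could be expressed in
+// ananke.yaml. Key names come from the yaml tags on Startup and
+// ServiceChecklistCheck above; the values simply mirror the shipped defaults
+// and test fixtures, not a required setup.
+//
+//	startup:
+//	  minimum_battery_percent: 20
+//	  require_service_checklist: true
+//	  service_checklist:
+//	    - name: gitea-api
+//	      url: https://scm.bstein.dev/api/healthz
+//	      accepted_statuses: [200]
+//	      body_contains: pass
+//	      timeout_seconds: 12
+//	    - name: grafana-api
+//	      url: https://metrics.bstein.dev/api/health
+//	      accepted_statuses: [200]
+//	      body_contains: '"database":"ok"'
+//	      timeout_seconds: 12
+//	  ignore_workloads:
+//	    - maintenance/metis
+//	    - crypto/statefulset/monerod
+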
type Shutdown struct { @@ -143,6 +174,9 @@ func (c Config) Validate() error { if c.ExpectedFluxBranch == "" { return fmt.Errorf("config.expected_flux_branch must not be empty") } + if c.ExpectedFluxSource == "" { + return fmt.Errorf("config.expected_flux_source_url must not be empty") + } if c.IACRepoPath == "" { return fmt.Errorf("config.iac_repo_path must not be empty") } @@ -176,6 +210,25 @@ func (c Config) Validate() error { if c.Startup.ShutdownCooldownSeconds <= 0 { return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0") } + if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 { + return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100") + } + for node, labels := range c.Startup.RequiredNodeLabels { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.required_node_labels keys must not be empty") + } + if len(labels) == 0 { + return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node) + } + for key, value := range labels { + if strings.TrimSpace(key) == "" { + return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node) + } + if strings.TrimSpace(value) == "" { + return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key) + } + } + } if c.Startup.TimeSyncWaitSeconds <= 0 { return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0") } @@ -223,11 +276,88 @@ func (c Config) Validate() error { if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 { return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true") } + if c.Startup.ServiceChecklistWaitSeconds <= 0 { + return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0") + } + if c.Startup.ServiceChecklistPollSeconds <= 0 { + return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0") + } + if c.Startup.ServiceChecklistStabilitySec < 0 { + return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0") + } + if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 { + return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true") + } + for i, check := range c.Startup.ServiceChecklist { + if strings.TrimSpace(check.Name) == "" { + return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i) + } + rawURL := strings.TrimSpace(check.URL) + if rawURL == "" { + return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i) + } + parsed, err := neturl.Parse(rawURL) + if err != nil || parsed.Scheme == "" || parsed.Host == "" { + return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL) + } + if check.TimeoutSeconds <= 0 { + return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i) + } + for _, code := range check.AcceptedStatuses { + if code < 100 || code > 599 { + return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code) + } + } + } + if c.Startup.FluxHealthWaitSeconds <= 0 { + return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0") + } + if c.Startup.FluxHealthPollSeconds <= 0 { + return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0") + } + if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { + return 
fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0") + } + if c.Startup.WorkloadConvergencePollSeconds <= 0 { + return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0") + } + if c.Startup.StuckPodGraceSeconds <= 0 { + return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0") + } for _, probe := range c.Startup.PostStartProbes { if strings.TrimSpace(probe) == "" { return fmt.Errorf("config.startup.post_start_probes entries must not be empty") } } + for _, item := range c.Startup.IgnoreFluxKustomizations { + item = strings.TrimSpace(item) + if item == "" { + return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty") + } + if strings.Count(item, "/") != 1 { + return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item) + } + } + for _, item := range c.Startup.IgnoreWorkloads { + item = strings.TrimSpace(item) + if item == "" { + return fmt.Errorf("config.startup.ignore_workloads entries must not be empty") + } + parts := strings.Split(item, "/") + if len(parts) != 2 && len(parts) != 3 { + return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item) + } + } + for _, ns := range c.Startup.IgnoreWorkloadNamespaces { + if strings.TrimSpace(ns) == "" { + return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty") + } + } + for _, node := range c.Startup.IgnoreUnavailableNodes { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty") + } + } if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty") } @@ -276,6 +406,7 @@ func defaults() Config { c := Config{ IACRepoPath: "/opt/titan-iac", ExpectedFluxBranch: "main", + ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git", SSHPort: 2277, ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, LocalBootstrapPaths: []string{ @@ -328,16 +459,54 @@ func defaults() Config { "gitea/gitea-data", "sso/keycloak-data", }, + MinimumBatteryPercent: 20, + RequiredNodeLabels: map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + }, RequirePostStartProbes: true, PostStartProbeWaitSeconds: 240, PostStartProbePollSeconds: 5, PostStartProbes: []string{ "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", - "https://scm.bstein.dev/user/login", - "https://metrics.bstein.dev/login", + "https://scm.bstein.dev/api/healthz", + "https://metrics.bstein.dev/api/health", }, - VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key", - VaultUnsealBreakglassTimeout: 15, + RequireServiceChecklist: true, + ServiceChecklistWaitSeconds: 420, + ServiceChecklistPollSeconds: 5, + ServiceChecklistStabilitySec: 120, + ServiceChecklist: []ServiceChecklistCheck{ + { + Name: "gitea-api", + URL: "https://scm.bstein.dev/api/healthz", + AcceptedStatuses: []int{200}, + BodyContains: "pass", + TimeoutSeconds: 12, + }, + { + Name: "grafana-api", + URL: "https://metrics.bstein.dev/api/health", + AcceptedStatuses: []int{200}, + BodyContains: "\"database\":\"ok\"", + TimeoutSeconds: 12, + }, + }, + RequireFluxHealth: true, + FluxHealthWaitSeconds: 900, + FluxHealthPollSeconds: 5, + IgnoreFluxKustomizations: []string{}, + RequireWorkloadConvergence: true, + WorkloadConvergenceWaitSeconds: 900, + WorkloadConvergencePollSeconds: 5, + 
IgnoreWorkloadNamespaces: []string{}, + IgnoreWorkloads: []string{}, + IgnoreUnavailableNodes: []string{}, + AutoRecycleStuckPods: true, + StuckPodGraceSeconds: 180, + VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", + VaultUnsealBreakglassTimeout: 15, }, Shutdown: Shutdown{ DefaultBudgetSeconds: 1380, @@ -362,7 +531,7 @@ func defaults() Config { TelemetryTimeoutSeconds: 90, }, Coordination: Coordination{ - ForwardShutdownConfig: "/etc/hecate/hecate.yaml", + ForwardShutdownConfig: "/etc/ananke/ananke.yaml", PeerHosts: []string{}, FallbackLocalShutdown: true, CommandTimeoutSeconds: 25, @@ -376,10 +545,10 @@ func defaults() Config { Path: "/metrics", }, State: State{ - Dir: "/var/lib/hecate", - RunHistoryPath: "/var/lib/hecate/runs.json", - LockPath: "/var/lib/hecate/hecate.lock", - IntentPath: "/var/lib/hecate/intent.json", + Dir: "/var/lib/ananke", + RunHistoryPath: "/var/lib/ananke/runs.json", + LockPath: "/var/lib/ananke/ananke.lock", + IntentPath: "/var/lib/ananke/intent.json", }, } c.applyDefaults() @@ -393,6 +562,9 @@ func (c *Config) applyDefaults() { if c.IACRepoPath == "" { c.IACRepoPath = "/opt/titan-iac" } + if c.ExpectedFluxSource == "" { + c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git" + } if c.Startup.APIWaitSeconds <= 0 { c.Startup.APIWaitSeconds = 1200 } @@ -402,6 +574,16 @@ func (c *Config) applyDefaults() { if c.Startup.ShutdownCooldownSeconds <= 0 { c.Startup.ShutdownCooldownSeconds = 45 } + if c.Startup.MinimumBatteryPercent <= 0 { + c.Startup.MinimumBatteryPercent = 20 + } + if c.Startup.RequiredNodeLabels == nil { + c.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + } + } if c.Startup.TimeSyncWaitSeconds <= 0 { c.Startup.TimeSyncWaitSeconds = 240 } @@ -446,12 +628,71 @@ func (c *Config) applyDefaults() { if len(c.Startup.PostStartProbes) == 0 { c.Startup.PostStartProbes = []string{ "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", - "https://scm.bstein.dev/user/login", - "https://metrics.bstein.dev/login", + "https://scm.bstein.dev/api/healthz", + "https://metrics.bstein.dev/api/health", } } + if c.Startup.ServiceChecklistWaitSeconds <= 0 { + c.Startup.ServiceChecklistWaitSeconds = 420 + } + if c.Startup.ServiceChecklistPollSeconds <= 0 { + c.Startup.ServiceChecklistPollSeconds = 5 + } + if c.Startup.ServiceChecklistStabilitySec < 0 { + c.Startup.ServiceChecklistStabilitySec = 0 + } + if len(c.Startup.ServiceChecklist) == 0 { + c.Startup.ServiceChecklist = []ServiceChecklistCheck{ + { + Name: "gitea-api", + URL: "https://scm.bstein.dev/api/healthz", + AcceptedStatuses: []int{200}, + BodyContains: "pass", + TimeoutSeconds: 12, + }, + { + Name: "grafana-api", + URL: "https://metrics.bstein.dev/api/health", + AcceptedStatuses: []int{200}, + BodyContains: "\"database\":\"ok\"", + TimeoutSeconds: 12, + }, + } + } + for i := range c.Startup.ServiceChecklist { + if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 { + c.Startup.ServiceChecklist[i].TimeoutSeconds = 12 + } + } + if c.Startup.FluxHealthWaitSeconds <= 0 { + c.Startup.FluxHealthWaitSeconds = 900 + } + if c.Startup.FluxHealthPollSeconds <= 0 { + c.Startup.FluxHealthPollSeconds = 5 + } + if c.Startup.IgnoreFluxKustomizations == nil { + c.Startup.IgnoreFluxKustomizations = []string{} + } + if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { + c.Startup.WorkloadConvergenceWaitSeconds = 900 + } + if c.Startup.WorkloadConvergencePollSeconds <= 0 { + 
c.Startup.WorkloadConvergencePollSeconds = 5 + } + if c.Startup.IgnoreWorkloadNamespaces == nil { + c.Startup.IgnoreWorkloadNamespaces = []string{} + } + if c.Startup.IgnoreWorkloads == nil { + c.Startup.IgnoreWorkloads = []string{} + } + if c.Startup.IgnoreUnavailableNodes == nil { + c.Startup.IgnoreUnavailableNodes = []string{} + } + if c.Startup.StuckPodGraceSeconds <= 0 { + c.Startup.StuckPodGraceSeconds = 180 + } if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { - c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key" + c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key" } if c.Startup.VaultUnsealBreakglassTimeout <= 0 { c.Startup.VaultUnsealBreakglassTimeout = 15 @@ -496,7 +737,7 @@ func (c *Config) applyDefaults() { c.UPS.TelemetryTimeoutSeconds = 90 } if c.Coordination.ForwardShutdownConfig == "" { - c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml" + c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml" } if c.Coordination.PeerHosts == nil { c.Coordination.PeerHosts = []string{} @@ -517,15 +758,15 @@ func (c *Config) applyDefaults() { c.Metrics.Path = "/metrics" } if c.State.Dir == "" { - c.State.Dir = "/var/lib/hecate" + c.State.Dir = "/var/lib/ananke" } if c.State.RunHistoryPath == "" { - c.State.RunHistoryPath = "/var/lib/hecate/runs.json" + c.State.RunHistoryPath = "/var/lib/ananke/runs.json" } if c.State.LockPath == "" { - c.State.LockPath = "/var/lib/hecate/hecate.lock" + c.State.LockPath = "/var/lib/ananke/ananke.lock" } if c.State.IntentPath == "" { - c.State.IntentPath = "/var/lib/hecate/intent.json" + c.State.IntentPath = "/var/lib/ananke/intent.json" } } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 3c21f76..d4af8fa 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -9,7 +9,7 @@ import ( func TestLoadAcceptsUPSTargets(t *testing.T) { tmp := t.TempDir() - cfgPath := filepath.Join(tmp, "hecate.yaml") + cfgPath := filepath.Join(tmp, "ananke.yaml") raw := ` control_planes: [titan-0a, titan-0b, titan-0c] expected_flux_branch: main @@ -24,7 +24,7 @@ shutdown: default_budget_seconds: 300 state: run_history_path: /tmp/runs.json - lock_path: /tmp/hecate.lock + lock_path: /tmp/ananke.lock ` if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil { t.Fatalf("write config: %v", err) @@ -74,7 +74,7 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) { func TestLoadSetsCoordinationGuardDefaults(t *testing.T) { tmp := t.TempDir() - cfgPath := filepath.Join(tmp, "hecate.yaml") + cfgPath := filepath.Join(tmp, "ananke.yaml") raw := ` control_planes: [titan-0a, titan-0b, titan-0c] expected_flux_branch: main @@ -85,7 +85,7 @@ ups: enabled: false state: run_history_path: /tmp/runs.json - lock_path: /tmp/hecate.lock + lock_path: /tmp/ananke.lock ` if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil { t.Fatalf("write config: %v", err) @@ -146,3 +146,55 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) { t.Fatalf("expected validation error when post start probes are required but empty") } } + +func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) { + cfg := defaults() + cfg.Startup.RequireServiceChecklist = true + cfg.Startup.ServiceChecklist = nil + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error when service checklist is required but empty") + } +} + +func TestValidateRejectsBadServiceChecklistURL(t *testing.T) { 
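+	// "not-a-url" parses without error but has no scheme or host, so Validate must reject it.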
+ cfg := defaults() + cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{ + { + Name: "grafana", + URL: "not-a-url", + AcceptedStatuses: []int{200}, + TimeoutSeconds: 12, + }, + } + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid service checklist url") + } +} + +func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) { + cfg := defaults() + cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"} + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid ignore_flux_kustomizations entry") + } +} + +func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) { + cfg := defaults() + cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"} + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid ignore_workloads entry") + } +} + +func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) { + cfg := defaults() + cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "": "true", + }, + } + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid required_node_labels entry") + } +} diff --git a/internal/metrics/exporter.go b/internal/metrics/exporter.go index 8fc951f..4b26241 100644 --- a/internal/metrics/exporter.go +++ b/internal/metrics/exporter.go @@ -84,41 +84,41 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") var b strings.Builder - b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n") - b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n") - b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)) - b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n") - b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n") - b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers)) - b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n") - b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n") + b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n") + b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n") + b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)) + b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n") + b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n") + b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers)) + b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n") + b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n") if e.lastShutdownAt.IsZero() { - b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n") + b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n") } else { - b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix())) + b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix())) } - b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n") - b.WriteString("# 
TYPE hecate_ups_on_battery gauge\n") - b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n") - b.WriteString("# TYPE hecate_ups_low_battery gauge\n") - b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n") - b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n") - b.WriteString("# HELP hecate_ups_battery_charge_percent Battery charge percentage reported by UPS.\n") - b.WriteString("# TYPE hecate_ups_battery_charge_percent gauge\n") - b.WriteString("# HELP hecate_ups_load_percent UPS output load percentage.\n") - b.WriteString("# TYPE hecate_ups_load_percent gauge\n") - b.WriteString("# HELP hecate_ups_power_nominal_watts UPS nominal power rating in watts.\n") - b.WriteString("# TYPE hecate_ups_power_nominal_watts gauge\n") - b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n") - b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n") - b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n") - b.WriteString("# TYPE hecate_ups_trigger_active gauge\n") - b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n") - b.WriteString("# TYPE hecate_ups_breach_count gauge\n") - b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n") - b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n") - b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n") - b.WriteString("# TYPE hecate_ups_error gauge\n") + b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n") + b.WriteString("# TYPE ananke_ups_on_battery gauge\n") + b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n") + b.WriteString("# TYPE ananke_ups_low_battery gauge\n") + b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n") + b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n") + b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n") + b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n") + b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n") + b.WriteString("# TYPE ananke_ups_load_percent gauge\n") + b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n") + b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n") + b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n") + b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n") + b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n") + b.WriteString("# TYPE ananke_ups_trigger_active gauge\n") + b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n") + b.WriteString("# TYPE ananke_ups_breach_count gauge\n") + b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n") + b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n") + b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n") + b.WriteString("# TYPE ananke_ups_error gauge\n") names := make([]string, 0, len(e.samples)) for name := range e.samples { @@ -129,21 +129,21 @@ func (e *Exporter) 
serveMetrics(w http.ResponseWriter, _ *http.Request) { s := e.samples[name] labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}", safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason)) - b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))) - b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))) - b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)) - b.WriteString(fmt.Sprintf("hecate_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge)) - b.WriteString(fmt.Sprintf("hecate_ups_load_percent%s %.2f\n", labels, s.LoadPercent)) - b.WriteString(fmt.Sprintf("hecate_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW)) - b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)) - b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))) - b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount)) + b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))) + b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))) + b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)) + b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge)) + b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent)) + b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW)) + b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)) + b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))) + b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, s.BreachCount)) if s.UpdatedAt.IsZero() { - b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels)) + b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels)) } else { - b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix())) + b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix())) } - b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != ""))) + b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != ""))) } _, _ = w.Write([]byte(b.String())) diff --git a/internal/metrics/exporter_test.go b/internal/metrics/exporter_test.go index 9c30c7f..48835cc 100644 --- a/internal/metrics/exporter_test.go +++ b/internal/metrics/exporter_test.go @@ -33,14 +33,14 @@ func TestExporterEmitsCoreMetrics(t *testing.T) { body := rr.Body.String() mustContain := []string{ - "hecate_shutdown_budget_seconds 321", - "hecate_shutdown_triggers_total 1", - "hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", - "hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", - "hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", - "hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", - "hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", - "hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", + "ananke_shutdown_budget_seconds 321", + "ananke_shutdown_triggers_total 1", + 
"ananke_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", + "ananke_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", + "ananke_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", + "ananke_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", + "ananke_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", + "ananke_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", } for _, m := range mustContain { if !strings.Contains(body, m) { diff --git a/internal/service/daemon.go b/internal/service/daemon.go index 77fe119..1362599 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -12,12 +12,12 @@ import ( "strings" "time" - "scm.bstein.dev/bstein/hecate/internal/cluster" - "scm.bstein.dev/bstein/hecate/internal/config" - "scm.bstein.dev/bstein/hecate/internal/metrics" - "scm.bstein.dev/bstein/hecate/internal/sshutil" - "scm.bstein.dev/bstein/hecate/internal/state" - "scm.bstein.dev/bstein/hecate/internal/ups" + "scm.bstein.dev/bstein/ananke/internal/cluster" + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/metrics" + "scm.bstein.dev/bstein/ananke/internal/sshutil" + "scm.bstein.dev/bstein/ananke/internal/state" + "scm.bstein.dev/bstein/ananke/internal/ups" ) type Target struct { @@ -81,7 +81,7 @@ func (d *Daemon) Run(ctx context.Context) error { lastGood[t.Name] = time.Now() } - d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s", + d.log.Printf("ananke daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s", poll, debounce, telemetryTimeout, d.targetList()) for { @@ -198,7 +198,7 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error { runCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() - remoteCmd := fmt.Sprintf("sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason) + remoteCmd := fmt.Sprintf("sudo /usr/local/bin/ananke shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason) if d.cfg.Shutdown.EmergencySkipEtcd { remoteCmd += " --skip-etcd-snapshot" } diff --git a/internal/state/intent_parse.go b/internal/state/intent_parse.go index 7001d06..9a28852 100644 --- a/internal/state/intent_parse.go +++ b/internal/state/intent_parse.go @@ -6,7 +6,7 @@ import ( "time" ) -// ParseIntentOutput parses `hecate intent` CLI output from local/remote commands. +// ParseIntentOutput parses `ananke intent` CLI output from local/remote commands. 
func ParseIntentOutput(raw string) (Intent, error) { for _, line := range strings.Split(raw, "\n") { line = strings.TrimSpace(line) diff --git a/internal/state/intent_test.go b/internal/state/intent_test.go index 50d0c7c..4f66419 100644 --- a/internal/state/intent_test.go +++ b/internal/state/intent_test.go @@ -61,7 +61,7 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) { } func TestParseIntentOutputParsesStructuredLine(t *testing.T) { - raw := `[hecate] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z` + raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z` in, err := ParseIntentOutput(raw) if err != nil { t.Fatalf("parse intent output: %v", err) @@ -81,7 +81,7 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) { } func TestParseIntentOutputHandlesNone(t *testing.T) { - in, err := ParseIntentOutput(`[hecate] 2026/04/05 11:24:49 intent=none`) + in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`) if err != nil { t.Fatalf("parse none intent output: %v", err) } diff --git a/internal/state/store_test.go b/internal/state/store_test.go index fdc552b..f88cb5f 100644 --- a/internal/state/store_test.go +++ b/internal/state/store_test.go @@ -11,7 +11,7 @@ import ( ) func TestAcquireLockLifecycle(t *testing.T) { - lockPath := filepath.Join(t.TempDir(), "hecate.lock") + lockPath := filepath.Join(t.TempDir(), "ananke.lock") unlock, err := AcquireLock(lockPath) if err != nil { t.Fatalf("acquire lock: %v", err) @@ -26,7 +26,7 @@ func TestAcquireLockLifecycle(t *testing.T) { } func TestAcquireLockReclaimsStaleLock(t *testing.T) { - lockPath := filepath.Join(t.TempDir(), "hecate.lock") + lockPath := filepath.Join(t.TempDir(), "ananke.lock") if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil { t.Fatalf("write stale lock: %v", err) } @@ -47,7 +47,7 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) { } func TestAcquireLockRejectsActiveLock(t *testing.T) { - lockPath := filepath.Join(t.TempDir(), "hecate.lock") + lockPath := filepath.Join(t.TempDir(), "ananke.lock") active := "pid=" + strconv.Itoa(os.Getpid()) + "\n" if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil { t.Fatalf("write active lock: %v", err) diff --git a/internal/ups/nut_test.go b/internal/ups/nut_test.go index 3f83c4f..7b613bc 100644 --- a/internal/ups/nut_test.go +++ b/internal/ups/nut_test.go @@ -3,7 +3,7 @@ package ups import "testing" func TestParseNUT(t *testing.T) { -raw := `battery.runtime: 384 + raw := `battery.runtime: 384 battery.charge: 72 ups.load: 19 ups.realpower.nominal: 510 diff --git a/scripts/hecate-drills.sh b/scripts/ananke-drills.sh similarity index 83% rename from scripts/hecate-drills.sh rename to scripts/ananke-drills.sh index 51e20cc..583415c 100755 --- a/scripts/hecate-drills.sh +++ b/scripts/ananke-drills.sh @@ -2,23 +2,23 @@ set -Eeuo pipefail KUBECTL="${KUBECTL:-kubectl}" -HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}" -HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}" -HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}" -HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}" -LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}" -STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}" -SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}" -SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}" 
-STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}" -STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}" +ANANKE_COORDINATOR_HOST="${ANANKE_COORDINATOR_HOST:-titan-db}" +ANANKE_BIN="${ANANKE_BIN:-/usr/local/bin/ananke}" +ANANKE_CONFIG="${ANANKE_CONFIG:-/etc/ananke/ananke.yaml}" +ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}" +LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}" +STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}" +SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}" +SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}" +STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}" +STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}" EXECUTE=0 usage() { cat <<'EOF' Usage: - scripts/hecate-drills.sh list - scripts/hecate-drills.sh run [--execute] + scripts/ananke-drills.sh list + scripts/ananke-drills.sh run [--execute] Drills: flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery. @@ -30,7 +30,7 @@ Drills: Notes: - Drills are intentionally disruptive and are not part of regular `make test`. - Use --execute to run live changes. Without it, this script prints planned actions only. - - Optional relay: set HECATE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host. + - Optional relay: set ANANKE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host. EOF } @@ -98,47 +98,47 @@ wait_ready_keycloak() { die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)" } -run_hecate_startup() { +run_ananke_startup() { local reason="$1" - local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}") + local cmd=(sudo "${ANANKE_BIN}" startup --config "${ANANKE_CONFIG}" --execute --force-flux-branch main --reason "${reason}") if [[ "${EXECUTE}" -eq 0 ]]; then - if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then - log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'" + if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then + log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'" else - log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'" + log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'" fi return 0 fi - if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then + if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then # shellcheck disable=SC2086 - timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}" + timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}" else - timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}" + timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}" fi } -run_hecate_shutdown() { +run_ananke_shutdown() { local reason="$1" - local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}") + local cmd=(sudo "${ANANKE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}") if [[ "${EXECUTE}" -eq 0 ]]; then - if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then - log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'" + if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then + log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'" else - log "plan: ssh 
${HECATE_COORDINATOR_HOST} '${cmd[*]}'" + log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'" fi return 0 fi - if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then + if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then # shellcheck disable=SC2086 - timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}" + timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}" else - timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}" + timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}" fi } -run_hecate_startup_with_retry() { +run_ananke_startup_with_retry() { local reason="$1" - local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason ${reason}" + local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason ${reason}" if [[ "${EXECUTE}" -eq 0 ]]; then log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s" @@ -161,11 +161,11 @@ run_hecate_startup_with_retry() { run_coordinator_bash() { local script="$1" - if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then + if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then # shellcheck disable=SC2086 - printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "bash -se" + printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "bash -se" else - printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" "bash -se" + printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" "bash -se" fi } @@ -283,7 +283,7 @@ write_log_header() { mkdir -p "${LOG_DIR}" local f="${LOG_DIR}/${drill}-$(now_ts).log" exec > >(tee -a "${f}") 2>&1 - log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}" + log "drill=${drill} execute=${EXECUTE} coordinator=${ANANKE_COORDINATOR_HOST}" } run_drill_flux_gitea_deadlock() { @@ -303,7 +303,7 @@ run_drill_flux_gitea_deadlock() { scale_to "$ns" "$kind" "$name" 0 done - run_hecate_startup "drill-flux-gitea-deadlock" + run_ananke_startup "drill-flux-gitea-deadlock" log "verifying recovery" wait_ready flux-system deployment source-controller 240s @@ -330,7 +330,7 @@ run_drill_foundation_recovery() { scale_to "$ns" "$kind" "$name" 0 done - run_hecate_startup "drill-foundation-recovery" + run_ananke_startup "drill-foundation-recovery" log "verifying layered recovery" wait_ready vault statefulset vault 420s @@ -350,7 +350,7 @@ run_drill_reconciliation_resume() { set_flux_suspend_all true scale_to flux-system deployment source-controller 0 - run_hecate_startup "drill-reconciliation-resume" + run_ananke_startup "drill-reconciliation-resume" log "verifying reconciliation resumed" wait_ready flux-system deployment source-controller 240s @@ -361,8 +361,8 @@ run_drill_reconciliation_resume() { } run_drill_startup_intent_guard() { - local intent_path="/var/lib/hecate/intent.json" - local backup_path="/tmp/hecate-intent-pre-drill.json" + local intent_path="/var/lib/ananke/intent.json" + local backup_path="/tmp/ananke-intent-pre-drill.json" local inject_cmd=" if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi cat <<'JSON' | sudo tee '${intent_path}' >/dev/null @@ -376,12 +376,12 @@ else sudo rm -f '${intent_path}' fi " - local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute 
--force-flux-branch main --reason drill-startup-intent-guard" + local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard" if [[ "${EXECUTE}" -eq 0 ]]; then - log "plan: ssh ${HECATE_COORDINATOR_HOST} ''" - log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)" - log "plan: ssh ${HECATE_COORDINATOR_HOST} ''" + log "plan: ssh ${ANANKE_COORDINATOR_HOST} ''" + log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)" + log "plan: ssh ${ANANKE_COORDINATOR_HOST} ''" log "pass: startup-intent-guard (plan mode)" return 0 fi @@ -406,10 +406,10 @@ run_drill_controlled_cycle() { fi log "running controlled shutdown cycle (poweroff disabled config)" - run_hecate_shutdown "drill-controlled-cycle-shutdown" + run_ananke_shutdown "drill-controlled-cycle-shutdown" log "running startup recovery cycle" - run_hecate_startup_with_retry "drill-controlled-cycle-startup" + run_ananke_startup_with_retry "drill-controlled-cycle-startup" log "verifying critical stack readiness after cycle" wait_ready flux-system deployment source-controller 240s diff --git a/scripts/hecate-self-update.sh b/scripts/ananke-self-update.sh similarity index 69% rename from scripts/hecate-self-update.sh rename to scripts/ananke-self-update.sh index ff66f80..4ae76b7 100644 --- a/scripts/hecate-self-update.sh +++ b/scripts/ananke-self-update.sh @@ -2,13 +2,13 @@ set -euo pipefail if [[ "${EUID}" -ne 0 ]]; then - echo "hecate-self-update.sh must run as root" >&2 + echo "ananke-self-update.sh must run as root" >&2 exit 1 fi -REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}" -BRANCH="${HECATE_REPO_BRANCH:-main}" -REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}" +REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}" +BRANCH="${ANANKE_REPO_BRANCH:-main}" +REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}" mkdir -p "$(dirname "${REPO_DIR}")" if [[ ! -d "${REPO_DIR}/.git" ]]; then