rename runtime surfaces from hecate to ananke

This commit is contained in:
Brad Stein 2026-04-07 13:13:58 -03:00
parent 169324ef4a
commit c605a083ee
23 changed files with 1938 additions and 256 deletions

View File

@ -1,7 +1,7 @@
.PHONY: build test fmt tidy install drill-list drill-run .PHONY: build test fmt tidy install drill-list drill-run
build: build:
go build -o dist/hecate ./cmd/hecate go build -o dist/ananke ./cmd/ananke
test: test:
go test ./... go test ./...
@ -16,7 +16,7 @@ install:
sudo ./scripts/install.sh sudo ./scripts/install.sh
drill-list: drill-list:
./scripts/hecate-drills.sh list ./scripts/ananke-drills.sh list
drill-run: drill-run:
./scripts/hecate-drills.sh run $(DRILL) --execute ./scripts/ananke-drills.sh run $(DRILL) --execute

View File

@ -6,6 +6,7 @@ import (
"flag" "flag"
"fmt" "fmt"
"log" "log"
"math"
"os" "os"
"os/exec" "os/exec"
"os/signal" "os/signal"
@ -14,17 +15,17 @@ import (
"syscall" "syscall"
"time" "time"
"scm.bstein.dev/bstein/hecate/internal/cluster" "scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/hecate/internal/config" "scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/hecate/internal/execx" "scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/hecate/internal/service" "scm.bstein.dev/bstein/ananke/internal/service"
"scm.bstein.dev/bstein/hecate/internal/sshutil" "scm.bstein.dev/bstein/ananke/internal/sshutil"
"scm.bstein.dev/bstein/hecate/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
"scm.bstein.dev/bstein/hecate/internal/ups" "scm.bstein.dev/bstein/ananke/internal/ups"
) )
func main() { func main() {
logger := log.New(os.Stdout, "[hecate] ", log.LstdFlags) logger := log.New(os.Stdout, "[ananke] ", log.LstdFlags)
if len(os.Args) < 2 { if len(os.Args) < 2 {
usage() usage()
os.Exit(2) os.Exit(2)
@ -73,7 +74,7 @@ func main() {
func runStartup(logger *log.Logger, args []string) error { func runStartup(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("startup", flag.ExitOnError) fs := flag.NewFlagSet("startup", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)") execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume") forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies") skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
@ -124,7 +125,7 @@ func runStartup(logger *log.Logger, args []string) error {
} }
checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel() defer cancel()
if err := ensureStartupPowerSafe(checkCtx, targets); err != nil { if err := ensureStartupPowerSafe(checkCtx, targets, cfg.Startup.MinimumBatteryPercent); err != nil {
return err return err
} }
} }
@ -141,10 +142,11 @@ func runStartup(logger *log.Logger, args []string) error {
func runShutdown(logger *log.Logger, args []string) error { func runShutdown(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("shutdown", flag.ExitOnError) fs := flag.NewFlagSet("shutdown", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)") execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot") skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain") skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
mode := fs.String("mode", "config", "Shutdown mode: config|cluster-only|poweroff")
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history") reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
_ = fs.Parse(args) _ = fs.Parse(args)
@ -158,13 +160,14 @@ func runShutdown(logger *log.Logger, args []string) error {
return orch.Shutdown(ctx, cluster.ShutdownOptions{ return orch.Shutdown(ctx, cluster.ShutdownOptions{
SkipEtcdSnapshot: *skipEtcd, SkipEtcdSnapshot: *skipEtcd,
SkipDrain: *skipDrain, SkipDrain: *skipDrain,
Mode: *mode,
Reason: *reason, Reason: *reason,
}) })
} }
func runDaemon(logger *log.Logger, args []string) error { func runDaemon(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("daemon", flag.ExitOnError) fs := flag.NewFlagSet("daemon", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing") dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing")
_ = fs.Parse(args) _ = fs.Parse(args)
@ -191,7 +194,7 @@ func runDaemon(logger *log.Logger, args []string) error {
func runEtcdRestore(logger *log.Logger, args []string) error { func runEtcdRestore(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError) fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)") execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)") controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)") snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
@ -211,7 +214,7 @@ func runEtcdRestore(logger *log.Logger, args []string) error {
func runStatus(logger *log.Logger, args []string) error { func runStatus(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("status", flag.ExitOnError) fs := flag.NewFlagSet("status", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
_ = fs.Parse(args) _ = fs.Parse(args)
cfg, orch, err := buildOrchestrator(logger, *configPath, true) cfg, orch, err := buildOrchestrator(logger, *configPath, true)
@ -246,7 +249,7 @@ func runStatus(logger *log.Logger, args []string) error {
func runIntent(logger *log.Logger, args []string) error { func runIntent(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("intent", flag.ExitOnError) fs := flag.NewFlagSet("intent", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file") configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)") setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)")
reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)") reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)")
source := fs.String("source", "manual", "Intent source (used with --set)") source := fs.String("source", "manual", "Intent source (used with --set)")
@ -314,7 +317,7 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
return targets, nil return targets, nil
} }
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error { func ensureStartupPowerSafe(ctx context.Context, targets []service.Target, minimumBatteryPercent float64) error {
type targetState struct { type targetState struct {
seenGood bool seenGood bool
lastErr error lastErr error
@ -327,6 +330,7 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
const pollInterval = 3 * time.Second const pollInterval = 3 * time.Second
for { for {
onBatteryTargets := []string{} onBatteryTargets := []string{}
lowChargeTargets := []string{}
allSeen := true allSeen := true
for _, t := range targets { for _, t := range targets {
key := t.Name + "|" + t.Target key := t.Name + "|" + t.Target
@ -344,10 +348,25 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
if sample.OnBattery { if sample.OnBattery {
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds)) onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
} }
if minimumBatteryPercent > 0 && sample.BatteryCharge > 0 && sample.BatteryCharge < minimumBatteryPercent {
lowChargeTargets = append(
lowChargeTargets,
fmt.Sprintf(
"%s(charge=%.1f%%<%.1f%% status=%s)",
t.Name,
sample.BatteryCharge,
minimumBatteryPercent,
sample.RawStatus,
),
)
}
} }
if len(onBatteryTargets) > 0 { if len(onBatteryTargets) > 0 {
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", ")) return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
} }
if len(lowChargeTargets) > 0 {
return fmt.Errorf("startup blocked: UPS battery charge below minimum for %s", strings.Join(lowChargeTargets, ", "))
}
if allSeen { if allSeen {
return nil return nil
} }
@ -366,7 +385,8 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target)) unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
} }
} }
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | ")) roundedMin := math.Round(minimumBatteryPercent*10) / 10
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout (minimum_battery_percent=%.1f): %s", roundedMin, strings.Join(unverified, " | "))
case <-time.After(pollInterval): case <-time.After(pollInterval):
} }
} }
@ -391,26 +411,26 @@ func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.
} }
func usage() { func usage() {
fmt.Print(`hecate: staged startup/shutdown + UPS-triggered protection fmt.Print(`ananke: staged startup/shutdown + UPS-triggered protection
Usage: Usage:
hecate <command> [flags] ananke <command> [flags]
Commands: Commands:
startup Perform staged cluster startup startup Perform staged cluster startup
shutdown Perform graceful cluster shutdown shutdown Perform graceful cluster shutdown
etcd-restore Restore etcd from snapshot on a control plane etcd-restore Restore etcd from snapshot on a control plane
daemon Monitor UPS and auto-trigger shutdown daemon Monitor UPS and auto-trigger shutdown
status Print current hecate status and estimates status Print current ananke status and estimates
intent Read or manually set intent state intent Read or manually set intent state
Examples: Examples:
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance" ananke shutdown --config /etc/ananke/ananke.yaml --execute --reason "manual-maintenance"
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute ananke etcd-restore --config /etc/ananke/ananke.yaml --execute
hecate daemon --config /etc/hecate/hecate.yaml ananke daemon --config /etc/ananke/ananke.yaml
hecate status --config /etc/hecate/hecate.yaml ananke status --config /etc/ananke/ananke.yaml
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute ananke intent --config /etc/ananke/ananke.yaml --set normal --reason "manual-clear" --execute
`) `)
} }
@ -439,7 +459,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
args := buildSSHBaseArgs(cfg) args := buildSSHBaseArgs(cfg)
remote := "sudo -n systemctl start hecate-bootstrap.service" remote := "sudo -n systemctl start ananke-bootstrap.service"
attempt := 1 attempt := 1
for { for {
cmdArgs := append(append([]string{}, args...), target, remote) cmdArgs := append(append([]string{}, args...), target, remote)
@ -480,7 +500,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
if user != "" { if user != "" {
target = user + "@" + host target = user + "@" + host
} }
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/hecate intent --config /etc/hecate/hecate.yaml" remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml"
args := append(buildSSHBaseArgs(cfg), target, remoteCmd) args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost}) out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost})
if err != nil { if err != nil {
@ -488,7 +508,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
return true, "coordinator unreachable", nil return true, "coordinator unreachable", nil
} }
trimmed := strings.TrimSpace(out) trimmed := strings.TrimSpace(out)
if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") { if strings.Contains(trimmed, "__ANANKE_BOOTSTRAP_ACTIVE__") {
return false, "coordinator bootstrap service is active", nil return false, "coordinator bootstrap service is active", nil
} }
remoteIntent, parseErr := state.ParseIntentOutput(trimmed) remoteIntent, parseErr := state.ParseIntentOutput(trimmed)

View File

@ -1,5 +1,5 @@
# /etc/hecate/hecate.yaml # /etc/ananke/ananke.yaml
kubeconfig: /etc/hecate/kubeconfig kubeconfig: /etc/ananke/kubeconfig
ssh_user: atlas ssh_user: atlas
ssh_port: 2277 ssh_port: 2277
ssh_config_file: "" ssh_config_file: ""
@ -11,6 +11,7 @@ ssh_jump_host: ""
ssh_jump_user: "" ssh_jump_user: ""
iac_repo_path: /opt/titan-iac iac_repo_path: /opt/titan-iac
expected_flux_branch: main expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
control_planes: control_planes:
- titan-0a - titan-0a
- titan-0b - titan-0b
@ -46,6 +47,10 @@ startup:
api_wait_seconds: 1200 api_wait_seconds: 1200
api_poll_seconds: 2 api_poll_seconds: 2
shutdown_cooldown_seconds: 45 shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true require_time_sync: true
time_sync_wait_seconds: 240 time_sync_wait_seconds: 240
time_sync_poll_seconds: 5 time_sync_poll_seconds: 5
@ -67,9 +72,36 @@ startup:
post_start_probe_wait_seconds: 240 post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5 post_start_probe_poll_seconds: 5
post_start_probes: post_start_probes:
- https://scm.bstein.dev/user/login - https://scm.bstein.dev/api/healthz
- https://metrics.bstein.dev/login - https://metrics.bstein.dev/api/health
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key require_service_checklist: true
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
- name: grafana-api
url: https://metrics.bstein.dev/api/health
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
stuck_pod_grace_seconds: 180
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "" vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15 vault_unseal_breakglass_timeout_seconds: 15
shutdown: shutdown:
@ -103,7 +135,7 @@ ups:
coordination: coordination:
forward_shutdown_host: "" forward_shutdown_host: ""
forward_shutdown_user: atlas forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml forward_shutdown_config: /etc/ananke/ananke.yaml
peer_hosts: [] peer_hosts: []
fallback_local_shutdown: true fallback_local_shutdown: true
command_timeout_seconds: 25 command_timeout_seconds: 25
@ -115,7 +147,7 @@ metrics:
bind_addr: 0.0.0.0:9560 bind_addr: 0.0.0.0:9560
path: /metrics path: /metrics
state: state:
dir: /var/lib/hecate dir: /var/lib/ananke
run_history_path: /var/lib/hecate/runs.json run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/hecate/hecate.lock lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/hecate/intent.json intent_path: /var/lib/ananke/intent.json

View File

@ -1,5 +1,5 @@
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder) # /etc/ananke/ananke.yaml for titan-24 (tethys forwarder)
kubeconfig: /etc/hecate/kubeconfig kubeconfig: /etc/ananke/kubeconfig
ssh_user: atlas ssh_user: atlas
ssh_port: 2277 ssh_port: 2277
ssh_config_file: /home/tethys/.ssh/config ssh_config_file: /home/tethys/.ssh/config
@ -58,6 +58,7 @@ ssh_jump_host: ""
ssh_jump_user: "" ssh_jump_user: ""
iac_repo_path: /opt/titan-iac iac_repo_path: /opt/titan-iac
expected_flux_branch: main expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
control_planes: control_planes:
- titan-0a - titan-0a
- titan-0b - titan-0b
@ -112,6 +113,10 @@ startup:
api_wait_seconds: 1200 api_wait_seconds: 1200
api_poll_seconds: 2 api_poll_seconds: 2
shutdown_cooldown_seconds: 45 shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true require_time_sync: true
time_sync_wait_seconds: 240 time_sync_wait_seconds: 240
time_sync_poll_seconds: 5 time_sync_poll_seconds: 5
@ -133,10 +138,37 @@ startup:
post_start_probe_wait_seconds: 240 post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5 post_start_probe_poll_seconds: 5
post_start_probes: post_start_probes:
- https://scm.bstein.dev/user/login - https://scm.bstein.dev/api/healthz
- https://metrics.bstein.dev/login - https://metrics.bstein.dev/api/health
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key require_service_checklist: true
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'" service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
- name: grafana-api
url: https://metrics.bstein.dev/api/health
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
stuck_pod_grace_seconds: 180
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15 vault_unseal_breakglass_timeout_seconds: 15
shutdown: shutdown:
default_budget_seconds: 1380 default_budget_seconds: 1380
@ -167,7 +199,7 @@ ups:
coordination: coordination:
forward_shutdown_host: titan-db forward_shutdown_host: titan-db
forward_shutdown_user: atlas forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml forward_shutdown_config: /etc/ananke/ananke.yaml
peer_hosts: peer_hosts:
- titan-db - titan-db
fallback_local_shutdown: false fallback_local_shutdown: false
@ -180,7 +212,7 @@ metrics:
bind_addr: 0.0.0.0:9560 bind_addr: 0.0.0.0:9560
path: /metrics path: /metrics
state: state:
dir: /var/lib/hecate dir: /var/lib/ananke
run_history_path: /var/lib/hecate/runs.json run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/hecate/hecate.lock lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/hecate/intent.json intent_path: /var/lib/ananke/intent.json

View File

@ -1,5 +1,5 @@
# /etc/hecate/hecate.yaml for titan-db (coordinator) # /etc/ananke/ananke.yaml for titan-db (coordinator)
kubeconfig: /etc/hecate/kubeconfig kubeconfig: /etc/ananke/kubeconfig
ssh_user: atlas ssh_user: atlas
ssh_port: 2277 ssh_port: 2277
ssh_config_file: /home/atlas/.ssh/config ssh_config_file: /home/atlas/.ssh/config
@ -58,6 +58,7 @@ ssh_jump_host: ""
ssh_jump_user: "" ssh_jump_user: ""
iac_repo_path: /opt/titan-iac iac_repo_path: /opt/titan-iac
expected_flux_branch: main expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
control_planes: control_planes:
- titan-0a - titan-0a
- titan-0b - titan-0b
@ -112,6 +113,10 @@ startup:
api_wait_seconds: 1200 api_wait_seconds: 1200
api_poll_seconds: 2 api_poll_seconds: 2
shutdown_cooldown_seconds: 45 shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true require_time_sync: true
time_sync_wait_seconds: 240 time_sync_wait_seconds: 240
time_sync_poll_seconds: 5 time_sync_poll_seconds: 5
@ -133,10 +138,37 @@ startup:
post_start_probe_wait_seconds: 240 post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5 post_start_probe_poll_seconds: 5
post_start_probes: post_start_probes:
- https://scm.bstein.dev/user/login - https://scm.bstein.dev/api/healthz
- https://metrics.bstein.dev/login - https://metrics.bstein.dev/api/health
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key require_service_checklist: true
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'" service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
- name: grafana-api
url: https://metrics.bstein.dev/api/health
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
stuck_pod_grace_seconds: 180
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15 vault_unseal_breakglass_timeout_seconds: 15
shutdown: shutdown:
default_budget_seconds: 1380 default_budget_seconds: 1380
@ -168,7 +200,7 @@ ups:
coordination: coordination:
forward_shutdown_host: "" forward_shutdown_host: ""
forward_shutdown_user: atlas forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml forward_shutdown_config: /etc/ananke/ananke.yaml
peer_hosts: peer_hosts:
- titan-24 - titan-24
fallback_local_shutdown: true fallback_local_shutdown: true
@ -181,7 +213,7 @@ metrics:
bind_addr: 0.0.0.0:9560 bind_addr: 0.0.0.0:9560
path: /metrics path: /metrics
state: state:
dir: /var/lib/hecate dir: /var/lib/ananke
run_history_path: /var/lib/hecate/runs.json run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/hecate/hecate.lock lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/hecate/intent.json intent_path: /var/lib/ananke/intent.json

View File

@ -1,15 +1,15 @@
[Unit] [Unit]
Description=Hecate Staged Cluster Bootstrap Description=Ananke Staged Cluster Bootstrap
Wants=network-online.target Wants=network-online.target
After=network-online.target After=network-online.target
ConditionPathExists=/etc/hecate/hecate.yaml ConditionPathExists=/etc/ananke/ananke.yaml
StartLimitIntervalSec=0 StartLimitIntervalSec=0
[Service] [Service]
Type=oneshot Type=oneshot
User=root User=root
Group=root Group=root
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180 ExecStart=/usr/local/bin/ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
Restart=on-failure Restart=on-failure
RestartSec=30 RestartSec=30
TimeoutStartSec=1800 TimeoutStartSec=1800

View File

@ -1,5 +1,5 @@
[Unit] [Unit]
Description=Hecate Self-Update and Reinstall Description=Ananke Self-Update and Reinstall
Wants=network-online.target Wants=network-online.target
After=network-online.target After=network-online.target
@ -7,6 +7,7 @@ After=network-online.target
Type=oneshot Type=oneshot
User=root User=root
Group=root Group=root
ExecStart=/usr/local/lib/hecate/hecate-self-update.sh ExecStart=/usr/local/lib/ananke/ananke-self-update.sh
TimeoutStartSec=1800 TimeoutStartSec=1800
[Install]

View File

@ -1,12 +1,11 @@
[Unit] [Unit]
Description=Periodic Hecate Self-Update Timer Description=Periodic Ananke Self-Update Timer
[Timer] [Timer]
OnBootSec=2m OnBootSec=2m
OnUnitActiveSec=6h OnUnitActiveSec=6h
Unit=hecate-update.service Unit=ananke-update.service
Persistent=true Persistent=true
[Install] [Install]
WantedBy=timers.target WantedBy=timers.target

View File

@ -1,14 +1,14 @@
[Unit] [Unit]
Description=Hecate UPS Monitor and Auto Shutdown Orchestrator Description=Ananke UPS Monitor and Auto Shutdown Orchestrator
Wants=network-online.target Wants=network-online.target
After=network-online.target After=network-online.target
ConditionPathExists=/etc/hecate/hecate.yaml ConditionPathExists=/etc/ananke/ananke.yaml
[Service] [Service]
Type=simple Type=simple
User=root User=root
Group=root Group=root
ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml ExecStart=/usr/local/bin/ananke daemon --config /etc/ananke/ananke.yaml
Restart=on-failure Restart=on-failure
RestartSec=5 RestartSec=5
NoNewPrivileges=true NoNewPrivileges=true

2
go.mod
View File

@ -1,4 +1,4 @@
module scm.bstein.dev/bstein/hecate module scm.bstein.dev/bstein/ananke
go 1.25 go 1.25

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,17 @@
package cluster package cluster
import ( import (
"context"
"log" "log"
"net/http"
"net/http/httptest"
"os" "os"
"reflect" "reflect"
"testing" "testing"
"time" "time"
"scm.bstein.dev/bstein/hecate/internal/config" "scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/hecate/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
func TestParseVaultSealed(t *testing.T) { func TestParseVaultSealed(t *testing.T) {
@ -117,3 +120,75 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want) t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want)
} }
} }
// TestWorkloadTargetsIgnoredNodesByNodeSelector verifies that a pod spec whose
// nodeSelector pins it to a hostname in the ignored set is reported as
// targeting an ignored node.
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
	const node = "titan-22"

	pinned := podSpec{
		NodeSelector: map[string]string{"kubernetes.io/hostname": node},
	}
	skipped := map[string]struct{}{node: {}}

	if workloadTargetsIgnoredNodes(pinned, skipped) {
		return
	}
	t.Fatalf("expected workload to target ignored node via nodeSelector")
}
// TestParseWorkloadIgnoreRules parses one namespace/name rule and one
// namespace/kind/name rule, then checks which workloads the parsed rules
// report as ignored.
func TestParseWorkloadIgnoreRules(t *testing.T) {
	parsed := parseWorkloadIgnoreRules([]string{
		"maintenance/metis",
		"crypto/statefulset/monerod",
	})
	if count := len(parsed); count != 2 {
		t.Fatalf("expected 2 ignore rules, got %d", count)
	}

	// Cases run in order and stop at the first mismatch.
	cases := []struct {
		namespace, kind, name string
		want                  bool
		failure               string
	}{
		{"maintenance", "deployment", "metis", true, "expected namespace/name rule to match"},
		{"crypto", "statefulset", "monerod", true, "expected namespace/kind/name rule to match"},
		{"crypto", "deployment", "monerod", false, "did not expect mismatched kind to match"},
	}
	for _, tc := range cases {
		if workloadIgnored(parsed, tc.namespace, tc.kind, tc.name) != tc.want {
			t.Fatal(tc.failure)
		}
	}
}
// TestNamespaceCandidatesFromIgnoreKustomizations checks that each ignored
// "flux-system/<name>" kustomization yields <name> as a key in the returned
// candidate set.
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
	candidates := namespaceCandidatesFromIgnoreKustomizations([]string{
		"flux-system/jellyfin",
		"flux-system/outline",
	})

	expectations := []struct {
		namespace string
		failure   string
	}{
		{"jellyfin", "expected jellyfin namespace candidate"},
		{"outline", "expected outline namespace candidate"},
	}
	for _, want := range expectations {
		if _, present := candidates[want.namespace]; !present {
			t.Fatal(want.failure)
		}
	}
}
// TestProbeStatusAcceptedRejects404 ensures a 404 response from a probe URL is
// not treated as an accepted status.
func TestProbeStatusAcceptedRejects404(t *testing.T) {
	const (
		probeURL = "https://metrics.bstein.dev/login"
		status   = 404
	)
	if accepted := probeStatusAccepted(probeURL, status); accepted {
		t.Fatalf("expected 404 probe status to be rejected")
	}
}
// TestServiceCheckReadyRequiresBodyContains runs a checklist entry against a
// local test server that answers 200 with a body containing the expected
// marker, and expects the check to be reported as ready.
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
	respond := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"database":"ok"}`))
	}
	server := httptest.NewServer(http.HandlerFunc(respond))
	defer server.Close()

	o := &Orchestrator{log: log.New(os.Stdout, "", 0)}
	check := config.ServiceChecklistCheck{
		Name:             "grafana-api",
		URL:              server.URL,
		AcceptedStatuses: []int{200},
		BodyContains:     `"database":"ok"`,
		TimeoutSeconds:   5,
	}

	ready, detail := o.serviceCheckReady(context.Background(), check)
	if ready {
		return
	}
	t.Fatalf("expected service check to pass, detail=%s", detail)
}

View File

@ -2,6 +2,7 @@ package config
import ( import (
"fmt" "fmt"
neturl "net/url"
"os" "os"
"strings" "strings"
@ -21,6 +22,7 @@ type Config struct {
SSHJumpUser string `yaml:"ssh_jump_user"` SSHJumpUser string `yaml:"ssh_jump_user"`
IACRepoPath string `yaml:"iac_repo_path"` IACRepoPath string `yaml:"iac_repo_path"`
ExpectedFluxBranch string `yaml:"expected_flux_branch"` ExpectedFluxBranch string `yaml:"expected_flux_branch"`
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
ControlPlanes []string `yaml:"control_planes"` ControlPlanes []string `yaml:"control_planes"`
Workers []string `yaml:"workers"` Workers []string `yaml:"workers"`
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"` LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
@ -34,29 +36,58 @@ type Config struct {
} }
type Startup struct { type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"` APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"` APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
RequireTimeSync bool `yaml:"require_time_sync"` MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncMode string `yaml:"time_sync_mode"` TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncQuorum int `yaml:"time_sync_quorum"` TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` TimeSyncMode string `yaml:"time_sync_mode"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` TimeSyncQuorum int `yaml:"time_sync_quorum"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
RequireStorageReady bool `yaml:"require_storage_ready"` AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` RequireStorageReady bool `yaml:"require_storage_ready"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"` StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbes []string `yaml:"post_start_probes"` PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` PostStartProbes []string `yaml:"post_start_probes"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
type ServiceChecklistCheck struct {
Name string `yaml:"name"`
URL string `yaml:"url"`
AcceptedStatuses []int `yaml:"accepted_statuses"`
BodyContains string `yaml:"body_contains"`
BodyNotContains string `yaml:"body_not_contains"`
TimeoutSeconds int `yaml:"timeout_seconds"`
InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
} }
type Shutdown struct { type Shutdown struct {
@ -143,6 +174,9 @@ func (c Config) Validate() error {
if c.ExpectedFluxBranch == "" { if c.ExpectedFluxBranch == "" {
return fmt.Errorf("config.expected_flux_branch must not be empty") return fmt.Errorf("config.expected_flux_branch must not be empty")
} }
if c.ExpectedFluxSource == "" {
return fmt.Errorf("config.expected_flux_source_url must not be empty")
}
if c.IACRepoPath == "" { if c.IACRepoPath == "" {
return fmt.Errorf("config.iac_repo_path must not be empty") return fmt.Errorf("config.iac_repo_path must not be empty")
} }
@ -176,6 +210,25 @@ func (c Config) Validate() error {
if c.Startup.ShutdownCooldownSeconds <= 0 { if c.Startup.ShutdownCooldownSeconds <= 0 {
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0") return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
} }
if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 {
return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100")
}
for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
}
if len(labels) == 0 {
return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node)
}
for key, value := range labels {
if strings.TrimSpace(key) == "" {
return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node)
}
if strings.TrimSpace(value) == "" {
return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key)
}
}
}
if c.Startup.TimeSyncWaitSeconds <= 0 { if c.Startup.TimeSyncWaitSeconds <= 0 {
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0") return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
} }
@ -223,11 +276,88 @@ func (c Config) Validate() error {
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 { if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true") return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
} }
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0")
}
if c.Startup.ServiceChecklistPollSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0")
}
if c.Startup.ServiceChecklistStabilitySec < 0 {
return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0")
}
if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
}
for i, check := range c.Startup.ServiceChecklist {
if strings.TrimSpace(check.Name) == "" {
return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
}
rawURL := strings.TrimSpace(check.URL)
if rawURL == "" {
return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i)
}
parsed, err := neturl.Parse(rawURL)
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL)
}
if check.TimeoutSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
}
for _, code := range check.AcceptedStatuses {
if code < 100 || code > 599 {
return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)
}
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
}
if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
}
if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
}
for _, probe := range c.Startup.PostStartProbes { for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" { if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty") return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
} }
} }
for _, item := range c.Startup.IgnoreFluxKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item)
}
}
for _, item := range c.Startup.IgnoreWorkloads {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.ignore_workloads entries must not be empty")
}
parts := strings.Split(item, "/")
if len(parts) != 2 && len(parts) != 3 {
return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item)
}
}
for _, ns := range c.Startup.IgnoreWorkloadNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
}
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty") return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
} }
@ -276,6 +406,7 @@ func defaults() Config {
c := Config{ c := Config{
IACRepoPath: "/opt/titan-iac", IACRepoPath: "/opt/titan-iac",
ExpectedFluxBranch: "main", ExpectedFluxBranch: "main",
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
SSHPort: 2277, SSHPort: 2277,
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
LocalBootstrapPaths: []string{ LocalBootstrapPaths: []string{
@ -328,16 +459,54 @@ func defaults() Config {
"gitea/gitea-data", "gitea/gitea-data",
"sso/keycloak-data", "sso/keycloak-data",
}, },
MinimumBatteryPercent: 20,
RequiredNodeLabels: map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
},
RequirePostStartProbes: true, RequirePostStartProbes: true,
PostStartProbeWaitSeconds: 240, PostStartProbeWaitSeconds: 240,
PostStartProbePollSeconds: 5, PostStartProbePollSeconds: 5,
PostStartProbes: []string{ PostStartProbes: []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/user/login", "https://scm.bstein.dev/api/healthz",
"https://metrics.bstein.dev/login", "https://metrics.bstein.dev/api/health",
}, },
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key", RequireServiceChecklist: true,
VaultUnsealBreakglassTimeout: 15, ServiceChecklistWaitSeconds: 420,
ServiceChecklistPollSeconds: 5,
ServiceChecklistStabilitySec: 120,
ServiceChecklist: []ServiceChecklistCheck{
{
Name: "gitea-api",
URL: "https://scm.bstein.dev/api/healthz",
AcceptedStatuses: []int{200},
BodyContains: "pass",
TimeoutSeconds: 12,
},
{
Name: "grafana-api",
URL: "https://metrics.bstein.dev/api/health",
AcceptedStatuses: []int{200},
BodyContains: "\"database\":\"ok\"",
TimeoutSeconds: 12,
},
},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
}, },
Shutdown: Shutdown{ Shutdown: Shutdown{
DefaultBudgetSeconds: 1380, DefaultBudgetSeconds: 1380,
@ -362,7 +531,7 @@ func defaults() Config {
TelemetryTimeoutSeconds: 90, TelemetryTimeoutSeconds: 90,
}, },
Coordination: Coordination{ Coordination: Coordination{
ForwardShutdownConfig: "/etc/hecate/hecate.yaml", ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
PeerHosts: []string{}, PeerHosts: []string{},
FallbackLocalShutdown: true, FallbackLocalShutdown: true,
CommandTimeoutSeconds: 25, CommandTimeoutSeconds: 25,
@ -376,10 +545,10 @@ func defaults() Config {
Path: "/metrics", Path: "/metrics",
}, },
State: State{ State: State{
Dir: "/var/lib/hecate", Dir: "/var/lib/ananke",
RunHistoryPath: "/var/lib/hecate/runs.json", RunHistoryPath: "/var/lib/ananke/runs.json",
LockPath: "/var/lib/hecate/hecate.lock", LockPath: "/var/lib/ananke/ananke.lock",
IntentPath: "/var/lib/hecate/intent.json", IntentPath: "/var/lib/ananke/intent.json",
}, },
} }
c.applyDefaults() c.applyDefaults()
@ -393,6 +562,9 @@ func (c *Config) applyDefaults() {
if c.IACRepoPath == "" { if c.IACRepoPath == "" {
c.IACRepoPath = "/opt/titan-iac" c.IACRepoPath = "/opt/titan-iac"
} }
if c.ExpectedFluxSource == "" {
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
}
if c.Startup.APIWaitSeconds <= 0 { if c.Startup.APIWaitSeconds <= 0 {
c.Startup.APIWaitSeconds = 1200 c.Startup.APIWaitSeconds = 1200
} }
@ -402,6 +574,16 @@ func (c *Config) applyDefaults() {
if c.Startup.ShutdownCooldownSeconds <= 0 { if c.Startup.ShutdownCooldownSeconds <= 0 {
c.Startup.ShutdownCooldownSeconds = 45 c.Startup.ShutdownCooldownSeconds = 45
} }
if c.Startup.MinimumBatteryPercent <= 0 {
c.Startup.MinimumBatteryPercent = 20
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
}
if c.Startup.TimeSyncWaitSeconds <= 0 { if c.Startup.TimeSyncWaitSeconds <= 0 {
c.Startup.TimeSyncWaitSeconds = 240 c.Startup.TimeSyncWaitSeconds = 240
} }
@ -446,12 +628,71 @@ func (c *Config) applyDefaults() {
if len(c.Startup.PostStartProbes) == 0 { if len(c.Startup.PostStartProbes) == 0 {
c.Startup.PostStartProbes = []string{ c.Startup.PostStartProbes = []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/user/login", "https://scm.bstein.dev/api/healthz",
"https://metrics.bstein.dev/login", "https://metrics.bstein.dev/api/health",
} }
} }
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
c.Startup.ServiceChecklistWaitSeconds = 420
}
if c.Startup.ServiceChecklistPollSeconds <= 0 {
c.Startup.ServiceChecklistPollSeconds = 5
}
if c.Startup.ServiceChecklistStabilitySec < 0 {
c.Startup.ServiceChecklistStabilitySec = 0
}
if len(c.Startup.ServiceChecklist) == 0 {
c.Startup.ServiceChecklist = []ServiceChecklistCheck{
{
Name: "gitea-api",
URL: "https://scm.bstein.dev/api/healthz",
AcceptedStatuses: []int{200},
BodyContains: "pass",
TimeoutSeconds: 12,
},
{
Name: "grafana-api",
URL: "https://metrics.bstein.dev/api/health",
AcceptedStatuses: []int{200},
BodyContains: "\"database\":\"ok\"",
TimeoutSeconds: 12,
},
}
}
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
c.Startup.WorkloadConvergenceWaitSeconds = 900
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
if c.Startup.IgnoreWorkloads == nil {
c.Startup.IgnoreWorkloads = []string{}
}
if c.Startup.IgnoreUnavailableNodes == nil {
c.Startup.IgnoreUnavailableNodes = []string{}
}
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key" c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
} }
if c.Startup.VaultUnsealBreakglassTimeout <= 0 { if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
c.Startup.VaultUnsealBreakglassTimeout = 15 c.Startup.VaultUnsealBreakglassTimeout = 15
@ -496,7 +737,7 @@ func (c *Config) applyDefaults() {
c.UPS.TelemetryTimeoutSeconds = 90 c.UPS.TelemetryTimeoutSeconds = 90
} }
if c.Coordination.ForwardShutdownConfig == "" { if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml" c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
} }
if c.Coordination.PeerHosts == nil { if c.Coordination.PeerHosts == nil {
c.Coordination.PeerHosts = []string{} c.Coordination.PeerHosts = []string{}
@ -517,15 +758,15 @@ func (c *Config) applyDefaults() {
c.Metrics.Path = "/metrics" c.Metrics.Path = "/metrics"
} }
if c.State.Dir == "" { if c.State.Dir == "" {
c.State.Dir = "/var/lib/hecate" c.State.Dir = "/var/lib/ananke"
} }
if c.State.RunHistoryPath == "" { if c.State.RunHistoryPath == "" {
c.State.RunHistoryPath = "/var/lib/hecate/runs.json" c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
} }
if c.State.LockPath == "" { if c.State.LockPath == "" {
c.State.LockPath = "/var/lib/hecate/hecate.lock" c.State.LockPath = "/var/lib/ananke/ananke.lock"
} }
if c.State.IntentPath == "" { if c.State.IntentPath == "" {
c.State.IntentPath = "/var/lib/hecate/intent.json" c.State.IntentPath = "/var/lib/ananke/intent.json"
} }
} }

View File

@ -9,7 +9,7 @@ import (
func TestLoadAcceptsUPSTargets(t *testing.T) { func TestLoadAcceptsUPSTargets(t *testing.T) {
tmp := t.TempDir() tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "hecate.yaml") cfgPath := filepath.Join(tmp, "ananke.yaml")
raw := ` raw := `
control_planes: [titan-0a, titan-0b, titan-0c] control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main expected_flux_branch: main
@ -24,7 +24,7 @@ shutdown:
default_budget_seconds: 300 default_budget_seconds: 300
state: state:
run_history_path: /tmp/runs.json run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock lock_path: /tmp/ananke.lock
` `
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil { if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
t.Fatalf("write config: %v", err) t.Fatalf("write config: %v", err)
@ -74,7 +74,7 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) { func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
tmp := t.TempDir() tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "hecate.yaml") cfgPath := filepath.Join(tmp, "ananke.yaml")
raw := ` raw := `
control_planes: [titan-0a, titan-0b, titan-0c] control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main expected_flux_branch: main
@ -85,7 +85,7 @@ ups:
enabled: false enabled: false
state: state:
run_history_path: /tmp/runs.json run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock lock_path: /tmp/ananke.lock
` `
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil { if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
t.Fatalf("write config: %v", err) t.Fatalf("write config: %v", err)
@ -146,3 +146,55 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
t.Fatalf("expected validation error when post start probes are required but empty") t.Fatalf("expected validation error when post start probes are required but empty")
} }
} }
// TestValidateRejectsMissingServiceChecklistWhenRequired clears the service
// checklist on an otherwise default config while keeping the checklist
// requirement enabled, and expects Validate to return an error.
func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
	cfg := defaults()
	cfg.Startup.ServiceChecklist = nil
	cfg.Startup.RequireServiceChecklist = true

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error when service checklist is required but empty")
}
func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
cfg := defaults()
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
{
Name: "grafana",
URL: "not-a-url",
AcceptedStatuses: []int{200},
TimeoutSeconds: 12,
},
}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected validation error for invalid service checklist url")
}
}
// TestValidateRejectsBadIgnoreFluxKustomizationFormat supplies an
// ignore_flux_kustomizations entry without the "namespace/name" separator and
// expects Validate to return an error.
func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
	cfg := defaults()
	// A bare name with no "/" is not in namespace/name form.
	cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid ignore_flux_kustomizations entry")
}
// TestValidateRejectsBadIgnoreWorkloadFormat supplies an ignore_workloads
// entry with four slash-separated segments and expects Validate to return an
// error, since only namespace/name and namespace/kind/name are allowed.
func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
	const tooManySegments = "maintenance/metis/extra/value"

	cfg := defaults()
	cfg.Startup.IgnoreWorkloads = []string{tooManySegments}

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid ignore_workloads entry")
}
// TestValidateRejectsInvalidRequiredNodeLabel configures a required node
// label whose key is the empty string and expects Validate to return an error.
func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
	// The node name and label value are valid; only the label key is empty.
	labelsWithEmptyKey := map[string]string{"": "true"}

	cfg := defaults()
	cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
		"titan-09": labelsWithEmptyKey,
	}

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid required_node_labels entry")
}

View File

@ -84,41 +84,41 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
var b strings.Builder var b strings.Builder
b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n") b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n") b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n")
b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)) b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n") b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n") b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n")
b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers)) b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers))
b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n") b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n") b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n")
if e.lastShutdownAt.IsZero() { if e.lastShutdownAt.IsZero() {
b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n") b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n")
} else { } else {
b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix())) b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
} }
b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n") b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n")
b.WriteString("# TYPE hecate_ups_on_battery gauge\n") b.WriteString("# TYPE ananke_ups_on_battery gauge\n")
b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n") b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n")
b.WriteString("# TYPE hecate_ups_low_battery gauge\n") b.WriteString("# TYPE ananke_ups_low_battery gauge\n")
b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n") b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n") b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n")
b.WriteString("# HELP hecate_ups_battery_charge_percent Battery charge percentage reported by UPS.\n") b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
b.WriteString("# TYPE hecate_ups_battery_charge_percent gauge\n") b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n")
b.WriteString("# HELP hecate_ups_load_percent UPS output load percentage.\n") b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n")
b.WriteString("# TYPE hecate_ups_load_percent gauge\n") b.WriteString("# TYPE ananke_ups_load_percent gauge\n")
b.WriteString("# HELP hecate_ups_power_nominal_watts UPS nominal power rating in watts.\n") b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n")
b.WriteString("# TYPE hecate_ups_power_nominal_watts gauge\n") b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n")
b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n") b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n") b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n")
b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n") b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
b.WriteString("# TYPE hecate_ups_trigger_active gauge\n") b.WriteString("# TYPE ananke_ups_trigger_active gauge\n")
b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n") b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n")
b.WriteString("# TYPE hecate_ups_breach_count gauge\n") b.WriteString("# TYPE ananke_ups_breach_count gauge\n")
b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n") b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n") b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n")
b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n") b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n")
b.WriteString("# TYPE hecate_ups_error gauge\n") b.WriteString("# TYPE ananke_ups_error gauge\n")
names := make([]string, 0, len(e.samples)) names := make([]string, 0, len(e.samples))
for name := range e.samples { for name := range e.samples {
@ -129,21 +129,21 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
s := e.samples[name] s := e.samples[name]
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}", labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason)) safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))) b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))) b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)) b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
b.WriteString(fmt.Sprintf("hecate_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge)) b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
b.WriteString(fmt.Sprintf("hecate_ups_load_percent%s %.2f\n", labels, s.LoadPercent)) b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
b.WriteString(fmt.Sprintf("hecate_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW)) b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)) b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))) b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount)) b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, s.BreachCount))
if s.UpdatedAt.IsZero() { if s.UpdatedAt.IsZero() {
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels)) b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels))
} else { } else {
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix())) b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
} }
b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != ""))) b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
} }
_, _ = w.Write([]byte(b.String())) _, _ = w.Write([]byte(b.String()))

View File

@ -33,14 +33,14 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
body := rr.Body.String() body := rr.Body.String()
mustContain := []string{ mustContain := []string{
"hecate_shutdown_budget_seconds 321", "ananke_shutdown_budget_seconds 321",
"hecate_shutdown_triggers_total 1", "ananke_shutdown_triggers_total 1",
"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", "ananke_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", "ananke_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", "ananke_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", "ananke_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", "ananke_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"", "ananke_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
} }
for _, m := range mustContain { for _, m := range mustContain {
if !strings.Contains(body, m) { if !strings.Contains(body, m) {

View File

@ -12,12 +12,12 @@ import (
"strings" "strings"
"time" "time"
"scm.bstein.dev/bstein/hecate/internal/cluster" "scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/hecate/internal/config" "scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/hecate/internal/metrics" "scm.bstein.dev/bstein/ananke/internal/metrics"
"scm.bstein.dev/bstein/hecate/internal/sshutil" "scm.bstein.dev/bstein/ananke/internal/sshutil"
"scm.bstein.dev/bstein/hecate/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
"scm.bstein.dev/bstein/hecate/internal/ups" "scm.bstein.dev/bstein/ananke/internal/ups"
) )
type Target struct { type Target struct {
@ -81,7 +81,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood[t.Name] = time.Now() lastGood[t.Name] = time.Now()
} }
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s", d.log.Printf("ananke daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
poll, debounce, telemetryTimeout, d.targetList()) poll, debounce, telemetryTimeout, d.targetList())
for { for {
@ -198,7 +198,7 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
runCtx, cancel := context.WithTimeout(ctx, timeout) runCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel() defer cancel()
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason) remoteCmd := fmt.Sprintf("sudo /usr/local/bin/ananke shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
if d.cfg.Shutdown.EmergencySkipEtcd { if d.cfg.Shutdown.EmergencySkipEtcd {
remoteCmd += " --skip-etcd-snapshot" remoteCmd += " --skip-etcd-snapshot"
} }

View File

@ -6,7 +6,7 @@ import (
"time" "time"
) )
// ParseIntentOutput parses `hecate intent` CLI output from local/remote commands. // ParseIntentOutput parses `ananke intent` CLI output from local/remote commands.
func ParseIntentOutput(raw string) (Intent, error) { func ParseIntentOutput(raw string) (Intent, error) {
for _, line := range strings.Split(raw, "\n") { for _, line := range strings.Split(raw, "\n") {
line = strings.TrimSpace(line) line = strings.TrimSpace(line)

View File

@ -61,7 +61,7 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
} }
func TestParseIntentOutputParsesStructuredLine(t *testing.T) { func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
raw := `[hecate] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z` raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
in, err := ParseIntentOutput(raw) in, err := ParseIntentOutput(raw)
if err != nil { if err != nil {
t.Fatalf("parse intent output: %v", err) t.Fatalf("parse intent output: %v", err)
@ -81,7 +81,7 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
} }
func TestParseIntentOutputHandlesNone(t *testing.T) { func TestParseIntentOutputHandlesNone(t *testing.T) {
in, err := ParseIntentOutput(`[hecate] 2026/04/05 11:24:49 intent=none`) in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`)
if err != nil { if err != nil {
t.Fatalf("parse none intent output: %v", err) t.Fatalf("parse none intent output: %v", err)
} }

View File

@ -11,7 +11,7 @@ import (
) )
func TestAcquireLockLifecycle(t *testing.T) { func TestAcquireLockLifecycle(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "hecate.lock") lockPath := filepath.Join(t.TempDir(), "ananke.lock")
unlock, err := AcquireLock(lockPath) unlock, err := AcquireLock(lockPath)
if err != nil { if err != nil {
t.Fatalf("acquire lock: %v", err) t.Fatalf("acquire lock: %v", err)
@ -26,7 +26,7 @@ func TestAcquireLockLifecycle(t *testing.T) {
} }
func TestAcquireLockReclaimsStaleLock(t *testing.T) { func TestAcquireLockReclaimsStaleLock(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "hecate.lock") lockPath := filepath.Join(t.TempDir(), "ananke.lock")
if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil { if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
t.Fatalf("write stale lock: %v", err) t.Fatalf("write stale lock: %v", err)
} }
@ -47,7 +47,7 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) {
} }
func TestAcquireLockRejectsActiveLock(t *testing.T) { func TestAcquireLockRejectsActiveLock(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "hecate.lock") lockPath := filepath.Join(t.TempDir(), "ananke.lock")
active := "pid=" + strconv.Itoa(os.Getpid()) + "\n" active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil { if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil {
t.Fatalf("write active lock: %v", err) t.Fatalf("write active lock: %v", err)

View File

@ -3,7 +3,7 @@ package ups
import "testing" import "testing"
func TestParseNUT(t *testing.T) { func TestParseNUT(t *testing.T) {
raw := `battery.runtime: 384 raw := `battery.runtime: 384
battery.charge: 72 battery.charge: 72
ups.load: 19 ups.load: 19
ups.realpower.nominal: 510 ups.realpower.nominal: 510

View File

@ -2,23 +2,23 @@
set -Eeuo pipefail set -Eeuo pipefail
KUBECTL="${KUBECTL:-kubectl}" KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}" ANANKE_COORDINATOR_HOST="${ANANKE_COORDINATOR_HOST:-titan-db}"
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}" ANANKE_BIN="${ANANKE_BIN:-/usr/local/bin/ananke}"
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}" ANANKE_CONFIG="${ANANKE_CONFIG:-/etc/ananke/ananke.yaml}"
HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}" ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}" LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}" STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}" SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}" SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}" STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}" STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
EXECUTE=0 EXECUTE=0
usage() { usage() {
cat <<'EOF' cat <<'EOF'
Usage: Usage:
scripts/hecate-drills.sh list scripts/ananke-drills.sh list
scripts/hecate-drills.sh run <drill-name> [--execute] scripts/ananke-drills.sh run <drill-name> [--execute]
Drills: Drills:
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery. flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
@ -30,7 +30,7 @@ Drills:
Notes: Notes:
- Drills are intentionally disruptive and are not part of regular `make test`. - Drills are intentionally disruptive and are not part of regular `make test`.
- Use --execute to run live changes. Without it, this script prints planned actions only. - Use --execute to run live changes. Without it, this script prints planned actions only.
- Optional relay: set HECATE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host. - Optional relay: set ANANKE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
EOF EOF
} }
@ -98,47 +98,47 @@ wait_ready_keycloak() {
die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)" die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
} }
run_hecate_startup() { run_ananke_startup() {
local reason="$1" local reason="$1"
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}") local cmd=(sudo "${ANANKE_BIN}" startup --config "${ANANKE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
if [[ "${EXECUTE}" -eq 0 ]]; then if [[ "${EXECUTE}" -eq 0 ]]; then
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'" log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
else else
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'" log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
fi fi
return 0 return 0
fi fi
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
# shellcheck disable=SC2086 # shellcheck disable=SC2086
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}" timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
else else
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}" timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
fi fi
} }
run_hecate_shutdown() { run_ananke_shutdown() {
local reason="$1" local reason="$1"
local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}") local cmd=(sudo "${ANANKE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
if [[ "${EXECUTE}" -eq 0 ]]; then if [[ "${EXECUTE}" -eq 0 ]]; then
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'" log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
else else
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'" log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
fi fi
return 0 return 0
fi fi
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
# shellcheck disable=SC2086 # shellcheck disable=SC2086
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}" timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
else else
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}" timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
fi fi
} }
run_hecate_startup_with_retry() { run_ananke_startup_with_retry() {
local reason="$1" local reason="$1"
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason ${reason}" local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
if [[ "${EXECUTE}" -eq 0 ]]; then if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s" log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
@ -161,11 +161,11 @@ run_hecate_startup_with_retry() {
run_coordinator_bash() { run_coordinator_bash() {
local script="$1" local script="$1"
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
# shellcheck disable=SC2086 # shellcheck disable=SC2086
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "bash -se" printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "bash -se"
else else
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" "bash -se" printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" "bash -se"
fi fi
} }
@ -283,7 +283,7 @@ write_log_header() {
mkdir -p "${LOG_DIR}" mkdir -p "${LOG_DIR}"
local f="${LOG_DIR}/${drill}-$(now_ts).log" local f="${LOG_DIR}/${drill}-$(now_ts).log"
exec > >(tee -a "${f}") 2>&1 exec > >(tee -a "${f}") 2>&1
log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}" log "drill=${drill} execute=${EXECUTE} coordinator=${ANANKE_COORDINATOR_HOST}"
} }
run_drill_flux_gitea_deadlock() { run_drill_flux_gitea_deadlock() {
@ -303,7 +303,7 @@ run_drill_flux_gitea_deadlock() {
scale_to "$ns" "$kind" "$name" 0 scale_to "$ns" "$kind" "$name" 0
done done
run_hecate_startup "drill-flux-gitea-deadlock" run_ananke_startup "drill-flux-gitea-deadlock"
log "verifying recovery" log "verifying recovery"
wait_ready flux-system deployment source-controller 240s wait_ready flux-system deployment source-controller 240s
@ -330,7 +330,7 @@ run_drill_foundation_recovery() {
scale_to "$ns" "$kind" "$name" 0 scale_to "$ns" "$kind" "$name" 0
done done
run_hecate_startup "drill-foundation-recovery" run_ananke_startup "drill-foundation-recovery"
log "verifying layered recovery" log "verifying layered recovery"
wait_ready vault statefulset vault 420s wait_ready vault statefulset vault 420s
@ -350,7 +350,7 @@ run_drill_reconciliation_resume() {
set_flux_suspend_all true set_flux_suspend_all true
scale_to flux-system deployment source-controller 0 scale_to flux-system deployment source-controller 0
run_hecate_startup "drill-reconciliation-resume" run_ananke_startup "drill-reconciliation-resume"
log "verifying reconciliation resumed" log "verifying reconciliation resumed"
wait_ready flux-system deployment source-controller 240s wait_ready flux-system deployment source-controller 240s
@ -361,8 +361,8 @@ run_drill_reconciliation_resume() {
} }
run_drill_startup_intent_guard() { run_drill_startup_intent_guard() {
local intent_path="/var/lib/hecate/intent.json" local intent_path="/var/lib/ananke/intent.json"
local backup_path="/tmp/hecate-intent-pre-drill.json" local backup_path="/tmp/ananke-intent-pre-drill.json"
local inject_cmd=" local inject_cmd="
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
@ -376,12 +376,12 @@ else
sudo rm -f '${intent_path}' sudo rm -f '${intent_path}'
fi fi
" "
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard" local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
if [[ "${EXECUTE}" -eq 0 ]]; then if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'" log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<inject shutdown intent>'"
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)" log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'" log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<restore prior intent>'"
log "pass: startup-intent-guard (plan mode)" log "pass: startup-intent-guard (plan mode)"
return 0 return 0
fi fi
@ -406,10 +406,10 @@ run_drill_controlled_cycle() {
fi fi
log "running controlled shutdown cycle (poweroff disabled config)" log "running controlled shutdown cycle (poweroff disabled config)"
run_hecate_shutdown "drill-controlled-cycle-shutdown" run_ananke_shutdown "drill-controlled-cycle-shutdown"
log "running startup recovery cycle" log "running startup recovery cycle"
run_hecate_startup_with_retry "drill-controlled-cycle-startup" run_ananke_startup_with_retry "drill-controlled-cycle-startup"
log "verifying critical stack readiness after cycle" log "verifying critical stack readiness after cycle"
wait_ready flux-system deployment source-controller 240s wait_ready flux-system deployment source-controller 240s

View File

@ -2,13 +2,13 @@
set -euo pipefail set -euo pipefail
if [[ "${EUID}" -ne 0 ]]; then if [[ "${EUID}" -ne 0 ]]; then
echo "hecate-self-update.sh must run as root" >&2 echo "ananke-self-update.sh must run as root" >&2
exit 1 exit 1
fi fi
REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}" REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}"
BRANCH="${HECATE_REPO_BRANCH:-main}" BRANCH="${ANANKE_REPO_BRANCH:-main}"
REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}" REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}"
mkdir -p "$(dirname "${REPO_DIR}")" mkdir -p "$(dirname "${REPO_DIR}")"
if [[ ! -d "${REPO_DIR}/.git" ]]; then if [[ ! -d "${REPO_DIR}/.git" ]]; then