rename runtime surfaces from hecate to ananke

This commit is contained in:
Brad Stein 2026-04-07 13:13:58 -03:00
parent 169324ef4a
commit c605a083ee
23 changed files with 1938 additions and 256 deletions

View File

@ -1,7 +1,7 @@
.PHONY: build test fmt tidy install drill-list drill-run
build:
go build -o dist/hecate ./cmd/hecate
go build -o dist/ananke ./cmd/ananke
test:
go test ./...
@ -16,7 +16,7 @@ install:
sudo ./scripts/install.sh
drill-list:
./scripts/hecate-drills.sh list
./scripts/ananke-drills.sh list
drill-run:
./scripts/hecate-drills.sh run $(DRILL) --execute
./scripts/ananke-drills.sh run $(DRILL) --execute

View File

@ -6,6 +6,7 @@ import (
"flag"
"fmt"
"log"
"math"
"os"
"os/exec"
"os/signal"
@ -14,17 +15,17 @@ import (
"syscall"
"time"
"scm.bstein.dev/bstein/hecate/internal/cluster"
"scm.bstein.dev/bstein/hecate/internal/config"
"scm.bstein.dev/bstein/hecate/internal/execx"
"scm.bstein.dev/bstein/hecate/internal/service"
"scm.bstein.dev/bstein/hecate/internal/sshutil"
"scm.bstein.dev/bstein/hecate/internal/state"
"scm.bstein.dev/bstein/hecate/internal/ups"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/service"
"scm.bstein.dev/bstein/ananke/internal/sshutil"
"scm.bstein.dev/bstein/ananke/internal/state"
"scm.bstein.dev/bstein/ananke/internal/ups"
)
func main() {
logger := log.New(os.Stdout, "[hecate] ", log.LstdFlags)
logger := log.New(os.Stdout, "[ananke] ", log.LstdFlags)
if len(os.Args) < 2 {
usage()
os.Exit(2)
@ -73,7 +74,7 @@ func main() {
func runStartup(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("startup", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
@ -124,7 +125,7 @@ func runStartup(logger *log.Logger, args []string) error {
}
checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
if err := ensureStartupPowerSafe(checkCtx, targets); err != nil {
if err := ensureStartupPowerSafe(checkCtx, targets, cfg.Startup.MinimumBatteryPercent); err != nil {
return err
}
}
@ -141,10 +142,11 @@ func runStartup(logger *log.Logger, args []string) error {
func runShutdown(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("shutdown", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
mode := fs.String("mode", "config", "Shutdown mode: config|cluster-only|poweroff")
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
_ = fs.Parse(args)
@ -158,13 +160,14 @@ func runShutdown(logger *log.Logger, args []string) error {
return orch.Shutdown(ctx, cluster.ShutdownOptions{
SkipEtcdSnapshot: *skipEtcd,
SkipDrain: *skipDrain,
Mode: *mode,
Reason: *reason,
})
}
func runDaemon(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("daemon", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing")
_ = fs.Parse(args)
@ -191,7 +194,7 @@ func runDaemon(logger *log.Logger, args []string) error {
func runEtcdRestore(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
@ -211,7 +214,7 @@ func runEtcdRestore(logger *log.Logger, args []string) error {
func runStatus(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("status", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
_ = fs.Parse(args)
cfg, orch, err := buildOrchestrator(logger, *configPath, true)
@ -246,7 +249,7 @@ func runStatus(logger *log.Logger, args []string) error {
func runIntent(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("intent", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)")
reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)")
source := fs.String("source", "manual", "Intent source (used with --set)")
@ -314,7 +317,7 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
return targets, nil
}
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target, minimumBatteryPercent float64) error {
type targetState struct {
seenGood bool
lastErr error
@ -327,6 +330,7 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
const pollInterval = 3 * time.Second
for {
onBatteryTargets := []string{}
lowChargeTargets := []string{}
allSeen := true
for _, t := range targets {
key := t.Name + "|" + t.Target
@ -344,10 +348,25 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
if sample.OnBattery {
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
}
if minimumBatteryPercent > 0 && sample.BatteryCharge > 0 && sample.BatteryCharge < minimumBatteryPercent {
lowChargeTargets = append(
lowChargeTargets,
fmt.Sprintf(
"%s(charge=%.1f%%<%.1f%% status=%s)",
t.Name,
sample.BatteryCharge,
minimumBatteryPercent,
sample.RawStatus,
),
)
}
}
if len(onBatteryTargets) > 0 {
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
}
if len(lowChargeTargets) > 0 {
return fmt.Errorf("startup blocked: UPS battery charge below minimum for %s", strings.Join(lowChargeTargets, ", "))
}
if allSeen {
return nil
}
@ -366,7 +385,8 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
}
}
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | "))
roundedMin := math.Round(minimumBatteryPercent*10) / 10
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout (minimum_battery_percent=%.1f): %s", roundedMin, strings.Join(unverified, " | "))
case <-time.After(pollInterval):
}
}
@ -391,26 +411,26 @@ func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.
}
func usage() {
fmt.Print(`hecate: staged startup/shutdown + UPS-triggered protection
fmt.Print(`ananke: staged startup/shutdown + UPS-triggered protection
Usage:
hecate <command> [flags]
ananke <command> [flags]
Commands:
startup Perform staged cluster startup
shutdown Perform graceful cluster shutdown
etcd-restore Restore etcd from snapshot on a control plane
daemon Monitor UPS and auto-trigger shutdown
status Print current hecate status and estimates
status Print current ananke status and estimates
intent Read or manually set intent state
Examples:
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute
hecate daemon --config /etc/hecate/hecate.yaml
hecate status --config /etc/hecate/hecate.yaml
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute
ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main
ananke shutdown --config /etc/ananke/ananke.yaml --execute --reason "manual-maintenance"
ananke etcd-restore --config /etc/ananke/ananke.yaml --execute
ananke daemon --config /etc/ananke/ananke.yaml
ananke status --config /etc/ananke/ananke.yaml
ananke intent --config /etc/ananke/ananke.yaml --set normal --reason "manual-clear" --execute
`)
}
@ -439,7 +459,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
args := buildSSHBaseArgs(cfg)
remote := "sudo -n systemctl start hecate-bootstrap.service"
remote := "sudo -n systemctl start ananke-bootstrap.service"
attempt := 1
for {
cmdArgs := append(append([]string{}, args...), target, remote)
@ -480,7 +500,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
if user != "" {
target = user + "@" + host
}
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/hecate intent --config /etc/hecate/hecate.yaml"
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml"
args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost})
if err != nil {
@ -488,7 +508,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
return true, "coordinator unreachable", nil
}
trimmed := strings.TrimSpace(out)
if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
if strings.Contains(trimmed, "__ANANKE_BOOTSTRAP_ACTIVE__") {
return false, "coordinator bootstrap service is active", nil
}
remoteIntent, parseErr := state.ParseIntentOutput(trimmed)

View File

@ -1,5 +1,5 @@
# /etc/hecate/hecate.yaml
kubeconfig: /etc/hecate/kubeconfig
# /etc/ananke/ananke.yaml
kubeconfig: /etc/ananke/kubeconfig
ssh_user: atlas
ssh_port: 2277
ssh_config_file: ""
@ -11,6 +11,7 @@ ssh_jump_host: ""
ssh_jump_user: ""
iac_repo_path: /opt/titan-iac
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
control_planes:
- titan-0a
- titan-0b
@ -46,6 +47,10 @@ startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5
@ -67,9 +72,36 @@ startup:
post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5
post_start_probes:
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
- https://scm.bstein.dev/api/healthz
- https://metrics.bstein.dev/api/health
require_service_checklist: true
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
- name: grafana-api
url: https://metrics.bstein.dev/api/health
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
stuck_pod_grace_seconds: 180
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15
shutdown:
@ -103,7 +135,7 @@ ups:
coordination:
forward_shutdown_host: ""
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
forward_shutdown_config: /etc/ananke/ananke.yaml
peer_hosts: []
fallback_local_shutdown: true
command_timeout_seconds: 25
@ -115,7 +147,7 @@ metrics:
bind_addr: 0.0.0.0:9560
path: /metrics
state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock
intent_path: /var/lib/hecate/intent.json
dir: /var/lib/ananke
run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/ananke/intent.json

View File

@ -1,5 +1,5 @@
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
kubeconfig: /etc/hecate/kubeconfig
# /etc/ananke/ananke.yaml for titan-24 (tethys forwarder)
kubeconfig: /etc/ananke/kubeconfig
ssh_user: atlas
ssh_port: 2277
ssh_config_file: /home/tethys/.ssh/config
@ -58,6 +58,7 @@ ssh_jump_host: ""
ssh_jump_user: ""
iac_repo_path: /opt/titan-iac
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
control_planes:
- titan-0a
- titan-0b
@ -112,6 +113,10 @@ startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5
@ -133,10 +138,37 @@ startup:
post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5
post_start_probes:
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
- https://scm.bstein.dev/api/healthz
- https://metrics.bstein.dev/api/health
require_service_checklist: true
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
- name: grafana-api
url: https://metrics.bstein.dev/api/health
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
stuck_pod_grace_seconds: 180
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
shutdown:
default_budget_seconds: 1380
@ -167,7 +199,7 @@ ups:
coordination:
forward_shutdown_host: titan-db
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
forward_shutdown_config: /etc/ananke/ananke.yaml
peer_hosts:
- titan-db
fallback_local_shutdown: false
@ -180,7 +212,7 @@ metrics:
bind_addr: 0.0.0.0:9560
path: /metrics
state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock
intent_path: /var/lib/hecate/intent.json
dir: /var/lib/ananke
run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/ananke/intent.json

View File

@ -1,5 +1,5 @@
# /etc/hecate/hecate.yaml for titan-db (coordinator)
kubeconfig: /etc/hecate/kubeconfig
# /etc/ananke/ananke.yaml for titan-db (coordinator)
kubeconfig: /etc/ananke/kubeconfig
ssh_user: atlas
ssh_port: 2277
ssh_config_file: /home/atlas/.ssh/config
@ -58,6 +58,7 @@ ssh_jump_host: ""
ssh_jump_user: ""
iac_repo_path: /opt/titan-iac
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
control_planes:
- titan-0a
- titan-0b
@ -112,6 +113,10 @@ startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5
@ -133,10 +138,37 @@ startup:
post_start_probe_wait_seconds: 240
post_start_probe_poll_seconds: 5
post_start_probes:
- https://scm.bstein.dev/user/login
- https://metrics.bstein.dev/login
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
- https://scm.bstein.dev/api/healthz
- https://metrics.bstein.dev/api/health
require_service_checklist: true
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
- name: grafana-api
url: https://metrics.bstein.dev/api/health
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
stuck_pod_grace_seconds: 180
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
shutdown:
default_budget_seconds: 1380
@ -168,7 +200,7 @@ ups:
coordination:
forward_shutdown_host: ""
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
forward_shutdown_config: /etc/ananke/ananke.yaml
peer_hosts:
- titan-24
fallback_local_shutdown: true
@ -181,7 +213,7 @@ metrics:
bind_addr: 0.0.0.0:9560
path: /metrics
state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock
intent_path: /var/lib/hecate/intent.json
dir: /var/lib/ananke
run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/ananke/intent.json

View File

@ -1,15 +1,15 @@
[Unit]
Description=Hecate Staged Cluster Bootstrap
Description=Ananke Staged Cluster Bootstrap
Wants=network-online.target
After=network-online.target
ConditionPathExists=/etc/hecate/hecate.yaml
ConditionPathExists=/etc/ananke/ananke.yaml
StartLimitIntervalSec=0
[Service]
Type=oneshot
User=root
Group=root
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
ExecStart=/usr/local/bin/ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
Restart=on-failure
RestartSec=30
TimeoutStartSec=1800

View File

@ -1,5 +1,5 @@
[Unit]
Description=Hecate Self-Update and Reinstall
Description=Ananke Self-Update and Reinstall
Wants=network-online.target
After=network-online.target
@ -7,6 +7,7 @@ After=network-online.target
Type=oneshot
User=root
Group=root
ExecStart=/usr/local/lib/hecate/hecate-self-update.sh
ExecStart=/usr/local/lib/ananke/ananke-self-update.sh
TimeoutStartSec=1800
[Install]

View File

@ -1,12 +1,11 @@
[Unit]
Description=Periodic Hecate Self-Update Timer
Description=Periodic Ananke Self-Update Timer
[Timer]
OnBootSec=2m
OnUnitActiveSec=6h
Unit=hecate-update.service
Unit=ananke-update.service
Persistent=true
[Install]
WantedBy=timers.target

View File

@ -1,14 +1,14 @@
[Unit]
Description=Hecate UPS Monitor and Auto Shutdown Orchestrator
Description=Ananke UPS Monitor and Auto Shutdown Orchestrator
Wants=network-online.target
After=network-online.target
ConditionPathExists=/etc/hecate/hecate.yaml
ConditionPathExists=/etc/ananke/ananke.yaml
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml
ExecStart=/usr/local/bin/ananke daemon --config /etc/ananke/ananke.yaml
Restart=on-failure
RestartSec=5
NoNewPrivileges=true

2
go.mod
View File

@ -1,4 +1,4 @@
module scm.bstein.dev/bstein/hecate
module scm.bstein.dev/bstein/ananke
go 1.25

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,17 @@
package cluster
import (
"context"
"log"
"net/http"
"net/http/httptest"
"os"
"reflect"
"testing"
"time"
"scm.bstein.dev/bstein/hecate/internal/config"
"scm.bstein.dev/bstein/hecate/internal/state"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/state"
)
func TestParseVaultSealed(t *testing.T) {
@ -117,3 +120,75 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want)
}
}
// TestWorkloadTargetsIgnoredNodesByNodeSelector checks that a pod spec pinned
// to a node through the kubernetes.io/hostname nodeSelector is reported as
// targeting an ignored node when that node is in the ignore set.
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
	const pinnedNode = "titan-22"

	ignoredNodes := map[string]struct{}{pinnedNode: {}}
	pinned := podSpec{
		NodeSelector: map[string]string{"kubernetes.io/hostname": pinnedNode},
	}

	if targeted := workloadTargetsIgnoredNodes(pinned, ignoredNodes); !targeted {
		t.Fatalf("expected workload to target ignored node via nodeSelector")
	}
}
// TestParseWorkloadIgnoreRules parses one namespace/name entry and one
// namespace/kind/name entry, then checks that the two-part rule matches the
// workload regardless of the kind passed in, while the three-part rule only
// matches when the kind agrees.
func TestParseWorkloadIgnoreRules(t *testing.T) {
	parsed := parseWorkloadIgnoreRules([]string{
		"maintenance/metis",
		"crypto/statefulset/monerod",
	})

	// Cases are evaluated top to bottom and the first failing expectation
	// aborts the test, mirroring a chain of sequential guards.
	switch {
	case len(parsed) != 2:
		t.Fatalf("expected 2 ignore rules, got %d", len(parsed))
	case !workloadIgnored(parsed, "maintenance", "deployment", "metis"):
		t.Fatalf("expected namespace/name rule to match")
	case !workloadIgnored(parsed, "crypto", "statefulset", "monerod"):
		t.Fatalf("expected namespace/kind/name rule to match")
	case workloadIgnored(parsed, "crypto", "deployment", "monerod"):
		t.Fatalf("did not expect mismatched kind to match")
	}
}
// TestNamespaceCandidatesFromIgnoreKustomizations checks that the name part
// of each ignored "namespace/name" kustomization entry shows up as a key in
// the returned candidate set.
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
	ignoredKustomizations := []string{
		"flux-system/jellyfin",
		"flux-system/outline",
	}
	candidates := namespaceCandidatesFromIgnoreKustomizations(ignoredKustomizations)

	_, hasJellyfin := candidates["jellyfin"]
	_, hasOutline := candidates["outline"]

	if !hasJellyfin {
		t.Fatalf("expected jellyfin namespace candidate")
	}
	if !hasOutline {
		t.Fatalf("expected outline namespace candidate")
	}
}
// TestProbeStatusAcceptedRejects404 checks that an HTTP 404 response is not
// treated as an accepted probe status for the probed URL.
func TestProbeStatusAcceptedRejects404(t *testing.T) {
	const probeURL = "https://metrics.bstein.dev/login"

	accepted := probeStatusAccepted(probeURL, 404)
	if accepted {
		t.Fatalf("expected 404 probe status to be rejected")
	}
}
// TestServiceCheckReadyRequiresBodyContains serves a fixed JSON health body
// from a local test server and checks that a service check configured with a
// matching accepted status and body_contains substring is reported as ready.
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
	const healthBody = `{"database":"ok"}`

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(healthBody))
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	// Only the logger is populated; no other Orchestrator field is set here.
	o := &Orchestrator{log: log.New(os.Stdout, "", 0)}
	check := config.ServiceChecklistCheck{
		Name:             "grafana-api",
		URL:              server.URL,
		AcceptedStatuses: []int{200},
		BodyContains:     `"database":"ok"`,
		TimeoutSeconds:   5,
	}

	ready, detail := o.serviceCheckReady(context.Background(), check)
	if !ready {
		t.Fatalf("expected service check to pass, detail=%s", detail)
	}
}

View File

@ -2,6 +2,7 @@ package config
import (
"fmt"
neturl "net/url"
"os"
"strings"
@ -21,6 +22,7 @@ type Config struct {
SSHJumpUser string `yaml:"ssh_jump_user"`
IACRepoPath string `yaml:"iac_repo_path"`
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
ControlPlanes []string `yaml:"control_planes"`
Workers []string `yaml:"workers"`
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
@ -37,6 +39,8 @@ type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
@ -54,11 +58,38 @@ type Startup struct {
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
// ServiceChecklistCheck describes one HTTP check listed under
// startup.service_checklist in the YAML config.
//
// Config.Validate rejects an entry whose name is blank, whose URL is blank or
// lacks a scheme or host, whose timeout is not positive, or whose accepted
// statuses include a code outside 100-599.
type ServiceChecklistCheck struct {
// Name labels the check; validation requires it to be non-blank.
Name string `yaml:"name"`
// URL is the endpoint to check; validation requires a scheme and a host.
URL string `yaml:"url"`
// AcceptedStatuses lists HTTP status codes; each must be within 100-599.
// NOTE(review): validation permits an empty list — confirm how the checker
// treats that case.
AcceptedStatuses []int `yaml:"accepted_statuses"`
// BodyContains is presumably a substring the response body must include for
// the check to pass — confirm against the orchestrator's service check.
BodyContains string `yaml:"body_contains"`
// BodyNotContains is presumably a substring that must be absent from the
// response body — confirm against the orchestrator's service check.
BodyNotContains string `yaml:"body_not_contains"`
// TimeoutSeconds must be > 0 per validation; the name suggests a per-check
// request timeout in seconds — TODO confirm where it is applied.
TimeoutSeconds int `yaml:"timeout_seconds"`
// InsecureSkipTLS presumably disables TLS certificate verification for this
// check — confirm before enabling, as it is not validated here.
InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
}
type Shutdown struct {
DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
HistoryMinSamples int `yaml:"history_min_samples"`
@ -143,6 +174,9 @@ func (c Config) Validate() error {
if c.ExpectedFluxBranch == "" {
return fmt.Errorf("config.expected_flux_branch must not be empty")
}
if c.ExpectedFluxSource == "" {
return fmt.Errorf("config.expected_flux_source_url must not be empty")
}
if c.IACRepoPath == "" {
return fmt.Errorf("config.iac_repo_path must not be empty")
}
@ -176,6 +210,25 @@ func (c Config) Validate() error {
if c.Startup.ShutdownCooldownSeconds <= 0 {
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
}
if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 {
return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100")
}
for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
}
if len(labels) == 0 {
return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node)
}
for key, value := range labels {
if strings.TrimSpace(key) == "" {
return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node)
}
if strings.TrimSpace(value) == "" {
return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key)
}
}
}
if c.Startup.TimeSyncWaitSeconds <= 0 {
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
}
@ -223,11 +276,88 @@ func (c Config) Validate() error {
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
}
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0")
}
if c.Startup.ServiceChecklistPollSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0")
}
if c.Startup.ServiceChecklistStabilitySec < 0 {
return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0")
}
if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
}
for i, check := range c.Startup.ServiceChecklist {
if strings.TrimSpace(check.Name) == "" {
return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
}
rawURL := strings.TrimSpace(check.URL)
if rawURL == "" {
return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i)
}
parsed, err := neturl.Parse(rawURL)
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL)
}
if check.TimeoutSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
}
for _, code := range check.AcceptedStatuses {
if code < 100 || code > 599 {
return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)
}
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
}
if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
}
if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
}
for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
}
}
for _, item := range c.Startup.IgnoreFluxKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item)
}
}
for _, item := range c.Startup.IgnoreWorkloads {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.ignore_workloads entries must not be empty")
}
parts := strings.Split(item, "/")
if len(parts) != 2 && len(parts) != 3 {
return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item)
}
}
for _, ns := range c.Startup.IgnoreWorkloadNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
}
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
}
@ -276,6 +406,7 @@ func defaults() Config {
c := Config{
IACRepoPath: "/opt/titan-iac",
ExpectedFluxBranch: "main",
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
SSHPort: 2277,
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
LocalBootstrapPaths: []string{
@ -328,15 +459,53 @@ func defaults() Config {
"gitea/gitea-data",
"sso/keycloak-data",
},
MinimumBatteryPercent: 20,
RequiredNodeLabels: map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
},
RequirePostStartProbes: true,
PostStartProbeWaitSeconds: 240,
PostStartProbePollSeconds: 5,
PostStartProbes: []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/user/login",
"https://metrics.bstein.dev/login",
"https://scm.bstein.dev/api/healthz",
"https://metrics.bstein.dev/api/health",
},
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
RequireServiceChecklist: true,
ServiceChecklistWaitSeconds: 420,
ServiceChecklistPollSeconds: 5,
ServiceChecklistStabilitySec: 120,
ServiceChecklist: []ServiceChecklistCheck{
{
Name: "gitea-api",
URL: "https://scm.bstein.dev/api/healthz",
AcceptedStatuses: []int{200},
BodyContains: "pass",
TimeoutSeconds: 12,
},
{
Name: "grafana-api",
URL: "https://metrics.bstein.dev/api/health",
AcceptedStatuses: []int{200},
BodyContains: "\"database\":\"ok\"",
TimeoutSeconds: 12,
},
},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
@ -362,7 +531,7 @@ func defaults() Config {
TelemetryTimeoutSeconds: 90,
},
Coordination: Coordination{
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
PeerHosts: []string{},
FallbackLocalShutdown: true,
CommandTimeoutSeconds: 25,
@ -376,10 +545,10 @@ func defaults() Config {
Path: "/metrics",
},
State: State{
Dir: "/var/lib/hecate",
RunHistoryPath: "/var/lib/hecate/runs.json",
LockPath: "/var/lib/hecate/hecate.lock",
IntentPath: "/var/lib/hecate/intent.json",
Dir: "/var/lib/ananke",
RunHistoryPath: "/var/lib/ananke/runs.json",
LockPath: "/var/lib/ananke/ananke.lock",
IntentPath: "/var/lib/ananke/intent.json",
},
}
c.applyDefaults()
@ -393,6 +562,9 @@ func (c *Config) applyDefaults() {
if c.IACRepoPath == "" {
c.IACRepoPath = "/opt/titan-iac"
}
if c.ExpectedFluxSource == "" {
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
}
if c.Startup.APIWaitSeconds <= 0 {
c.Startup.APIWaitSeconds = 1200
}
@ -402,6 +574,16 @@ func (c *Config) applyDefaults() {
if c.Startup.ShutdownCooldownSeconds <= 0 {
c.Startup.ShutdownCooldownSeconds = 45
}
if c.Startup.MinimumBatteryPercent <= 0 {
c.Startup.MinimumBatteryPercent = 20
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
}
if c.Startup.TimeSyncWaitSeconds <= 0 {
c.Startup.TimeSyncWaitSeconds = 240
}
@ -446,12 +628,71 @@ func (c *Config) applyDefaults() {
if len(c.Startup.PostStartProbes) == 0 {
c.Startup.PostStartProbes = []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/user/login",
"https://metrics.bstein.dev/login",
"https://scm.bstein.dev/api/healthz",
"https://metrics.bstein.dev/api/health",
}
}
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
c.Startup.ServiceChecklistWaitSeconds = 420
}
if c.Startup.ServiceChecklistPollSeconds <= 0 {
c.Startup.ServiceChecklistPollSeconds = 5
}
if c.Startup.ServiceChecklistStabilitySec < 0 {
c.Startup.ServiceChecklistStabilitySec = 0
}
if len(c.Startup.ServiceChecklist) == 0 {
c.Startup.ServiceChecklist = []ServiceChecklistCheck{
{
Name: "gitea-api",
URL: "https://scm.bstein.dev/api/healthz",
AcceptedStatuses: []int{200},
BodyContains: "pass",
TimeoutSeconds: 12,
},
{
Name: "grafana-api",
URL: "https://metrics.bstein.dev/api/health",
AcceptedStatuses: []int{200},
BodyContains: "\"database\":\"ok\"",
TimeoutSeconds: 12,
},
}
}
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
c.Startup.WorkloadConvergenceWaitSeconds = 900
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
if c.Startup.IgnoreWorkloads == nil {
c.Startup.IgnoreWorkloads = []string{}
}
if c.Startup.IgnoreUnavailableNodes == nil {
c.Startup.IgnoreUnavailableNodes = []string{}
}
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
}
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
c.Startup.VaultUnsealBreakglassTimeout = 15
@ -496,7 +737,7 @@ func (c *Config) applyDefaults() {
c.UPS.TelemetryTimeoutSeconds = 90
}
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
}
if c.Coordination.PeerHosts == nil {
c.Coordination.PeerHosts = []string{}
@ -517,15 +758,15 @@ func (c *Config) applyDefaults() {
c.Metrics.Path = "/metrics"
}
if c.State.Dir == "" {
c.State.Dir = "/var/lib/hecate"
c.State.Dir = "/var/lib/ananke"
}
if c.State.RunHistoryPath == "" {
c.State.RunHistoryPath = "/var/lib/hecate/runs.json"
c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
}
if c.State.LockPath == "" {
c.State.LockPath = "/var/lib/hecate/hecate.lock"
c.State.LockPath = "/var/lib/ananke/ananke.lock"
}
if c.State.IntentPath == "" {
c.State.IntentPath = "/var/lib/hecate/intent.json"
c.State.IntentPath = "/var/lib/ananke/intent.json"
}
}

View File

@ -9,7 +9,7 @@ import (
func TestLoadAcceptsUPSTargets(t *testing.T) {
tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "hecate.yaml")
cfgPath := filepath.Join(tmp, "ananke.yaml")
raw := `
control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main
@ -24,7 +24,7 @@ shutdown:
default_budget_seconds: 300
state:
run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock
lock_path: /tmp/ananke.lock
`
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
t.Fatalf("write config: %v", err)
@ -74,7 +74,7 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "hecate.yaml")
cfgPath := filepath.Join(tmp, "ananke.yaml")
raw := `
control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main
@ -85,7 +85,7 @@ ups:
enabled: false
state:
run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock
lock_path: /tmp/ananke.lock
`
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
t.Fatalf("write config: %v", err)
@ -146,3 +146,55 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
t.Fatalf("expected validation error when post start probes are required but empty")
}
}
// TestValidateRejectsMissingServiceChecklistWhenRequired checks that Validate
// reports an error when the service checklist is marked as required but no
// checklist entries are configured.
func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
	c := defaults()
	// Drop the default checklist while keeping the requirement switched on.
	c.Startup.ServiceChecklist = nil
	c.Startup.RequireServiceChecklist = true

	err := c.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error when service checklist is required but empty")
}
func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
cfg := defaults()
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
{
Name: "grafana",
URL: "not-a-url",
AcceptedStatuses: []int{200},
TimeoutSeconds: 12,
},
}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected validation error for invalid service checklist url")
}
}
// TestValidateRejectsBadIgnoreFluxKustomizationFormat checks that Validate
// reports an error for an ignore_flux_kustomizations entry that is not in
// namespace/name form.
func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
	c := defaults()
	// "jellyfin" has no "/" separator, so it cannot be namespace/name.
	entries := []string{"jellyfin"}
	c.Startup.IgnoreFluxKustomizations = entries

	err := c.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid ignore_flux_kustomizations entry")
}
// TestValidateRejectsBadIgnoreWorkloadFormat checks that Validate reports an
// error for an ignore_workloads entry that is neither namespace/name nor
// namespace/kind/name.
func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
	c := defaults()
	// Four slash-separated segments: one more than the longest accepted form.
	entries := []string{"maintenance/metis/extra/value"}
	c.Startup.IgnoreWorkloads = entries

	err := c.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid ignore_workloads entry")
}
// TestValidateRejectsInvalidRequiredNodeLabel checks that Validate reports an
// error when required_node_labels contains a label with an empty key.
func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
	// The label key is deliberately empty; the value is an ordinary string.
	nodeLabels := map[string]string{"": "true"}

	c := defaults()
	c.Startup.RequiredNodeLabels = map[string]map[string]string{
		"titan-09": nodeLabels,
	}

	err := c.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid required_node_labels entry")
}

View File

@ -84,41 +84,41 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
var b strings.Builder
b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n")
b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n")
b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers))
b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n")
b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n")
b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n")
b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers))
b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n")
if e.lastShutdownAt.IsZero() {
b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n")
b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n")
} else {
b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
}
b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n")
b.WriteString("# TYPE hecate_ups_on_battery gauge\n")
b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n")
b.WriteString("# TYPE hecate_ups_low_battery gauge\n")
b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n")
b.WriteString("# HELP hecate_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
b.WriteString("# TYPE hecate_ups_battery_charge_percent gauge\n")
b.WriteString("# HELP hecate_ups_load_percent UPS output load percentage.\n")
b.WriteString("# TYPE hecate_ups_load_percent gauge\n")
b.WriteString("# HELP hecate_ups_power_nominal_watts UPS nominal power rating in watts.\n")
b.WriteString("# TYPE hecate_ups_power_nominal_watts gauge\n")
b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n")
b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
b.WriteString("# TYPE hecate_ups_trigger_active gauge\n")
b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n")
b.WriteString("# TYPE hecate_ups_breach_count gauge\n")
b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n")
b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n")
b.WriteString("# TYPE hecate_ups_error gauge\n")
b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n")
b.WriteString("# TYPE ananke_ups_on_battery gauge\n")
b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n")
b.WriteString("# TYPE ananke_ups_low_battery gauge\n")
b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n")
b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n")
b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n")
b.WriteString("# TYPE ananke_ups_load_percent gauge\n")
b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n")
b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n")
b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n")
b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
b.WriteString("# TYPE ananke_ups_trigger_active gauge\n")
b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n")
b.WriteString("# TYPE ananke_ups_breach_count gauge\n")
b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n")
b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n")
b.WriteString("# TYPE ananke_ups_error gauge\n")
names := make([]string, 0, len(e.samples))
for name := range e.samples {
@ -129,21 +129,21 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
s := e.samples[name]
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
b.WriteString(fmt.Sprintf("hecate_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
b.WriteString(fmt.Sprintf("hecate_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
b.WriteString(fmt.Sprintf("hecate_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount))
b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, s.BreachCount))
if s.UpdatedAt.IsZero() {
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels))
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels))
} else {
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
}
b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
}
_, _ = w.Write([]byte(b.String()))

View File

@ -33,14 +33,14 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
body := rr.Body.String()
mustContain := []string{
"hecate_shutdown_budget_seconds 321",
"hecate_shutdown_triggers_total 1",
"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"ananke_shutdown_budget_seconds 321",
"ananke_shutdown_triggers_total 1",
"ananke_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"ananke_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"ananke_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"ananke_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"ananke_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
"ananke_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
}
for _, m := range mustContain {
if !strings.Contains(body, m) {

View File

@ -12,12 +12,12 @@ import (
"strings"
"time"
"scm.bstein.dev/bstein/hecate/internal/cluster"
"scm.bstein.dev/bstein/hecate/internal/config"
"scm.bstein.dev/bstein/hecate/internal/metrics"
"scm.bstein.dev/bstein/hecate/internal/sshutil"
"scm.bstein.dev/bstein/hecate/internal/state"
"scm.bstein.dev/bstein/hecate/internal/ups"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/metrics"
"scm.bstein.dev/bstein/ananke/internal/sshutil"
"scm.bstein.dev/bstein/ananke/internal/state"
"scm.bstein.dev/bstein/ananke/internal/ups"
)
type Target struct {
@ -81,7 +81,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood[t.Name] = time.Now()
}
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
d.log.Printf("ananke daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
poll, debounce, telemetryTimeout, d.targetList())
for {
@ -198,7 +198,7 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
runCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/ananke shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
if d.cfg.Shutdown.EmergencySkipEtcd {
remoteCmd += " --skip-etcd-snapshot"
}

View File

@ -6,7 +6,7 @@ import (
"time"
)
// ParseIntentOutput parses `hecate intent` CLI output from local/remote commands.
// ParseIntentOutput parses `ananke intent` CLI output from local/remote commands.
func ParseIntentOutput(raw string) (Intent, error) {
for _, line := range strings.Split(raw, "\n") {
line = strings.TrimSpace(line)

View File

@ -61,7 +61,7 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
}
func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
raw := `[hecate] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
in, err := ParseIntentOutput(raw)
if err != nil {
t.Fatalf("parse intent output: %v", err)
@ -81,7 +81,7 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
}
func TestParseIntentOutputHandlesNone(t *testing.T) {
in, err := ParseIntentOutput(`[hecate] 2026/04/05 11:24:49 intent=none`)
in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`)
if err != nil {
t.Fatalf("parse none intent output: %v", err)
}

View File

@ -11,7 +11,7 @@ import (
)
func TestAcquireLockLifecycle(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
unlock, err := AcquireLock(lockPath)
if err != nil {
t.Fatalf("acquire lock: %v", err)
@ -26,7 +26,7 @@ func TestAcquireLockLifecycle(t *testing.T) {
}
func TestAcquireLockReclaimsStaleLock(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
t.Fatalf("write stale lock: %v", err)
}
@ -47,7 +47,7 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) {
}
func TestAcquireLockRejectsActiveLock(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil {
t.Fatalf("write active lock: %v", err)

View File

@ -2,23 +2,23 @@
set -Eeuo pipefail
KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}"
ANANKE_COORDINATOR_HOST="${ANANKE_COORDINATOR_HOST:-titan-db}"
ANANKE_BIN="${ANANKE_BIN:-/usr/local/bin/ananke}"
ANANKE_CONFIG="${ANANKE_CONFIG:-/etc/ananke/ananke.yaml}"
ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
EXECUTE=0
usage() {
cat <<'EOF'
Usage:
scripts/hecate-drills.sh list
scripts/hecate-drills.sh run <drill-name> [--execute]
scripts/ananke-drills.sh list
scripts/ananke-drills.sh run <drill-name> [--execute]
Drills:
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
@ -30,7 +30,7 @@ Drills:
Notes:
- Drills are intentionally disruptive and are not part of regular `make test`.
- Use --execute to run live changes. Without it, this script prints planned actions only.
- Optional relay: set HECATE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
- Optional relay: set ANANKE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
EOF
}
@ -98,47 +98,47 @@ wait_ready_keycloak() {
die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
}
run_hecate_startup() {
run_ananke_startup() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
local cmd=(sudo "${ANANKE_BIN}" startup --config "${ANANKE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
if [[ "${EXECUTE}" -eq 0 ]]; then
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
else
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
fi
return 0
fi
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
# shellcheck disable=SC2086
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
else
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
fi
}
run_hecate_shutdown() {
run_ananke_shutdown() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
local cmd=(sudo "${ANANKE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
if [[ "${EXECUTE}" -eq 0 ]]; then
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
else
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
fi
return 0
fi
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
# shellcheck disable=SC2086
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
else
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
fi
}
run_hecate_startup_with_retry() {
run_ananke_startup_with_retry() {
local reason="$1"
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
@ -161,11 +161,11 @@ run_hecate_startup_with_retry() {
run_coordinator_bash() {
local script="$1"
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
# shellcheck disable=SC2086
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "bash -se"
printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "bash -se"
else
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" "bash -se"
printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" "bash -se"
fi
}
@ -283,7 +283,7 @@ write_log_header() {
mkdir -p "${LOG_DIR}"
local f="${LOG_DIR}/${drill}-$(now_ts).log"
exec > >(tee -a "${f}") 2>&1
log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
log "drill=${drill} execute=${EXECUTE} coordinator=${ANANKE_COORDINATOR_HOST}"
}
run_drill_flux_gitea_deadlock() {
@ -303,7 +303,7 @@ run_drill_flux_gitea_deadlock() {
scale_to "$ns" "$kind" "$name" 0
done
run_hecate_startup "drill-flux-gitea-deadlock"
run_ananke_startup "drill-flux-gitea-deadlock"
log "verifying recovery"
wait_ready flux-system deployment source-controller 240s
@ -330,7 +330,7 @@ run_drill_foundation_recovery() {
scale_to "$ns" "$kind" "$name" 0
done
run_hecate_startup "drill-foundation-recovery"
run_ananke_startup "drill-foundation-recovery"
log "verifying layered recovery"
wait_ready vault statefulset vault 420s
@ -350,7 +350,7 @@ run_drill_reconciliation_resume() {
set_flux_suspend_all true
scale_to flux-system deployment source-controller 0
run_hecate_startup "drill-reconciliation-resume"
run_ananke_startup "drill-reconciliation-resume"
log "verifying reconciliation resumed"
wait_ready flux-system deployment source-controller 240s
@ -361,8 +361,8 @@ run_drill_reconciliation_resume() {
}
run_drill_startup_intent_guard() {
local intent_path="/var/lib/hecate/intent.json"
local backup_path="/tmp/hecate-intent-pre-drill.json"
local intent_path="/var/lib/ananke/intent.json"
local backup_path="/tmp/ananke-intent-pre-drill.json"
local inject_cmd="
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
@ -376,12 +376,12 @@ else
sudo rm -f '${intent_path}'
fi
"
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'"
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'"
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<inject shutdown intent>'"
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<restore prior intent>'"
log "pass: startup-intent-guard (plan mode)"
return 0
fi
@ -406,10 +406,10 @@ run_drill_controlled_cycle() {
fi
log "running controlled shutdown cycle (poweroff disabled config)"
run_hecate_shutdown "drill-controlled-cycle-shutdown"
run_ananke_shutdown "drill-controlled-cycle-shutdown"
log "running startup recovery cycle"
run_hecate_startup_with_retry "drill-controlled-cycle-startup"
run_ananke_startup_with_retry "drill-controlled-cycle-startup"
log "verifying critical stack readiness after cycle"
wait_ready flux-system deployment source-controller 240s

View File

@ -2,13 +2,13 @@
set -euo pipefail
if [[ "${EUID}" -ne 0 ]]; then
echo "hecate-self-update.sh must run as root" >&2
echo "ananke-self-update.sh must run as root" >&2
exit 1
fi
REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}"
BRANCH="${HECATE_REPO_BRANCH:-main}"
REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}"
REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}"
BRANCH="${ANANKE_REPO_BRANCH:-main}"
REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}"
mkdir -p "$(dirname "${REPO_DIR}")"
if [[ ! -d "${REPO_DIR}/.git" ]]; then