rename runtime surfaces from hecate to ananke
This commit is contained in:
parent
169324ef4a
commit
c605a083ee
6
Makefile
6
Makefile
@ -1,7 +1,7 @@
|
||||
.PHONY: build test fmt tidy install drill-list drill-run
|
||||
|
||||
build:
|
||||
go build -o dist/hecate ./cmd/hecate
|
||||
go build -o dist/ananke ./cmd/ananke
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
@ -16,7 +16,7 @@ install:
|
||||
sudo ./scripts/install.sh
|
||||
|
||||
drill-list:
|
||||
./scripts/hecate-drills.sh list
|
||||
./scripts/ananke-drills.sh list
|
||||
|
||||
drill-run:
|
||||
./scripts/hecate-drills.sh run $(DRILL) --execute
|
||||
./scripts/ananke-drills.sh run $(DRILL) --execute
|
||||
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
@ -14,17 +15,17 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||
"scm.bstein.dev/bstein/hecate/internal/execx"
|
||||
"scm.bstein.dev/bstein/hecate/internal/service"
|
||||
"scm.bstein.dev/bstein/hecate/internal/sshutil"
|
||||
"scm.bstein.dev/bstein/hecate/internal/state"
|
||||
"scm.bstein.dev/bstein/hecate/internal/ups"
|
||||
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||
"scm.bstein.dev/bstein/ananke/internal/service"
|
||||
"scm.bstein.dev/bstein/ananke/internal/sshutil"
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
"scm.bstein.dev/bstein/ananke/internal/ups"
|
||||
)
|
||||
|
||||
func main() {
|
||||
logger := log.New(os.Stdout, "[hecate] ", log.LstdFlags)
|
||||
logger := log.New(os.Stdout, "[ananke] ", log.LstdFlags)
|
||||
if len(os.Args) < 2 {
|
||||
usage()
|
||||
os.Exit(2)
|
||||
@ -73,7 +74,7 @@ func main() {
|
||||
|
||||
func runStartup(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("startup", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
||||
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
||||
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
||||
@ -124,7 +125,7 @@ func runStartup(logger *log.Logger, args []string) error {
|
||||
}
|
||||
checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
if err := ensureStartupPowerSafe(checkCtx, targets); err != nil {
|
||||
if err := ensureStartupPowerSafe(checkCtx, targets, cfg.Startup.MinimumBatteryPercent); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
@ -141,10 +142,11 @@ func runStartup(logger *log.Logger, args []string) error {
|
||||
|
||||
func runShutdown(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("shutdown", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
||||
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
|
||||
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
|
||||
mode := fs.String("mode", "config", "Shutdown mode: config|cluster-only|poweroff")
|
||||
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
|
||||
_ = fs.Parse(args)
|
||||
|
||||
@ -158,13 +160,14 @@ func runShutdown(logger *log.Logger, args []string) error {
|
||||
return orch.Shutdown(ctx, cluster.ShutdownOptions{
|
||||
SkipEtcdSnapshot: *skipEtcd,
|
||||
SkipDrain: *skipDrain,
|
||||
Mode: *mode,
|
||||
Reason: *reason,
|
||||
})
|
||||
}
|
||||
|
||||
func runDaemon(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("daemon", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||
dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing")
|
||||
_ = fs.Parse(args)
|
||||
|
||||
@ -191,7 +194,7 @@ func runDaemon(logger *log.Logger, args []string) error {
|
||||
|
||||
func runEtcdRestore(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||
execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
|
||||
controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
|
||||
snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
|
||||
@ -211,7 +214,7 @@ func runEtcdRestore(logger *log.Logger, args []string) error {
|
||||
|
||||
func runStatus(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("status", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||
_ = fs.Parse(args)
|
||||
|
||||
cfg, orch, err := buildOrchestrator(logger, *configPath, true)
|
||||
@ -246,7 +249,7 @@ func runStatus(logger *log.Logger, args []string) error {
|
||||
|
||||
func runIntent(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("intent", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||
setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)")
|
||||
reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)")
|
||||
source := fs.String("source", "manual", "Intent source (used with --set)")
|
||||
@ -314,7 +317,7 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
|
||||
return targets, nil
|
||||
}
|
||||
|
||||
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
|
||||
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target, minimumBatteryPercent float64) error {
|
||||
type targetState struct {
|
||||
seenGood bool
|
||||
lastErr error
|
||||
@ -327,6 +330,7 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
||||
const pollInterval = 3 * time.Second
|
||||
for {
|
||||
onBatteryTargets := []string{}
|
||||
lowChargeTargets := []string{}
|
||||
allSeen := true
|
||||
for _, t := range targets {
|
||||
key := t.Name + "|" + t.Target
|
||||
@ -344,10 +348,25 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
||||
if sample.OnBattery {
|
||||
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
|
||||
}
|
||||
if minimumBatteryPercent > 0 && sample.BatteryCharge > 0 && sample.BatteryCharge < minimumBatteryPercent {
|
||||
lowChargeTargets = append(
|
||||
lowChargeTargets,
|
||||
fmt.Sprintf(
|
||||
"%s(charge=%.1f%%<%.1f%% status=%s)",
|
||||
t.Name,
|
||||
sample.BatteryCharge,
|
||||
minimumBatteryPercent,
|
||||
sample.RawStatus,
|
||||
),
|
||||
)
|
||||
}
|
||||
}
|
||||
if len(onBatteryTargets) > 0 {
|
||||
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
|
||||
}
|
||||
if len(lowChargeTargets) > 0 {
|
||||
return fmt.Errorf("startup blocked: UPS battery charge below minimum for %s", strings.Join(lowChargeTargets, ", "))
|
||||
}
|
||||
if allSeen {
|
||||
return nil
|
||||
}
|
||||
@ -366,7 +385,8 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
||||
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | "))
|
||||
roundedMin := math.Round(minimumBatteryPercent*10) / 10
|
||||
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout (minimum_battery_percent=%.1f): %s", roundedMin, strings.Join(unverified, " | "))
|
||||
case <-time.After(pollInterval):
|
||||
}
|
||||
}
|
||||
@ -391,26 +411,26 @@ func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.
|
||||
}
|
||||
|
||||
func usage() {
|
||||
fmt.Print(`hecate: staged startup/shutdown + UPS-triggered protection
|
||||
fmt.Print(`ananke: staged startup/shutdown + UPS-triggered protection
|
||||
|
||||
Usage:
|
||||
hecate <command> [flags]
|
||||
ananke <command> [flags]
|
||||
|
||||
Commands:
|
||||
startup Perform staged cluster startup
|
||||
shutdown Perform graceful cluster shutdown
|
||||
etcd-restore Restore etcd from snapshot on a control plane
|
||||
daemon Monitor UPS and auto-trigger shutdown
|
||||
status Print current hecate status and estimates
|
||||
status Print current ananke status and estimates
|
||||
intent Read or manually set intent state
|
||||
|
||||
Examples:
|
||||
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
|
||||
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
|
||||
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute
|
||||
hecate daemon --config /etc/hecate/hecate.yaml
|
||||
hecate status --config /etc/hecate/hecate.yaml
|
||||
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute
|
||||
ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main
|
||||
ananke shutdown --config /etc/ananke/ananke.yaml --execute --reason "manual-maintenance"
|
||||
ananke etcd-restore --config /etc/ananke/ananke.yaml --execute
|
||||
ananke daemon --config /etc/ananke/ananke.yaml
|
||||
ananke status --config /etc/ananke/ananke.yaml
|
||||
ananke intent --config /etc/ananke/ananke.yaml --set normal --reason "manual-clear" --execute
|
||||
`)
|
||||
}
|
||||
|
||||
@ -439,7 +459,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
|
||||
|
||||
args := buildSSHBaseArgs(cfg)
|
||||
|
||||
remote := "sudo -n systemctl start hecate-bootstrap.service"
|
||||
remote := "sudo -n systemctl start ananke-bootstrap.service"
|
||||
attempt := 1
|
||||
for {
|
||||
cmdArgs := append(append([]string{}, args...), target, remote)
|
||||
@ -480,7 +500,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
|
||||
if user != "" {
|
||||
target = user + "@" + host
|
||||
}
|
||||
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/hecate intent --config /etc/hecate/hecate.yaml"
|
||||
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml"
|
||||
args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
|
||||
out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost})
|
||||
if err != nil {
|
||||
@ -488,7 +508,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
|
||||
return true, "coordinator unreachable", nil
|
||||
}
|
||||
trimmed := strings.TrimSpace(out)
|
||||
if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
|
||||
if strings.Contains(trimmed, "__ANANKE_BOOTSTRAP_ACTIVE__") {
|
||||
return false, "coordinator bootstrap service is active", nil
|
||||
}
|
||||
remoteIntent, parseErr := state.ParseIntentOutput(trimmed)
|
||||
@ -1,5 +1,5 @@
|
||||
# /etc/hecate/hecate.yaml
|
||||
kubeconfig: /etc/hecate/kubeconfig
|
||||
# /etc/ananke/ananke.yaml
|
||||
kubeconfig: /etc/ananke/kubeconfig
|
||||
ssh_user: atlas
|
||||
ssh_port: 2277
|
||||
ssh_config_file: ""
|
||||
@ -11,6 +11,7 @@ ssh_jump_host: ""
|
||||
ssh_jump_user: ""
|
||||
iac_repo_path: /opt/titan-iac
|
||||
expected_flux_branch: main
|
||||
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
control_planes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
@ -46,6 +47,10 @@ startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
shutdown_cooldown_seconds: 45
|
||||
minimum_battery_percent: 20
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
time_sync_poll_seconds: 5
|
||||
@ -67,9 +72,36 @@ startup:
|
||||
post_start_probe_wait_seconds: 240
|
||||
post_start_probe_poll_seconds: 5
|
||||
post_start_probes:
|
||||
- https://scm.bstein.dev/user/login
|
||||
- https://metrics.bstein.dev/login
|
||||
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
||||
- https://scm.bstein.dev/api/healthz
|
||||
- https://metrics.bstein.dev/api/health
|
||||
require_service_checklist: true
|
||||
service_checklist_wait_seconds: 420
|
||||
service_checklist_poll_seconds: 5
|
||||
service_checklist_stability_seconds: 120
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
accepted_statuses: [200]
|
||||
body_contains: pass
|
||||
timeout_seconds: 12
|
||||
- name: grafana-api
|
||||
url: https://metrics.bstein.dev/api/health
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"database":"ok"'
|
||||
timeout_seconds: 12
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
auto_recycle_stuck_pods: true
|
||||
stuck_pod_grace_seconds: 180
|
||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||
vault_unseal_breakglass_command: ""
|
||||
vault_unseal_breakglass_timeout_seconds: 15
|
||||
shutdown:
|
||||
@ -103,7 +135,7 @@ ups:
|
||||
coordination:
|
||||
forward_shutdown_host: ""
|
||||
forward_shutdown_user: atlas
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
forward_shutdown_config: /etc/ananke/ananke.yaml
|
||||
peer_hosts: []
|
||||
fallback_local_shutdown: true
|
||||
command_timeout_seconds: 25
|
||||
@ -115,7 +147,7 @@ metrics:
|
||||
bind_addr: 0.0.0.0:9560
|
||||
path: /metrics
|
||||
state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
intent_path: /var/lib/hecate/intent.json
|
||||
dir: /var/lib/ananke
|
||||
run_history_path: /var/lib/ananke/runs.json
|
||||
lock_path: /var/lib/ananke/ananke.lock
|
||||
intent_path: /var/lib/ananke/intent.json
|
||||
@ -1,5 +1,5 @@
|
||||
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
|
||||
kubeconfig: /etc/hecate/kubeconfig
|
||||
# /etc/ananke/ananke.yaml for titan-24 (tethys forwarder)
|
||||
kubeconfig: /etc/ananke/kubeconfig
|
||||
ssh_user: atlas
|
||||
ssh_port: 2277
|
||||
ssh_config_file: /home/tethys/.ssh/config
|
||||
@ -58,6 +58,7 @@ ssh_jump_host: ""
|
||||
ssh_jump_user: ""
|
||||
iac_repo_path: /opt/titan-iac
|
||||
expected_flux_branch: main
|
||||
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
control_planes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
@ -112,6 +113,10 @@ startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
shutdown_cooldown_seconds: 45
|
||||
minimum_battery_percent: 20
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
time_sync_poll_seconds: 5
|
||||
@ -133,10 +138,37 @@ startup:
|
||||
post_start_probe_wait_seconds: 240
|
||||
post_start_probe_poll_seconds: 5
|
||||
post_start_probes:
|
||||
- https://scm.bstein.dev/user/login
|
||||
- https://metrics.bstein.dev/login
|
||||
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
|
||||
- https://scm.bstein.dev/api/healthz
|
||||
- https://metrics.bstein.dev/api/health
|
||||
require_service_checklist: true
|
||||
service_checklist_wait_seconds: 420
|
||||
service_checklist_poll_seconds: 5
|
||||
service_checklist_stability_seconds: 120
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
accepted_statuses: [200]
|
||||
body_contains: pass
|
||||
timeout_seconds: 12
|
||||
- name: grafana-api
|
||||
url: https://metrics.bstein.dev/api/health
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"database":"ok"'
|
||||
timeout_seconds: 12
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
auto_recycle_stuck_pods: true
|
||||
stuck_pod_grace_seconds: 180
|
||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||
vault_unseal_breakglass_timeout_seconds: 15
|
||||
shutdown:
|
||||
default_budget_seconds: 1380
|
||||
@ -167,7 +199,7 @@ ups:
|
||||
coordination:
|
||||
forward_shutdown_host: titan-db
|
||||
forward_shutdown_user: atlas
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
forward_shutdown_config: /etc/ananke/ananke.yaml
|
||||
peer_hosts:
|
||||
- titan-db
|
||||
fallback_local_shutdown: false
|
||||
@ -180,7 +212,7 @@ metrics:
|
||||
bind_addr: 0.0.0.0:9560
|
||||
path: /metrics
|
||||
state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
intent_path: /var/lib/hecate/intent.json
|
||||
dir: /var/lib/ananke
|
||||
run_history_path: /var/lib/ananke/runs.json
|
||||
lock_path: /var/lib/ananke/ananke.lock
|
||||
intent_path: /var/lib/ananke/intent.json
|
||||
@ -1,5 +1,5 @@
|
||||
# /etc/hecate/hecate.yaml for titan-db (coordinator)
|
||||
kubeconfig: /etc/hecate/kubeconfig
|
||||
# /etc/ananke/ananke.yaml for titan-db (coordinator)
|
||||
kubeconfig: /etc/ananke/kubeconfig
|
||||
ssh_user: atlas
|
||||
ssh_port: 2277
|
||||
ssh_config_file: /home/atlas/.ssh/config
|
||||
@ -58,6 +58,7 @@ ssh_jump_host: ""
|
||||
ssh_jump_user: ""
|
||||
iac_repo_path: /opt/titan-iac
|
||||
expected_flux_branch: main
|
||||
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
control_planes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
@ -112,6 +113,10 @@ startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
shutdown_cooldown_seconds: 45
|
||||
minimum_battery_percent: 20
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
time_sync_poll_seconds: 5
|
||||
@ -133,10 +138,37 @@ startup:
|
||||
post_start_probe_wait_seconds: 240
|
||||
post_start_probe_poll_seconds: 5
|
||||
post_start_probes:
|
||||
- https://scm.bstein.dev/user/login
|
||||
- https://metrics.bstein.dev/login
|
||||
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
|
||||
- https://scm.bstein.dev/api/healthz
|
||||
- https://metrics.bstein.dev/api/health
|
||||
require_service_checklist: true
|
||||
service_checklist_wait_seconds: 420
|
||||
service_checklist_poll_seconds: 5
|
||||
service_checklist_stability_seconds: 120
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
accepted_statuses: [200]
|
||||
body_contains: pass
|
||||
timeout_seconds: 12
|
||||
- name: grafana-api
|
||||
url: https://metrics.bstein.dev/api/health
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"database":"ok"'
|
||||
timeout_seconds: 12
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
auto_recycle_stuck_pods: true
|
||||
stuck_pod_grace_seconds: 180
|
||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||
vault_unseal_breakglass_timeout_seconds: 15
|
||||
shutdown:
|
||||
default_budget_seconds: 1380
|
||||
@ -168,7 +200,7 @@ ups:
|
||||
coordination:
|
||||
forward_shutdown_host: ""
|
||||
forward_shutdown_user: atlas
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
forward_shutdown_config: /etc/ananke/ananke.yaml
|
||||
peer_hosts:
|
||||
- titan-24
|
||||
fallback_local_shutdown: true
|
||||
@ -181,7 +213,7 @@ metrics:
|
||||
bind_addr: 0.0.0.0:9560
|
||||
path: /metrics
|
||||
state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
intent_path: /var/lib/hecate/intent.json
|
||||
dir: /var/lib/ananke
|
||||
run_history_path: /var/lib/ananke/runs.json
|
||||
lock_path: /var/lib/ananke/ananke.lock
|
||||
intent_path: /var/lib/ananke/intent.json
|
||||
@ -1,15 +1,15 @@
|
||||
[Unit]
|
||||
Description=Hecate Staged Cluster Bootstrap
|
||||
Description=Ananke Staged Cluster Bootstrap
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
ConditionPathExists=/etc/hecate/hecate.yaml
|
||||
ConditionPathExists=/etc/ananke/ananke.yaml
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
|
||||
ExecStart=/usr/local/bin/ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
|
||||
Restart=on-failure
|
||||
RestartSec=30
|
||||
TimeoutStartSec=1800
|
||||
@ -1,5 +1,5 @@
|
||||
[Unit]
|
||||
Description=Hecate Self-Update and Reinstall
|
||||
Description=Ananke Self-Update and Reinstall
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
@ -7,6 +7,7 @@ After=network-online.target
|
||||
Type=oneshot
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/lib/hecate/hecate-self-update.sh
|
||||
ExecStart=/usr/local/lib/ananke/ananke-self-update.sh
|
||||
TimeoutStartSec=1800
|
||||
|
||||
[Install]
|
||||
@ -1,12 +1,11 @@
|
||||
[Unit]
|
||||
Description=Periodic Hecate Self-Update Timer
|
||||
Description=Periodic Ananke Self-Update Timer
|
||||
|
||||
[Timer]
|
||||
OnBootSec=2m
|
||||
OnUnitActiveSec=6h
|
||||
Unit=hecate-update.service
|
||||
Unit=ananke-update.service
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
||||
@ -1,14 +1,14 @@
|
||||
[Unit]
|
||||
Description=Hecate UPS Monitor and Auto Shutdown Orchestrator
|
||||
Description=Ananke UPS Monitor and Auto Shutdown Orchestrator
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
ConditionPathExists=/etc/hecate/hecate.yaml
|
||||
ConditionPathExists=/etc/ananke/ananke.yaml
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml
|
||||
ExecStart=/usr/local/bin/ananke daemon --config /etc/ananke/ananke.yaml
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
NoNewPrivileges=true
|
||||
2
go.mod
2
go.mod
@ -1,4 +1,4 @@
|
||||
module scm.bstein.dev/bstein/hecate
|
||||
module scm.bstein.dev/bstein/ananke
|
||||
|
||||
go 1.25
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,17 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"reflect"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||
"scm.bstein.dev/bstein/hecate/internal/state"
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
func TestParseVaultSealed(t *testing.T) {
|
||||
@ -117,3 +120,75 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
|
||||
t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
|
||||
spec := podSpec{
|
||||
NodeSelector: map[string]string{
|
||||
"kubernetes.io/hostname": "titan-22",
|
||||
},
|
||||
}
|
||||
ignored := map[string]struct{}{"titan-22": {}}
|
||||
if !workloadTargetsIgnoredNodes(spec, ignored) {
|
||||
t.Fatalf("expected workload to target ignored node via nodeSelector")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseWorkloadIgnoreRules(t *testing.T) {
|
||||
rules := parseWorkloadIgnoreRules([]string{
|
||||
"maintenance/metis",
|
||||
"crypto/statefulset/monerod",
|
||||
})
|
||||
if len(rules) != 2 {
|
||||
t.Fatalf("expected 2 ignore rules, got %d", len(rules))
|
||||
}
|
||||
if !workloadIgnored(rules, "maintenance", "deployment", "metis") {
|
||||
t.Fatalf("expected namespace/name rule to match")
|
||||
}
|
||||
if !workloadIgnored(rules, "crypto", "statefulset", "monerod") {
|
||||
t.Fatalf("expected namespace/kind/name rule to match")
|
||||
}
|
||||
if workloadIgnored(rules, "crypto", "deployment", "monerod") {
|
||||
t.Fatalf("did not expect mismatched kind to match")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
|
||||
got := namespaceCandidatesFromIgnoreKustomizations([]string{
|
||||
"flux-system/jellyfin",
|
||||
"flux-system/outline",
|
||||
})
|
||||
if _, ok := got["jellyfin"]; !ok {
|
||||
t.Fatalf("expected jellyfin namespace candidate")
|
||||
}
|
||||
if _, ok := got["outline"]; !ok {
|
||||
t.Fatalf("expected outline namespace candidate")
|
||||
}
|
||||
}
|
||||
|
||||
func TestProbeStatusAcceptedRejects404(t *testing.T) {
|
||||
if probeStatusAccepted("https://metrics.bstein.dev/login", 404) {
|
||||
t.Fatalf("expected 404 probe status to be rejected")
|
||||
}
|
||||
}
|
||||
|
||||
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"database":"ok"}`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
orch := &Orchestrator{
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
|
||||
Name: "grafana-api",
|
||||
URL: srv.URL,
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: `"database":"ok"`,
|
||||
TimeoutSeconds: 5,
|
||||
})
|
||||
if !ok {
|
||||
t.Fatalf("expected service check to pass, detail=%s", detail)
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
neturl "net/url"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
@ -21,6 +22,7 @@ type Config struct {
|
||||
SSHJumpUser string `yaml:"ssh_jump_user"`
|
||||
IACRepoPath string `yaml:"iac_repo_path"`
|
||||
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
|
||||
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
|
||||
ControlPlanes []string `yaml:"control_planes"`
|
||||
Workers []string `yaml:"workers"`
|
||||
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
|
||||
@ -37,6 +39,8 @@ type Startup struct {
|
||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||
@ -54,11 +58,38 @@ type Startup struct {
|
||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||
PostStartProbes []string `yaml:"post_start_probes"`
|
||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||
}
|
||||
|
||||
// ServiceChecklistCheck describes one entry of the startup service checklist
// (YAML key startup.service_checklist). Each field maps to the YAML key named
// in its struct tag.
type ServiceChecklistCheck struct {
	// Name is a label for the check (e.g. "gitea-api").
	Name string `yaml:"name"`
	// URL is the endpoint to request.
	URL string `yaml:"url"`
	// AcceptedStatuses lists HTTP status codes treated as passing.
	// NOTE(review): behavior for an empty list is not visible here — confirm.
	AcceptedStatuses []int `yaml:"accepted_statuses"`
	// BodyContains is presumably a substring the response body must include.
	BodyContains string `yaml:"body_contains"`
	// BodyNotContains is presumably a substring the response body must not
	// include — confirm against the checker implementation.
	BodyNotContains string `yaml:"body_not_contains"`
	// TimeoutSeconds is the per-check timeout in seconds.
	TimeoutSeconds int `yaml:"timeout_seconds"`
	// InsecureSkipTLS presumably disables TLS certificate verification for
	// this check — confirm before enabling.
	InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
}
|
||||
|
||||
type Shutdown struct {
|
||||
DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
|
||||
HistoryMinSamples int `yaml:"history_min_samples"`
|
||||
@ -143,6 +174,9 @@ func (c Config) Validate() error {
|
||||
if c.ExpectedFluxBranch == "" {
|
||||
return fmt.Errorf("config.expected_flux_branch must not be empty")
|
||||
}
|
||||
if c.ExpectedFluxSource == "" {
|
||||
return fmt.Errorf("config.expected_flux_source_url must not be empty")
|
||||
}
|
||||
if c.IACRepoPath == "" {
|
||||
return fmt.Errorf("config.iac_repo_path must not be empty")
|
||||
}
|
||||
@ -176,6 +210,25 @@ func (c Config) Validate() error {
|
||||
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 {
|
||||
return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100")
|
||||
}
|
||||
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||
}
|
||||
if len(labels) == 0 {
|
||||
return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node)
|
||||
}
|
||||
for key, value := range labels {
|
||||
if strings.TrimSpace(key) == "" {
|
||||
return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node)
|
||||
}
|
||||
if strings.TrimSpace(value) == "" {
|
||||
return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
|
||||
}
|
||||
@ -223,11 +276,88 @@ func (c Config) Validate() error {
|
||||
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
|
||||
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
|
||||
}
|
||||
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
||||
return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0")
|
||||
}
|
||||
if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
|
||||
return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
|
||||
}
|
||||
for i, check := range c.Startup.ServiceChecklist {
|
||||
if strings.TrimSpace(check.Name) == "" {
|
||||
return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
|
||||
}
|
||||
rawURL := strings.TrimSpace(check.URL)
|
||||
if rawURL == "" {
|
||||
return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i)
|
||||
}
|
||||
parsed, err := neturl.Parse(rawURL)
|
||||
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
|
||||
return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL)
|
||||
}
|
||||
if check.TimeoutSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
|
||||
}
|
||||
for _, code := range check.AcceptedStatuses {
|
||||
if code < 100 || code > 599 {
|
||||
return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||
}
|
||||
for _, probe := range c.Startup.PostStartProbes {
|
||||
if strings.TrimSpace(probe) == "" {
|
||||
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
||||
}
|
||||
}
|
||||
for _, item := range c.Startup.IgnoreFluxKustomizations {
|
||||
item = strings.TrimSpace(item)
|
||||
if item == "" {
|
||||
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty")
|
||||
}
|
||||
if strings.Count(item, "/") != 1 {
|
||||
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item)
|
||||
}
|
||||
}
|
||||
for _, item := range c.Startup.IgnoreWorkloads {
|
||||
item = strings.TrimSpace(item)
|
||||
if item == "" {
|
||||
return fmt.Errorf("config.startup.ignore_workloads entries must not be empty")
|
||||
}
|
||||
parts := strings.Split(item, "/")
|
||||
if len(parts) != 2 && len(parts) != 3 {
|
||||
return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item)
|
||||
}
|
||||
}
|
||||
for _, ns := range c.Startup.IgnoreWorkloadNamespaces {
|
||||
if strings.TrimSpace(ns) == "" {
|
||||
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
||||
}
|
||||
}
|
||||
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
|
||||
}
|
||||
@ -276,6 +406,7 @@ func defaults() Config {
|
||||
c := Config{
|
||||
IACRepoPath: "/opt/titan-iac",
|
||||
ExpectedFluxBranch: "main",
|
||||
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
|
||||
SSHPort: 2277,
|
||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||
LocalBootstrapPaths: []string{
|
||||
@ -328,15 +459,53 @@ func defaults() Config {
|
||||
"gitea/gitea-data",
|
||||
"sso/keycloak-data",
|
||||
},
|
||||
MinimumBatteryPercent: 20,
|
||||
RequiredNodeLabels: map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||
},
|
||||
},
|
||||
RequirePostStartProbes: true,
|
||||
PostStartProbeWaitSeconds: 240,
|
||||
PostStartProbePollSeconds: 5,
|
||||
PostStartProbes: []string{
|
||||
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||
"https://scm.bstein.dev/user/login",
|
||||
"https://metrics.bstein.dev/login",
|
||||
"https://scm.bstein.dev/api/healthz",
|
||||
"https://metrics.bstein.dev/api/health",
|
||||
},
|
||||
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
|
||||
RequireServiceChecklist: true,
|
||||
ServiceChecklistWaitSeconds: 420,
|
||||
ServiceChecklistPollSeconds: 5,
|
||||
ServiceChecklistStabilitySec: 120,
|
||||
ServiceChecklist: []ServiceChecklistCheck{
|
||||
{
|
||||
Name: "gitea-api",
|
||||
URL: "https://scm.bstein.dev/api/healthz",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "pass",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "grafana-api",
|
||||
URL: "https://metrics.bstein.dev/api/health",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "\"database\":\"ok\"",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
},
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
FluxHealthPollSeconds: 5,
|
||||
IgnoreFluxKustomizations: []string{},
|
||||
RequireWorkloadConvergence: true,
|
||||
WorkloadConvergenceWaitSeconds: 900,
|
||||
WorkloadConvergencePollSeconds: 5,
|
||||
IgnoreWorkloadNamespaces: []string{},
|
||||
IgnoreWorkloads: []string{},
|
||||
IgnoreUnavailableNodes: []string{},
|
||||
AutoRecycleStuckPods: true,
|
||||
StuckPodGraceSeconds: 180,
|
||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||
VaultUnsealBreakglassTimeout: 15,
|
||||
},
|
||||
Shutdown: Shutdown{
|
||||
@ -362,7 +531,7 @@ func defaults() Config {
|
||||
TelemetryTimeoutSeconds: 90,
|
||||
},
|
||||
Coordination: Coordination{
|
||||
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
||||
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
|
||||
PeerHosts: []string{},
|
||||
FallbackLocalShutdown: true,
|
||||
CommandTimeoutSeconds: 25,
|
||||
@ -376,10 +545,10 @@ func defaults() Config {
|
||||
Path: "/metrics",
|
||||
},
|
||||
State: State{
|
||||
Dir: "/var/lib/hecate",
|
||||
RunHistoryPath: "/var/lib/hecate/runs.json",
|
||||
LockPath: "/var/lib/hecate/hecate.lock",
|
||||
IntentPath: "/var/lib/hecate/intent.json",
|
||||
Dir: "/var/lib/ananke",
|
||||
RunHistoryPath: "/var/lib/ananke/runs.json",
|
||||
LockPath: "/var/lib/ananke/ananke.lock",
|
||||
IntentPath: "/var/lib/ananke/intent.json",
|
||||
},
|
||||
}
|
||||
c.applyDefaults()
|
||||
@ -393,6 +562,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.IACRepoPath == "" {
|
||||
c.IACRepoPath = "/opt/titan-iac"
|
||||
}
|
||||
if c.ExpectedFluxSource == "" {
|
||||
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
|
||||
}
|
||||
if c.Startup.APIWaitSeconds <= 0 {
|
||||
c.Startup.APIWaitSeconds = 1200
|
||||
}
|
||||
@ -402,6 +574,16 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
||||
c.Startup.ShutdownCooldownSeconds = 45
|
||||
}
|
||||
if c.Startup.MinimumBatteryPercent <= 0 {
|
||||
c.Startup.MinimumBatteryPercent = 20
|
||||
}
|
||||
if c.Startup.RequiredNodeLabels == nil {
|
||||
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||
},
|
||||
}
|
||||
}
|
||||
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
||||
c.Startup.TimeSyncWaitSeconds = 240
|
||||
}
|
||||
@ -446,12 +628,71 @@ func (c *Config) applyDefaults() {
|
||||
if len(c.Startup.PostStartProbes) == 0 {
|
||||
c.Startup.PostStartProbes = []string{
|
||||
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||
"https://scm.bstein.dev/user/login",
|
||||
"https://metrics.bstein.dev/login",
|
||||
"https://scm.bstein.dev/api/healthz",
|
||||
"https://metrics.bstein.dev/api/health",
|
||||
}
|
||||
}
|
||||
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
||||
c.Startup.ServiceChecklistWaitSeconds = 420
|
||||
}
|
||||
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
||||
c.Startup.ServiceChecklistPollSeconds = 5
|
||||
}
|
||||
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
||||
c.Startup.ServiceChecklistStabilitySec = 0
|
||||
}
|
||||
if len(c.Startup.ServiceChecklist) == 0 {
|
||||
c.Startup.ServiceChecklist = []ServiceChecklistCheck{
|
||||
{
|
||||
Name: "gitea-api",
|
||||
URL: "https://scm.bstein.dev/api/healthz",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "pass",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "grafana-api",
|
||||
URL: "https://metrics.bstein.dev/api/health",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "\"database\":\"ok\"",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
}
|
||||
}
|
||||
for i := range c.Startup.ServiceChecklist {
|
||||
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||
}
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
c.Startup.FluxHealthWaitSeconds = 900
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
c.Startup.FluxHealthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||
c.Startup.WorkloadConvergenceWaitSeconds = 900
|
||||
}
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||
}
|
||||
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreWorkloads == nil {
|
||||
c.Startup.IgnoreWorkloads = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreUnavailableNodes == nil {
|
||||
c.Startup.IgnoreUnavailableNodes = []string{}
|
||||
}
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
c.Startup.StuckPodGraceSeconds = 180
|
||||
}
|
||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
|
||||
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
||||
}
|
||||
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
|
||||
c.Startup.VaultUnsealBreakglassTimeout = 15
|
||||
@ -496,7 +737,7 @@ func (c *Config) applyDefaults() {
|
||||
c.UPS.TelemetryTimeoutSeconds = 90
|
||||
}
|
||||
if c.Coordination.ForwardShutdownConfig == "" {
|
||||
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
|
||||
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
||||
}
|
||||
if c.Coordination.PeerHosts == nil {
|
||||
c.Coordination.PeerHosts = []string{}
|
||||
@ -517,15 +758,15 @@ func (c *Config) applyDefaults() {
|
||||
c.Metrics.Path = "/metrics"
|
||||
}
|
||||
if c.State.Dir == "" {
|
||||
c.State.Dir = "/var/lib/hecate"
|
||||
c.State.Dir = "/var/lib/ananke"
|
||||
}
|
||||
if c.State.RunHistoryPath == "" {
|
||||
c.State.RunHistoryPath = "/var/lib/hecate/runs.json"
|
||||
c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
|
||||
}
|
||||
if c.State.LockPath == "" {
|
||||
c.State.LockPath = "/var/lib/hecate/hecate.lock"
|
||||
c.State.LockPath = "/var/lib/ananke/ananke.lock"
|
||||
}
|
||||
if c.State.IntentPath == "" {
|
||||
c.State.IntentPath = "/var/lib/hecate/intent.json"
|
||||
c.State.IntentPath = "/var/lib/ananke/intent.json"
|
||||
}
|
||||
}
|
||||
|
||||
@ -9,7 +9,7 @@ import (
|
||||
|
||||
func TestLoadAcceptsUPSTargets(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
cfgPath := filepath.Join(tmp, "hecate.yaml")
|
||||
cfgPath := filepath.Join(tmp, "ananke.yaml")
|
||||
raw := `
|
||||
control_planes: [titan-0a, titan-0b, titan-0c]
|
||||
expected_flux_branch: main
|
||||
@ -24,7 +24,7 @@ shutdown:
|
||||
default_budget_seconds: 300
|
||||
state:
|
||||
run_history_path: /tmp/runs.json
|
||||
lock_path: /tmp/hecate.lock
|
||||
lock_path: /tmp/ananke.lock
|
||||
`
|
||||
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
||||
t.Fatalf("write config: %v", err)
|
||||
@ -74,7 +74,7 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
||||
|
||||
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
cfgPath := filepath.Join(tmp, "hecate.yaml")
|
||||
cfgPath := filepath.Join(tmp, "ananke.yaml")
|
||||
raw := `
|
||||
control_planes: [titan-0a, titan-0b, titan-0c]
|
||||
expected_flux_branch: main
|
||||
@ -85,7 +85,7 @@ ups:
|
||||
enabled: false
|
||||
state:
|
||||
run_history_path: /tmp/runs.json
|
||||
lock_path: /tmp/hecate.lock
|
||||
lock_path: /tmp/ananke.lock
|
||||
`
|
||||
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
||||
t.Fatalf("write config: %v", err)
|
||||
@ -146,3 +146,55 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
|
||||
t.Fatalf("expected validation error when post start probes are required but empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.RequireServiceChecklist = true
|
||||
cfg.Startup.ServiceChecklist = nil
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error when service checklist is required but empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
|
||||
{
|
||||
Name: "grafana",
|
||||
URL: "not-a-url",
|
||||
AcceptedStatuses: []int{200},
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for invalid service checklist url")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for invalid ignore_flux_kustomizations entry")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for invalid ignore_workloads entry")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"": "true",
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for invalid required_node_labels entry")
|
||||
}
|
||||
}
|
||||
|
||||
@ -84,41 +84,41 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
|
||||
|
||||
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
var b strings.Builder
|
||||
b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
|
||||
b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n")
|
||||
b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
|
||||
b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
|
||||
b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n")
|
||||
b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers))
|
||||
b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
|
||||
b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n")
|
||||
b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
|
||||
b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n")
|
||||
b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
|
||||
b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
|
||||
b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n")
|
||||
b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers))
|
||||
b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
|
||||
b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n")
|
||||
if e.lastShutdownAt.IsZero() {
|
||||
b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n")
|
||||
b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n")
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
|
||||
b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
|
||||
}
|
||||
b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n")
|
||||
b.WriteString("# TYPE hecate_ups_on_battery gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n")
|
||||
b.WriteString("# TYPE hecate_ups_low_battery gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
|
||||
b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
|
||||
b.WriteString("# TYPE hecate_ups_battery_charge_percent gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_load_percent UPS output load percentage.\n")
|
||||
b.WriteString("# TYPE hecate_ups_load_percent gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_power_nominal_watts UPS nominal power rating in watts.\n")
|
||||
b.WriteString("# TYPE hecate_ups_power_nominal_watts gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
|
||||
b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
|
||||
b.WriteString("# TYPE hecate_ups_trigger_active gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n")
|
||||
b.WriteString("# TYPE hecate_ups_breach_count gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
|
||||
b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n")
|
||||
b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n")
|
||||
b.WriteString("# TYPE hecate_ups_error gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n")
|
||||
b.WriteString("# TYPE ananke_ups_on_battery gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n")
|
||||
b.WriteString("# TYPE ananke_ups_low_battery gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
|
||||
b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
|
||||
b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n")
|
||||
b.WriteString("# TYPE ananke_ups_load_percent gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n")
|
||||
b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
|
||||
b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
|
||||
b.WriteString("# TYPE ananke_ups_trigger_active gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n")
|
||||
b.WriteString("# TYPE ananke_ups_breach_count gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
|
||||
b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n")
|
||||
b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n")
|
||||
b.WriteString("# TYPE ananke_ups_error gauge\n")
|
||||
|
||||
names := make([]string, 0, len(e.samples))
|
||||
for name := range e.samples {
|
||||
@ -129,21 +129,21 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
|
||||
s := e.samples[name]
|
||||
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
|
||||
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, s.BreachCount))
|
||||
if s.UpdatedAt.IsZero() {
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
|
||||
b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
|
||||
}
|
||||
|
||||
_, _ = w.Write([]byte(b.String()))
|
||||
|
||||
@ -33,14 +33,14 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
|
||||
body := rr.Body.String()
|
||||
|
||||
mustContain := []string{
|
||||
"hecate_shutdown_budget_seconds 321",
|
||||
"hecate_shutdown_triggers_total 1",
|
||||
"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"ananke_shutdown_budget_seconds 321",
|
||||
"ananke_shutdown_triggers_total 1",
|
||||
"ananke_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"ananke_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"ananke_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"ananke_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"ananke_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"ananke_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
}
|
||||
for _, m := range mustContain {
|
||||
if !strings.Contains(body, m) {
|
||||
|
||||
@ -12,12 +12,12 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||
"scm.bstein.dev/bstein/hecate/internal/metrics"
|
||||
"scm.bstein.dev/bstein/hecate/internal/sshutil"
|
||||
"scm.bstein.dev/bstein/hecate/internal/state"
|
||||
"scm.bstein.dev/bstein/hecate/internal/ups"
|
||||
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
"scm.bstein.dev/bstein/ananke/internal/metrics"
|
||||
"scm.bstein.dev/bstein/ananke/internal/sshutil"
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
"scm.bstein.dev/bstein/ananke/internal/ups"
|
||||
)
|
||||
|
||||
type Target struct {
|
||||
@ -81,7 +81,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
lastGood[t.Name] = time.Now()
|
||||
}
|
||||
|
||||
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
|
||||
d.log.Printf("ananke daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
|
||||
poll, debounce, telemetryTimeout, d.targetList())
|
||||
|
||||
for {
|
||||
@ -198,7 +198,7 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
|
||||
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/ananke shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
|
||||
if d.cfg.Shutdown.EmergencySkipEtcd {
|
||||
remoteCmd += " --skip-etcd-snapshot"
|
||||
}
|
||||
|
||||
@ -6,7 +6,7 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// ParseIntentOutput parses `hecate intent` CLI output from local/remote commands.
|
||||
// ParseIntentOutput parses `ananke intent` CLI output from local/remote commands.
|
||||
func ParseIntentOutput(raw string) (Intent, error) {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
|
||||
@ -61,7 +61,7 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
|
||||
raw := `[hecate] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
|
||||
raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
|
||||
in, err := ParseIntentOutput(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse intent output: %v", err)
|
||||
@ -81,7 +81,7 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestParseIntentOutputHandlesNone(t *testing.T) {
|
||||
in, err := ParseIntentOutput(`[hecate] 2026/04/05 11:24:49 intent=none`)
|
||||
in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`)
|
||||
if err != nil {
|
||||
t.Fatalf("parse none intent output: %v", err)
|
||||
}
|
||||
|
||||
@ -11,7 +11,7 @@ import (
|
||||
)
|
||||
|
||||
func TestAcquireLockLifecycle(t *testing.T) {
|
||||
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
|
||||
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
|
||||
unlock, err := AcquireLock(lockPath)
|
||||
if err != nil {
|
||||
t.Fatalf("acquire lock: %v", err)
|
||||
@ -26,7 +26,7 @@ func TestAcquireLockLifecycle(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestAcquireLockReclaimsStaleLock(t *testing.T) {
|
||||
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
|
||||
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
|
||||
if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
|
||||
t.Fatalf("write stale lock: %v", err)
|
||||
}
|
||||
@ -47,7 +47,7 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestAcquireLockRejectsActiveLock(t *testing.T) {
|
||||
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
|
||||
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
|
||||
active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
|
||||
if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil {
|
||||
t.Fatalf("write active lock: %v", err)
|
||||
|
||||
@ -3,7 +3,7 @@ package ups
|
||||
import "testing"
|
||||
|
||||
func TestParseNUT(t *testing.T) {
|
||||
raw := `battery.runtime: 384
|
||||
raw := `battery.runtime: 384
|
||||
battery.charge: 72
|
||||
ups.load: 19
|
||||
ups.realpower.nominal: 510
|
||||
|
||||
@ -2,23 +2,23 @@
|
||||
set -Eeuo pipefail
|
||||
|
||||
KUBECTL="${KUBECTL:-kubectl}"
|
||||
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
|
||||
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
|
||||
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
|
||||
HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}"
|
||||
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
|
||||
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
|
||||
SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
|
||||
SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}"
|
||||
STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
|
||||
STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}"
|
||||
ANANKE_COORDINATOR_HOST="${ANANKE_COORDINATOR_HOST:-titan-db}"
|
||||
ANANKE_BIN="${ANANKE_BIN:-/usr/local/bin/ananke}"
|
||||
ANANKE_CONFIG="${ANANKE_CONFIG:-/etc/ananke/ananke.yaml}"
|
||||
ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
|
||||
LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
|
||||
STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
|
||||
SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
|
||||
SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}"
|
||||
STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
|
||||
STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
|
||||
EXECUTE=0
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage:
|
||||
scripts/hecate-drills.sh list
|
||||
scripts/hecate-drills.sh run <drill-name> [--execute]
|
||||
scripts/ananke-drills.sh list
|
||||
scripts/ananke-drills.sh run <drill-name> [--execute]
|
||||
|
||||
Drills:
|
||||
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
|
||||
@ -30,7 +30,7 @@ Drills:
|
||||
Notes:
|
||||
- Drills are intentionally disruptive and are not part of regular `make test`.
|
||||
- Use --execute to run live changes. Without it, this script prints planned actions only.
|
||||
- Optional relay: set HECATE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
|
||||
- Optional relay: set ANANKE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
|
||||
EOF
|
||||
}
|
||||
|
||||
@ -98,47 +98,47 @@ wait_ready_keycloak() {
|
||||
die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
|
||||
}
|
||||
|
||||
run_hecate_startup() {
|
||||
run_ananke_startup() {
|
||||
local reason="$1"
|
||||
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
|
||||
local cmd=(sudo "${ANANKE_BIN}" startup --config "${ANANKE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
|
||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
|
||||
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
|
||||
else
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
||||
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||
# shellcheck disable=SC2086
|
||||
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
|
||||
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
|
||||
else
|
||||
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
|
||||
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
|
||||
fi
|
||||
}
|
||||
|
||||
run_hecate_shutdown() {
|
||||
run_ananke_shutdown() {
|
||||
local reason="$1"
|
||||
local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
|
||||
local cmd=(sudo "${ANANKE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
|
||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
|
||||
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
|
||||
else
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
||||
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||
# shellcheck disable=SC2086
|
||||
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
|
||||
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
|
||||
else
|
||||
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
|
||||
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
|
||||
fi
|
||||
}
|
||||
|
||||
run_hecate_startup_with_retry() {
|
||||
run_ananke_startup_with_retry() {
|
||||
local reason="$1"
|
||||
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
|
||||
local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
|
||||
|
||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||
log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
|
||||
@ -161,11 +161,11 @@ run_hecate_startup_with_retry() {
|
||||
|
||||
run_coordinator_bash() {
|
||||
local script="$1"
|
||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
||||
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||
# shellcheck disable=SC2086
|
||||
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "bash -se"
|
||||
printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "bash -se"
|
||||
else
|
||||
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" "bash -se"
|
||||
printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" "bash -se"
|
||||
fi
|
||||
}
|
||||
|
||||
@ -283,7 +283,7 @@ write_log_header() {
|
||||
mkdir -p "${LOG_DIR}"
|
||||
local f="${LOG_DIR}/${drill}-$(now_ts).log"
|
||||
exec > >(tee -a "${f}") 2>&1
|
||||
log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
|
||||
log "drill=${drill} execute=${EXECUTE} coordinator=${ANANKE_COORDINATOR_HOST}"
|
||||
}
|
||||
|
||||
run_drill_flux_gitea_deadlock() {
|
||||
@ -303,7 +303,7 @@ run_drill_flux_gitea_deadlock() {
|
||||
scale_to "$ns" "$kind" "$name" 0
|
||||
done
|
||||
|
||||
run_hecate_startup "drill-flux-gitea-deadlock"
|
||||
run_ananke_startup "drill-flux-gitea-deadlock"
|
||||
|
||||
log "verifying recovery"
|
||||
wait_ready flux-system deployment source-controller 240s
|
||||
@ -330,7 +330,7 @@ run_drill_foundation_recovery() {
|
||||
scale_to "$ns" "$kind" "$name" 0
|
||||
done
|
||||
|
||||
run_hecate_startup "drill-foundation-recovery"
|
||||
run_ananke_startup "drill-foundation-recovery"
|
||||
|
||||
log "verifying layered recovery"
|
||||
wait_ready vault statefulset vault 420s
|
||||
@ -350,7 +350,7 @@ run_drill_reconciliation_resume() {
|
||||
set_flux_suspend_all true
|
||||
scale_to flux-system deployment source-controller 0
|
||||
|
||||
run_hecate_startup "drill-reconciliation-resume"
|
||||
run_ananke_startup "drill-reconciliation-resume"
|
||||
|
||||
log "verifying reconciliation resumed"
|
||||
wait_ready flux-system deployment source-controller 240s
|
||||
@ -361,8 +361,8 @@ run_drill_reconciliation_resume() {
|
||||
}
|
||||
|
||||
run_drill_startup_intent_guard() {
|
||||
local intent_path="/var/lib/hecate/intent.json"
|
||||
local backup_path="/tmp/hecate-intent-pre-drill.json"
|
||||
local intent_path="/var/lib/ananke/intent.json"
|
||||
local backup_path="/tmp/ananke-intent-pre-drill.json"
|
||||
local inject_cmd="
|
||||
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
|
||||
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
|
||||
@ -376,12 +376,12 @@ else
|
||||
sudo rm -f '${intent_path}'
|
||||
fi
|
||||
"
|
||||
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
|
||||
local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
|
||||
|
||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'"
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
|
||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'"
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<inject shutdown intent>'"
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
|
||||
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<restore prior intent>'"
|
||||
log "pass: startup-intent-guard (plan mode)"
|
||||
return 0
|
||||
fi
|
||||
@ -406,10 +406,10 @@ run_drill_controlled_cycle() {
|
||||
fi
|
||||
|
||||
log "running controlled shutdown cycle (poweroff disabled config)"
|
||||
run_hecate_shutdown "drill-controlled-cycle-shutdown"
|
||||
run_ananke_shutdown "drill-controlled-cycle-shutdown"
|
||||
|
||||
log "running startup recovery cycle"
|
||||
run_hecate_startup_with_retry "drill-controlled-cycle-startup"
|
||||
run_ananke_startup_with_retry "drill-controlled-cycle-startup"
|
||||
|
||||
log "verifying critical stack readiness after cycle"
|
||||
wait_ready flux-system deployment source-controller 240s
|
||||
@ -2,13 +2,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
if [[ "${EUID}" -ne 0 ]]; then
|
||||
echo "hecate-self-update.sh must run as root" >&2
|
||||
echo "ananke-self-update.sh must run as root" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}"
|
||||
BRANCH="${HECATE_REPO_BRANCH:-main}"
|
||||
REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}"
|
||||
REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}"
|
||||
BRANCH="${ANANKE_REPO_BRANCH:-main}"
|
||||
REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}"
|
||||
|
||||
mkdir -p "$(dirname "${REPO_DIR}")"
|
||||
if [[ ! -d "${REPO_DIR}/.git" ]]; then
|
||||
Loading…
x
Reference in New Issue
Block a user