rename runtime surfaces from hecate to ananke
This commit is contained in:
parent
169324ef4a
commit
c605a083ee
6
Makefile
6
Makefile
@ -1,7 +1,7 @@
|
|||||||
.PHONY: build test fmt tidy install drill-list drill-run
|
.PHONY: build test fmt tidy install drill-list drill-run
|
||||||
|
|
||||||
build:
|
build:
|
||||||
go build -o dist/hecate ./cmd/hecate
|
go build -o dist/ananke ./cmd/ananke
|
||||||
|
|
||||||
test:
|
test:
|
||||||
go test ./...
|
go test ./...
|
||||||
@ -16,7 +16,7 @@ install:
|
|||||||
sudo ./scripts/install.sh
|
sudo ./scripts/install.sh
|
||||||
|
|
||||||
drill-list:
|
drill-list:
|
||||||
./scripts/hecate-drills.sh list
|
./scripts/ananke-drills.sh list
|
||||||
|
|
||||||
drill-run:
|
drill-run:
|
||||||
./scripts/hecate-drills.sh run $(DRILL) --execute
|
./scripts/ananke-drills.sh run $(DRILL) --execute
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import (
|
|||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
@ -14,17 +15,17 @@ import (
|
|||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/execx"
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/service"
|
"scm.bstein.dev/bstein/ananke/internal/service"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/sshutil"
|
"scm.bstein.dev/bstein/ananke/internal/sshutil"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/ups"
|
"scm.bstein.dev/bstein/ananke/internal/ups"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
logger := log.New(os.Stdout, "[hecate] ", log.LstdFlags)
|
logger := log.New(os.Stdout, "[ananke] ", log.LstdFlags)
|
||||||
if len(os.Args) < 2 {
|
if len(os.Args) < 2 {
|
||||||
usage()
|
usage()
|
||||||
os.Exit(2)
|
os.Exit(2)
|
||||||
@ -73,7 +74,7 @@ func main() {
|
|||||||
|
|
||||||
func runStartup(logger *log.Logger, args []string) error {
|
func runStartup(logger *log.Logger, args []string) error {
|
||||||
fs := flag.NewFlagSet("startup", flag.ExitOnError)
|
fs := flag.NewFlagSet("startup", flag.ExitOnError)
|
||||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||||
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
||||||
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
||||||
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
||||||
@ -124,7 +125,7 @@ func runStartup(logger *log.Logger, args []string) error {
|
|||||||
}
|
}
|
||||||
checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := ensureStartupPowerSafe(checkCtx, targets); err != nil {
|
if err := ensureStartupPowerSafe(checkCtx, targets, cfg.Startup.MinimumBatteryPercent); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -141,10 +142,11 @@ func runStartup(logger *log.Logger, args []string) error {
|
|||||||
|
|
||||||
func runShutdown(logger *log.Logger, args []string) error {
|
func runShutdown(logger *log.Logger, args []string) error {
|
||||||
fs := flag.NewFlagSet("shutdown", flag.ExitOnError)
|
fs := flag.NewFlagSet("shutdown", flag.ExitOnError)
|
||||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||||
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
||||||
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
|
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
|
||||||
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
|
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
|
||||||
|
mode := fs.String("mode", "config", "Shutdown mode: config|cluster-only|poweroff")
|
||||||
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
|
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
|
||||||
_ = fs.Parse(args)
|
_ = fs.Parse(args)
|
||||||
|
|
||||||
@ -158,13 +160,14 @@ func runShutdown(logger *log.Logger, args []string) error {
|
|||||||
return orch.Shutdown(ctx, cluster.ShutdownOptions{
|
return orch.Shutdown(ctx, cluster.ShutdownOptions{
|
||||||
SkipEtcdSnapshot: *skipEtcd,
|
SkipEtcdSnapshot: *skipEtcd,
|
||||||
SkipDrain: *skipDrain,
|
SkipDrain: *skipDrain,
|
||||||
|
Mode: *mode,
|
||||||
Reason: *reason,
|
Reason: *reason,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func runDaemon(logger *log.Logger, args []string) error {
|
func runDaemon(logger *log.Logger, args []string) error {
|
||||||
fs := flag.NewFlagSet("daemon", flag.ExitOnError)
|
fs := flag.NewFlagSet("daemon", flag.ExitOnError)
|
||||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||||
dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing")
|
dryRunActions := fs.Bool("dry-run-actions", false, "Log planned actions without executing")
|
||||||
_ = fs.Parse(args)
|
_ = fs.Parse(args)
|
||||||
|
|
||||||
@ -191,7 +194,7 @@ func runDaemon(logger *log.Logger, args []string) error {
|
|||||||
|
|
||||||
func runEtcdRestore(logger *log.Logger, args []string) error {
|
func runEtcdRestore(logger *log.Logger, args []string) error {
|
||||||
fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
|
fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
|
||||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||||
execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
|
execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
|
||||||
controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
|
controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
|
||||||
snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
|
snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
|
||||||
@ -211,7 +214,7 @@ func runEtcdRestore(logger *log.Logger, args []string) error {
|
|||||||
|
|
||||||
func runStatus(logger *log.Logger, args []string) error {
|
func runStatus(logger *log.Logger, args []string) error {
|
||||||
fs := flag.NewFlagSet("status", flag.ExitOnError)
|
fs := flag.NewFlagSet("status", flag.ExitOnError)
|
||||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||||
_ = fs.Parse(args)
|
_ = fs.Parse(args)
|
||||||
|
|
||||||
cfg, orch, err := buildOrchestrator(logger, *configPath, true)
|
cfg, orch, err := buildOrchestrator(logger, *configPath, true)
|
||||||
@ -246,7 +249,7 @@ func runStatus(logger *log.Logger, args []string) error {
|
|||||||
|
|
||||||
func runIntent(logger *log.Logger, args []string) error {
|
func runIntent(logger *log.Logger, args []string) error {
|
||||||
fs := flag.NewFlagSet("intent", flag.ExitOnError)
|
fs := flag.NewFlagSet("intent", flag.ExitOnError)
|
||||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
configPath := fs.String("config", "/etc/ananke/ananke.yaml", "Path to config file")
|
||||||
setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)")
|
setState := fs.String("set", "", "Set intent state (normal|startup_in_progress|shutting_down|shutdown_complete)")
|
||||||
reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)")
|
reason := fs.String("reason", "manual-intent", "Intent reason (used with --set)")
|
||||||
source := fs.String("source", "manual", "Intent source (used with --set)")
|
source := fs.String("source", "manual", "Intent source (used with --set)")
|
||||||
@ -314,7 +317,7 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
|
|||||||
return targets, nil
|
return targets, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
|
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target, minimumBatteryPercent float64) error {
|
||||||
type targetState struct {
|
type targetState struct {
|
||||||
seenGood bool
|
seenGood bool
|
||||||
lastErr error
|
lastErr error
|
||||||
@ -327,6 +330,7 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
|||||||
const pollInterval = 3 * time.Second
|
const pollInterval = 3 * time.Second
|
||||||
for {
|
for {
|
||||||
onBatteryTargets := []string{}
|
onBatteryTargets := []string{}
|
||||||
|
lowChargeTargets := []string{}
|
||||||
allSeen := true
|
allSeen := true
|
||||||
for _, t := range targets {
|
for _, t := range targets {
|
||||||
key := t.Name + "|" + t.Target
|
key := t.Name + "|" + t.Target
|
||||||
@ -344,10 +348,25 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
|||||||
if sample.OnBattery {
|
if sample.OnBattery {
|
||||||
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
|
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
|
||||||
}
|
}
|
||||||
|
if minimumBatteryPercent > 0 && sample.BatteryCharge > 0 && sample.BatteryCharge < minimumBatteryPercent {
|
||||||
|
lowChargeTargets = append(
|
||||||
|
lowChargeTargets,
|
||||||
|
fmt.Sprintf(
|
||||||
|
"%s(charge=%.1f%%<%.1f%% status=%s)",
|
||||||
|
t.Name,
|
||||||
|
sample.BatteryCharge,
|
||||||
|
minimumBatteryPercent,
|
||||||
|
sample.RawStatus,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if len(onBatteryTargets) > 0 {
|
if len(onBatteryTargets) > 0 {
|
||||||
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
|
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
|
||||||
}
|
}
|
||||||
|
if len(lowChargeTargets) > 0 {
|
||||||
|
return fmt.Errorf("startup blocked: UPS battery charge below minimum for %s", strings.Join(lowChargeTargets, ", "))
|
||||||
|
}
|
||||||
if allSeen {
|
if allSeen {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -366,7 +385,8 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
|||||||
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
|
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | "))
|
roundedMin := math.Round(minimumBatteryPercent*10) / 10
|
||||||
|
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout (minimum_battery_percent=%.1f): %s", roundedMin, strings.Join(unverified, " | "))
|
||||||
case <-time.After(pollInterval):
|
case <-time.After(pollInterval):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -391,26 +411,26 @@ func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.
|
|||||||
}
|
}
|
||||||
|
|
||||||
func usage() {
|
func usage() {
|
||||||
fmt.Print(`hecate: staged startup/shutdown + UPS-triggered protection
|
fmt.Print(`ananke: staged startup/shutdown + UPS-triggered protection
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
hecate <command> [flags]
|
ananke <command> [flags]
|
||||||
|
|
||||||
Commands:
|
Commands:
|
||||||
startup Perform staged cluster startup
|
startup Perform staged cluster startup
|
||||||
shutdown Perform graceful cluster shutdown
|
shutdown Perform graceful cluster shutdown
|
||||||
etcd-restore Restore etcd from snapshot on a control plane
|
etcd-restore Restore etcd from snapshot on a control plane
|
||||||
daemon Monitor UPS and auto-trigger shutdown
|
daemon Monitor UPS and auto-trigger shutdown
|
||||||
status Print current hecate status and estimates
|
status Print current ananke status and estimates
|
||||||
intent Read or manually set intent state
|
intent Read or manually set intent state
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
|
ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main
|
||||||
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
|
ananke shutdown --config /etc/ananke/ananke.yaml --execute --reason "manual-maintenance"
|
||||||
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute
|
ananke etcd-restore --config /etc/ananke/ananke.yaml --execute
|
||||||
hecate daemon --config /etc/hecate/hecate.yaml
|
ananke daemon --config /etc/ananke/ananke.yaml
|
||||||
hecate status --config /etc/hecate/hecate.yaml
|
ananke status --config /etc/ananke/ananke.yaml
|
||||||
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute
|
ananke intent --config /etc/ananke/ananke.yaml --set normal --reason "manual-clear" --execute
|
||||||
`)
|
`)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -439,7 +459,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
|
|||||||
|
|
||||||
args := buildSSHBaseArgs(cfg)
|
args := buildSSHBaseArgs(cfg)
|
||||||
|
|
||||||
remote := "sudo -n systemctl start hecate-bootstrap.service"
|
remote := "sudo -n systemctl start ananke-bootstrap.service"
|
||||||
attempt := 1
|
attempt := 1
|
||||||
for {
|
for {
|
||||||
cmdArgs := append(append([]string{}, args...), target, remote)
|
cmdArgs := append(append([]string{}, args...), target, remote)
|
||||||
@ -480,7 +500,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
|
|||||||
if user != "" {
|
if user != "" {
|
||||||
target = user + "@" + host
|
target = user + "@" + host
|
||||||
}
|
}
|
||||||
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/hecate intent --config /etc/hecate/hecate.yaml"
|
remoteCmd := "if sudo -n /usr/bin/systemctl is-active --quiet ananke-bootstrap.service; then echo __ANANKE_BOOTSTRAP_ACTIVE__; else echo __ANANKE_BOOTSTRAP_IDLE__; fi; sudo -n /usr/local/bin/ananke intent --config /etc/ananke/ananke.yaml"
|
||||||
args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
|
args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
|
||||||
out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost})
|
out, err := runSSHWithRecovery(ctx, logger, cfg, args, []string{coordinator, host, cfg.SSHJumpHost})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -488,7 +508,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
|
|||||||
return true, "coordinator unreachable", nil
|
return true, "coordinator unreachable", nil
|
||||||
}
|
}
|
||||||
trimmed := strings.TrimSpace(out)
|
trimmed := strings.TrimSpace(out)
|
||||||
if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
|
if strings.Contains(trimmed, "__ANANKE_BOOTSTRAP_ACTIVE__") {
|
||||||
return false, "coordinator bootstrap service is active", nil
|
return false, "coordinator bootstrap service is active", nil
|
||||||
}
|
}
|
||||||
remoteIntent, parseErr := state.ParseIntentOutput(trimmed)
|
remoteIntent, parseErr := state.ParseIntentOutput(trimmed)
|
||||||
@ -1,5 +1,5 @@
|
|||||||
# /etc/hecate/hecate.yaml
|
# /etc/ananke/ananke.yaml
|
||||||
kubeconfig: /etc/hecate/kubeconfig
|
kubeconfig: /etc/ananke/kubeconfig
|
||||||
ssh_user: atlas
|
ssh_user: atlas
|
||||||
ssh_port: 2277
|
ssh_port: 2277
|
||||||
ssh_config_file: ""
|
ssh_config_file: ""
|
||||||
@ -11,6 +11,7 @@ ssh_jump_host: ""
|
|||||||
ssh_jump_user: ""
|
ssh_jump_user: ""
|
||||||
iac_repo_path: /opt/titan-iac
|
iac_repo_path: /opt/titan-iac
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
|
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||||
control_planes:
|
control_planes:
|
||||||
- titan-0a
|
- titan-0a
|
||||||
- titan-0b
|
- titan-0b
|
||||||
@ -46,6 +47,10 @@ startup:
|
|||||||
api_wait_seconds: 1200
|
api_wait_seconds: 1200
|
||||||
api_poll_seconds: 2
|
api_poll_seconds: 2
|
||||||
shutdown_cooldown_seconds: 45
|
shutdown_cooldown_seconds: 45
|
||||||
|
minimum_battery_percent: 20
|
||||||
|
required_node_labels:
|
||||||
|
titan-09:
|
||||||
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
time_sync_poll_seconds: 5
|
time_sync_poll_seconds: 5
|
||||||
@ -67,9 +72,36 @@ startup:
|
|||||||
post_start_probe_wait_seconds: 240
|
post_start_probe_wait_seconds: 240
|
||||||
post_start_probe_poll_seconds: 5
|
post_start_probe_poll_seconds: 5
|
||||||
post_start_probes:
|
post_start_probes:
|
||||||
- https://scm.bstein.dev/user/login
|
- https://scm.bstein.dev/api/healthz
|
||||||
- https://metrics.bstein.dev/login
|
- https://metrics.bstein.dev/api/health
|
||||||
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
require_service_checklist: true
|
||||||
|
service_checklist_wait_seconds: 420
|
||||||
|
service_checklist_poll_seconds: 5
|
||||||
|
service_checklist_stability_seconds: 120
|
||||||
|
service_checklist:
|
||||||
|
- name: gitea-api
|
||||||
|
url: https://scm.bstein.dev/api/healthz
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: pass
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: grafana-api
|
||||||
|
url: https://metrics.bstein.dev/api/health
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: '"database":"ok"'
|
||||||
|
timeout_seconds: 12
|
||||||
|
require_flux_health: true
|
||||||
|
flux_health_wait_seconds: 900
|
||||||
|
flux_health_poll_seconds: 5
|
||||||
|
ignore_flux_kustomizations: []
|
||||||
|
require_workload_convergence: true
|
||||||
|
workload_convergence_wait_seconds: 900
|
||||||
|
workload_convergence_poll_seconds: 5
|
||||||
|
ignore_workload_namespaces: []
|
||||||
|
ignore_workloads: []
|
||||||
|
ignore_unavailable_nodes: []
|
||||||
|
auto_recycle_stuck_pods: true
|
||||||
|
stuck_pod_grace_seconds: 180
|
||||||
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: ""
|
vault_unseal_breakglass_command: ""
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
shutdown:
|
shutdown:
|
||||||
@ -103,7 +135,7 @@ ups:
|
|||||||
coordination:
|
coordination:
|
||||||
forward_shutdown_host: ""
|
forward_shutdown_host: ""
|
||||||
forward_shutdown_user: atlas
|
forward_shutdown_user: atlas
|
||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/ananke/ananke.yaml
|
||||||
peer_hosts: []
|
peer_hosts: []
|
||||||
fallback_local_shutdown: true
|
fallback_local_shutdown: true
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
@ -115,7 +147,7 @@ metrics:
|
|||||||
bind_addr: 0.0.0.0:9560
|
bind_addr: 0.0.0.0:9560
|
||||||
path: /metrics
|
path: /metrics
|
||||||
state:
|
state:
|
||||||
dir: /var/lib/hecate
|
dir: /var/lib/ananke
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/ananke/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/ananke/ananke.lock
|
||||||
intent_path: /var/lib/hecate/intent.json
|
intent_path: /var/lib/ananke/intent.json
|
||||||
@ -1,5 +1,5 @@
|
|||||||
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
|
# /etc/ananke/ananke.yaml for titan-24 (tethys forwarder)
|
||||||
kubeconfig: /etc/hecate/kubeconfig
|
kubeconfig: /etc/ananke/kubeconfig
|
||||||
ssh_user: atlas
|
ssh_user: atlas
|
||||||
ssh_port: 2277
|
ssh_port: 2277
|
||||||
ssh_config_file: /home/tethys/.ssh/config
|
ssh_config_file: /home/tethys/.ssh/config
|
||||||
@ -58,6 +58,7 @@ ssh_jump_host: ""
|
|||||||
ssh_jump_user: ""
|
ssh_jump_user: ""
|
||||||
iac_repo_path: /opt/titan-iac
|
iac_repo_path: /opt/titan-iac
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
|
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||||
control_planes:
|
control_planes:
|
||||||
- titan-0a
|
- titan-0a
|
||||||
- titan-0b
|
- titan-0b
|
||||||
@ -112,6 +113,10 @@ startup:
|
|||||||
api_wait_seconds: 1200
|
api_wait_seconds: 1200
|
||||||
api_poll_seconds: 2
|
api_poll_seconds: 2
|
||||||
shutdown_cooldown_seconds: 45
|
shutdown_cooldown_seconds: 45
|
||||||
|
minimum_battery_percent: 20
|
||||||
|
required_node_labels:
|
||||||
|
titan-09:
|
||||||
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
time_sync_poll_seconds: 5
|
time_sync_poll_seconds: 5
|
||||||
@ -133,10 +138,37 @@ startup:
|
|||||||
post_start_probe_wait_seconds: 240
|
post_start_probe_wait_seconds: 240
|
||||||
post_start_probe_poll_seconds: 5
|
post_start_probe_poll_seconds: 5
|
||||||
post_start_probes:
|
post_start_probes:
|
||||||
- https://scm.bstein.dev/user/login
|
- https://scm.bstein.dev/api/healthz
|
||||||
- https://metrics.bstein.dev/login
|
- https://metrics.bstein.dev/api/health
|
||||||
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
require_service_checklist: true
|
||||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
|
service_checklist_wait_seconds: 420
|
||||||
|
service_checklist_poll_seconds: 5
|
||||||
|
service_checklist_stability_seconds: 120
|
||||||
|
service_checklist:
|
||||||
|
- name: gitea-api
|
||||||
|
url: https://scm.bstein.dev/api/healthz
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: pass
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: grafana-api
|
||||||
|
url: https://metrics.bstein.dev/api/health
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: '"database":"ok"'
|
||||||
|
timeout_seconds: 12
|
||||||
|
require_flux_health: true
|
||||||
|
flux_health_wait_seconds: 900
|
||||||
|
flux_health_poll_seconds: 5
|
||||||
|
ignore_flux_kustomizations: []
|
||||||
|
require_workload_convergence: true
|
||||||
|
workload_convergence_wait_seconds: 900
|
||||||
|
workload_convergence_poll_seconds: 5
|
||||||
|
ignore_workload_namespaces: []
|
||||||
|
ignore_workloads: []
|
||||||
|
ignore_unavailable_nodes: []
|
||||||
|
auto_recycle_stuck_pods: true
|
||||||
|
stuck_pod_grace_seconds: 180
|
||||||
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
|
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 1380
|
default_budget_seconds: 1380
|
||||||
@ -167,7 +199,7 @@ ups:
|
|||||||
coordination:
|
coordination:
|
||||||
forward_shutdown_host: titan-db
|
forward_shutdown_host: titan-db
|
||||||
forward_shutdown_user: atlas
|
forward_shutdown_user: atlas
|
||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/ananke/ananke.yaml
|
||||||
peer_hosts:
|
peer_hosts:
|
||||||
- titan-db
|
- titan-db
|
||||||
fallback_local_shutdown: false
|
fallback_local_shutdown: false
|
||||||
@ -180,7 +212,7 @@ metrics:
|
|||||||
bind_addr: 0.0.0.0:9560
|
bind_addr: 0.0.0.0:9560
|
||||||
path: /metrics
|
path: /metrics
|
||||||
state:
|
state:
|
||||||
dir: /var/lib/hecate
|
dir: /var/lib/ananke
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/ananke/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/ananke/ananke.lock
|
||||||
intent_path: /var/lib/hecate/intent.json
|
intent_path: /var/lib/ananke/intent.json
|
||||||
@ -1,5 +1,5 @@
|
|||||||
# /etc/hecate/hecate.yaml for titan-db (coordinator)
|
# /etc/ananke/ananke.yaml for titan-db (coordinator)
|
||||||
kubeconfig: /etc/hecate/kubeconfig
|
kubeconfig: /etc/ananke/kubeconfig
|
||||||
ssh_user: atlas
|
ssh_user: atlas
|
||||||
ssh_port: 2277
|
ssh_port: 2277
|
||||||
ssh_config_file: /home/atlas/.ssh/config
|
ssh_config_file: /home/atlas/.ssh/config
|
||||||
@ -58,6 +58,7 @@ ssh_jump_host: ""
|
|||||||
ssh_jump_user: ""
|
ssh_jump_user: ""
|
||||||
iac_repo_path: /opt/titan-iac
|
iac_repo_path: /opt/titan-iac
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
|
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||||
control_planes:
|
control_planes:
|
||||||
- titan-0a
|
- titan-0a
|
||||||
- titan-0b
|
- titan-0b
|
||||||
@ -112,6 +113,10 @@ startup:
|
|||||||
api_wait_seconds: 1200
|
api_wait_seconds: 1200
|
||||||
api_poll_seconds: 2
|
api_poll_seconds: 2
|
||||||
shutdown_cooldown_seconds: 45
|
shutdown_cooldown_seconds: 45
|
||||||
|
minimum_battery_percent: 20
|
||||||
|
required_node_labels:
|
||||||
|
titan-09:
|
||||||
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
time_sync_poll_seconds: 5
|
time_sync_poll_seconds: 5
|
||||||
@ -133,10 +138,37 @@ startup:
|
|||||||
post_start_probe_wait_seconds: 240
|
post_start_probe_wait_seconds: 240
|
||||||
post_start_probe_poll_seconds: 5
|
post_start_probe_poll_seconds: 5
|
||||||
post_start_probes:
|
post_start_probes:
|
||||||
- https://scm.bstein.dev/user/login
|
- https://scm.bstein.dev/api/healthz
|
||||||
- https://metrics.bstein.dev/login
|
- https://metrics.bstein.dev/api/health
|
||||||
vault_unseal_key_file: /var/lib/hecate/vault-unseal.key
|
require_service_checklist: true
|
||||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.hecate-breakglass/vault-unseal.key'"
|
service_checklist_wait_seconds: 420
|
||||||
|
service_checklist_poll_seconds: 5
|
||||||
|
service_checklist_stability_seconds: 120
|
||||||
|
service_checklist:
|
||||||
|
- name: gitea-api
|
||||||
|
url: https://scm.bstein.dev/api/healthz
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: pass
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: grafana-api
|
||||||
|
url: https://metrics.bstein.dev/api/health
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: '"database":"ok"'
|
||||||
|
timeout_seconds: 12
|
||||||
|
require_flux_health: true
|
||||||
|
flux_health_wait_seconds: 900
|
||||||
|
flux_health_poll_seconds: 5
|
||||||
|
ignore_flux_kustomizations: []
|
||||||
|
require_workload_convergence: true
|
||||||
|
workload_convergence_wait_seconds: 900
|
||||||
|
workload_convergence_poll_seconds: 5
|
||||||
|
ignore_workload_namespaces: []
|
||||||
|
ignore_workloads: []
|
||||||
|
ignore_unavailable_nodes: []
|
||||||
|
auto_recycle_stuck_pods: true
|
||||||
|
stuck_pod_grace_seconds: 180
|
||||||
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
|
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 1380
|
default_budget_seconds: 1380
|
||||||
@ -168,7 +200,7 @@ ups:
|
|||||||
coordination:
|
coordination:
|
||||||
forward_shutdown_host: ""
|
forward_shutdown_host: ""
|
||||||
forward_shutdown_user: atlas
|
forward_shutdown_user: atlas
|
||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/ananke/ananke.yaml
|
||||||
peer_hosts:
|
peer_hosts:
|
||||||
- titan-24
|
- titan-24
|
||||||
fallback_local_shutdown: true
|
fallback_local_shutdown: true
|
||||||
@ -181,7 +213,7 @@ metrics:
|
|||||||
bind_addr: 0.0.0.0:9560
|
bind_addr: 0.0.0.0:9560
|
||||||
path: /metrics
|
path: /metrics
|
||||||
state:
|
state:
|
||||||
dir: /var/lib/hecate
|
dir: /var/lib/ananke
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/ananke/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/ananke/ananke.lock
|
||||||
intent_path: /var/lib/hecate/intent.json
|
intent_path: /var/lib/ananke/intent.json
|
||||||
@ -1,15 +1,15 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Hecate Staged Cluster Bootstrap
|
Description=Ananke Staged Cluster Bootstrap
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
ConditionPathExists=/etc/hecate/hecate.yaml
|
ConditionPathExists=/etc/ananke/ananke.yaml
|
||||||
StartLimitIntervalSec=0
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
User=root
|
User=root
|
||||||
Group=root
|
Group=root
|
||||||
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
|
ExecStart=/usr/local/bin/ananke startup --config /etc/ananke/ananke.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=30
|
RestartSec=30
|
||||||
TimeoutStartSec=1800
|
TimeoutStartSec=1800
|
||||||
@ -1,5 +1,5 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Hecate Self-Update and Reinstall
|
Description=Ananke Self-Update and Reinstall
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
|
|
||||||
@ -7,6 +7,7 @@ After=network-online.target
|
|||||||
Type=oneshot
|
Type=oneshot
|
||||||
User=root
|
User=root
|
||||||
Group=root
|
Group=root
|
||||||
ExecStart=/usr/local/lib/hecate/hecate-self-update.sh
|
ExecStart=/usr/local/lib/ananke/ananke-self-update.sh
|
||||||
TimeoutStartSec=1800
|
TimeoutStartSec=1800
|
||||||
|
|
||||||
|
[Install]
|
||||||
@ -1,12 +1,11 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Periodic Hecate Self-Update Timer
|
Description=Periodic Ananke Self-Update Timer
|
||||||
|
|
||||||
[Timer]
|
[Timer]
|
||||||
OnBootSec=2m
|
OnBootSec=2m
|
||||||
OnUnitActiveSec=6h
|
OnUnitActiveSec=6h
|
||||||
Unit=hecate-update.service
|
Unit=ananke-update.service
|
||||||
Persistent=true
|
Persistent=true
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=timers.target
|
WantedBy=timers.target
|
||||||
|
|
||||||
@ -1,14 +1,14 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Hecate UPS Monitor and Auto Shutdown Orchestrator
|
Description=Ananke UPS Monitor and Auto Shutdown Orchestrator
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
ConditionPathExists=/etc/hecate/hecate.yaml
|
ConditionPathExists=/etc/ananke/ananke.yaml
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
User=root
|
User=root
|
||||||
Group=root
|
Group=root
|
||||||
ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml
|
ExecStart=/usr/local/bin/ananke daemon --config /etc/ananke/ananke.yaml
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=5
|
RestartSec=5
|
||||||
NoNewPrivileges=true
|
NoNewPrivileges=true
|
||||||
2
go.mod
2
go.mod
@ -1,4 +1,4 @@
|
|||||||
module scm.bstein.dev/bstein/hecate
|
module scm.bstein.dev/bstein/ananke
|
||||||
|
|
||||||
go 1.25
|
go 1.25
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,17 @@
|
|||||||
package cluster
|
package cluster
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"log"
|
"log"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
"reflect"
|
"reflect"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestParseVaultSealed(t *testing.T) {
|
func TestParseVaultSealed(t *testing.T) {
|
||||||
@ -117,3 +120,75 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
|
|||||||
t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want)
|
t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
|
||||||
|
spec := podSpec{
|
||||||
|
NodeSelector: map[string]string{
|
||||||
|
"kubernetes.io/hostname": "titan-22",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
ignored := map[string]struct{}{"titan-22": {}}
|
||||||
|
if !workloadTargetsIgnoredNodes(spec, ignored) {
|
||||||
|
t.Fatalf("expected workload to target ignored node via nodeSelector")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseWorkloadIgnoreRules(t *testing.T) {
|
||||||
|
rules := parseWorkloadIgnoreRules([]string{
|
||||||
|
"maintenance/metis",
|
||||||
|
"crypto/statefulset/monerod",
|
||||||
|
})
|
||||||
|
if len(rules) != 2 {
|
||||||
|
t.Fatalf("expected 2 ignore rules, got %d", len(rules))
|
||||||
|
}
|
||||||
|
if !workloadIgnored(rules, "maintenance", "deployment", "metis") {
|
||||||
|
t.Fatalf("expected namespace/name rule to match")
|
||||||
|
}
|
||||||
|
if !workloadIgnored(rules, "crypto", "statefulset", "monerod") {
|
||||||
|
t.Fatalf("expected namespace/kind/name rule to match")
|
||||||
|
}
|
||||||
|
if workloadIgnored(rules, "crypto", "deployment", "monerod") {
|
||||||
|
t.Fatalf("did not expect mismatched kind to match")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
|
||||||
|
got := namespaceCandidatesFromIgnoreKustomizations([]string{
|
||||||
|
"flux-system/jellyfin",
|
||||||
|
"flux-system/outline",
|
||||||
|
})
|
||||||
|
if _, ok := got["jellyfin"]; !ok {
|
||||||
|
t.Fatalf("expected jellyfin namespace candidate")
|
||||||
|
}
|
||||||
|
if _, ok := got["outline"]; !ok {
|
||||||
|
t.Fatalf("expected outline namespace candidate")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestProbeStatusAcceptedRejects404(t *testing.T) {
|
||||||
|
if probeStatusAccepted("https://metrics.bstein.dev/login", 404) {
|
||||||
|
t.Fatalf("expected 404 probe status to be rejected")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte(`{"database":"ok"}`))
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
|
||||||
|
orch := &Orchestrator{
|
||||||
|
log: log.New(os.Stdout, "", 0),
|
||||||
|
}
|
||||||
|
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
|
||||||
|
Name: "grafana-api",
|
||||||
|
URL: srv.URL,
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
BodyContains: `"database":"ok"`,
|
||||||
|
TimeoutSeconds: 5,
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected service check to pass, detail=%s", detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package config
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
neturl "net/url"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -21,6 +22,7 @@ type Config struct {
|
|||||||
SSHJumpUser string `yaml:"ssh_jump_user"`
|
SSHJumpUser string `yaml:"ssh_jump_user"`
|
||||||
IACRepoPath string `yaml:"iac_repo_path"`
|
IACRepoPath string `yaml:"iac_repo_path"`
|
||||||
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
|
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
|
||||||
|
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
|
||||||
ControlPlanes []string `yaml:"control_planes"`
|
ControlPlanes []string `yaml:"control_planes"`
|
||||||
Workers []string `yaml:"workers"`
|
Workers []string `yaml:"workers"`
|
||||||
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
|
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
|
||||||
@ -34,29 +36,58 @@ type Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Startup struct {
|
type Startup struct {
|
||||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||||
PostStartProbes []string `yaml:"post_start_probes"`
|
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
PostStartProbes []string `yaml:"post_start_probes"`
|
||||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||||
|
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||||
|
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||||
|
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||||
|
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||||
|
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||||
|
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||||
|
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||||
|
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||||
|
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||||
|
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||||
|
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||||
|
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||||
|
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||||
|
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||||
|
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||||
|
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||||
|
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||||
|
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||||
|
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ServiceChecklistCheck struct {
|
||||||
|
Name string `yaml:"name"`
|
||||||
|
URL string `yaml:"url"`
|
||||||
|
AcceptedStatuses []int `yaml:"accepted_statuses"`
|
||||||
|
BodyContains string `yaml:"body_contains"`
|
||||||
|
BodyNotContains string `yaml:"body_not_contains"`
|
||||||
|
TimeoutSeconds int `yaml:"timeout_seconds"`
|
||||||
|
InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Shutdown struct {
|
type Shutdown struct {
|
||||||
@ -143,6 +174,9 @@ func (c Config) Validate() error {
|
|||||||
if c.ExpectedFluxBranch == "" {
|
if c.ExpectedFluxBranch == "" {
|
||||||
return fmt.Errorf("config.expected_flux_branch must not be empty")
|
return fmt.Errorf("config.expected_flux_branch must not be empty")
|
||||||
}
|
}
|
||||||
|
if c.ExpectedFluxSource == "" {
|
||||||
|
return fmt.Errorf("config.expected_flux_source_url must not be empty")
|
||||||
|
}
|
||||||
if c.IACRepoPath == "" {
|
if c.IACRepoPath == "" {
|
||||||
return fmt.Errorf("config.iac_repo_path must not be empty")
|
return fmt.Errorf("config.iac_repo_path must not be empty")
|
||||||
}
|
}
|
||||||
@ -176,6 +210,25 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
|
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
if c.Startup.MinimumBatteryPercent < 0 || c.Startup.MinimumBatteryPercent > 100 {
|
||||||
|
return fmt.Errorf("config.startup.minimum_battery_percent must be between 0 and 100")
|
||||||
|
}
|
||||||
|
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||||
|
if strings.TrimSpace(node) == "" {
|
||||||
|
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||||
|
}
|
||||||
|
if len(labels) == 0 {
|
||||||
|
return fmt.Errorf("config.startup.required_node_labels[%q] must include at least one label", node)
|
||||||
|
}
|
||||||
|
for key, value := range labels {
|
||||||
|
if strings.TrimSpace(key) == "" {
|
||||||
|
return fmt.Errorf("config.startup.required_node_labels[%q] contains empty label key", node)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(value) == "" {
|
||||||
|
return fmt.Errorf("config.startup.required_node_labels[%q][%q] must not be empty", node, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
|
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
|
||||||
}
|
}
|
||||||
@ -223,11 +276,88 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
|
if c.Startup.RequirePostStartProbes && len(c.Startup.PostStartProbes) == 0 {
|
||||||
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
|
return fmt.Errorf("config.startup.post_start_probes must not be empty when require_post_start_probes is true")
|
||||||
}
|
}
|
||||||
|
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist_wait_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist_poll_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist_stability_seconds must be >= 0")
|
||||||
|
}
|
||||||
|
if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
|
||||||
|
}
|
||||||
|
for i, check := range c.Startup.ServiceChecklist {
|
||||||
|
if strings.TrimSpace(check.Name) == "" {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
|
||||||
|
}
|
||||||
|
rawURL := strings.TrimSpace(check.URL)
|
||||||
|
if rawURL == "" {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist[%d].url must not be empty", i)
|
||||||
|
}
|
||||||
|
parsed, err := neturl.Parse(rawURL)
|
||||||
|
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist[%d].url is invalid: %q", i, rawURL)
|
||||||
|
}
|
||||||
|
if check.TimeoutSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
|
||||||
|
}
|
||||||
|
for _, code := range check.AcceptedStatuses {
|
||||||
|
if code < 100 || code > 599 {
|
||||||
|
return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||||
|
}
|
||||||
for _, probe := range c.Startup.PostStartProbes {
|
for _, probe := range c.Startup.PostStartProbes {
|
||||||
if strings.TrimSpace(probe) == "" {
|
if strings.TrimSpace(probe) == "" {
|
||||||
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, item := range c.Startup.IgnoreFluxKustomizations {
|
||||||
|
item = strings.TrimSpace(item)
|
||||||
|
if item == "" {
|
||||||
|
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must not be empty")
|
||||||
|
}
|
||||||
|
if strings.Count(item, "/") != 1 {
|
||||||
|
return fmt.Errorf("config.startup.ignore_flux_kustomizations entries must be namespace/name, got %q", item)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, item := range c.Startup.IgnoreWorkloads {
|
||||||
|
item = strings.TrimSpace(item)
|
||||||
|
if item == "" {
|
||||||
|
return fmt.Errorf("config.startup.ignore_workloads entries must not be empty")
|
||||||
|
}
|
||||||
|
parts := strings.Split(item, "/")
|
||||||
|
if len(parts) != 2 && len(parts) != 3 {
|
||||||
|
return fmt.Errorf("config.startup.ignore_workloads entries must be namespace/name or namespace/kind/name, got %q", item)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, ns := range c.Startup.IgnoreWorkloadNamespaces {
|
||||||
|
if strings.TrimSpace(ns) == "" {
|
||||||
|
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
||||||
|
if strings.TrimSpace(node) == "" {
|
||||||
|
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||||
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
|
return fmt.Errorf("config.startup.vault_unseal_key_file must not be empty")
|
||||||
}
|
}
|
||||||
@ -276,6 +406,7 @@ func defaults() Config {
|
|||||||
c := Config{
|
c := Config{
|
||||||
IACRepoPath: "/opt/titan-iac",
|
IACRepoPath: "/opt/titan-iac",
|
||||||
ExpectedFluxBranch: "main",
|
ExpectedFluxBranch: "main",
|
||||||
|
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
|
||||||
SSHPort: 2277,
|
SSHPort: 2277,
|
||||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||||
LocalBootstrapPaths: []string{
|
LocalBootstrapPaths: []string{
|
||||||
@ -328,16 +459,54 @@ func defaults() Config {
|
|||||||
"gitea/gitea-data",
|
"gitea/gitea-data",
|
||||||
"sso/keycloak-data",
|
"sso/keycloak-data",
|
||||||
},
|
},
|
||||||
|
MinimumBatteryPercent: 20,
|
||||||
|
RequiredNodeLabels: map[string]map[string]string{
|
||||||
|
"titan-09": {
|
||||||
|
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||||
|
},
|
||||||
|
},
|
||||||
RequirePostStartProbes: true,
|
RequirePostStartProbes: true,
|
||||||
PostStartProbeWaitSeconds: 240,
|
PostStartProbeWaitSeconds: 240,
|
||||||
PostStartProbePollSeconds: 5,
|
PostStartProbePollSeconds: 5,
|
||||||
PostStartProbes: []string{
|
PostStartProbes: []string{
|
||||||
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||||
"https://scm.bstein.dev/user/login",
|
"https://scm.bstein.dev/api/healthz",
|
||||||
"https://metrics.bstein.dev/login",
|
"https://metrics.bstein.dev/api/health",
|
||||||
},
|
},
|
||||||
VaultUnsealKeyFile: "/var/lib/hecate/vault-unseal.key",
|
RequireServiceChecklist: true,
|
||||||
VaultUnsealBreakglassTimeout: 15,
|
ServiceChecklistWaitSeconds: 420,
|
||||||
|
ServiceChecklistPollSeconds: 5,
|
||||||
|
ServiceChecklistStabilitySec: 120,
|
||||||
|
ServiceChecklist: []ServiceChecklistCheck{
|
||||||
|
{
|
||||||
|
Name: "gitea-api",
|
||||||
|
URL: "https://scm.bstein.dev/api/healthz",
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
BodyContains: "pass",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "grafana-api",
|
||||||
|
URL: "https://metrics.bstein.dev/api/health",
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
BodyContains: "\"database\":\"ok\"",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
RequireFluxHealth: true,
|
||||||
|
FluxHealthWaitSeconds: 900,
|
||||||
|
FluxHealthPollSeconds: 5,
|
||||||
|
IgnoreFluxKustomizations: []string{},
|
||||||
|
RequireWorkloadConvergence: true,
|
||||||
|
WorkloadConvergenceWaitSeconds: 900,
|
||||||
|
WorkloadConvergencePollSeconds: 5,
|
||||||
|
IgnoreWorkloadNamespaces: []string{},
|
||||||
|
IgnoreWorkloads: []string{},
|
||||||
|
IgnoreUnavailableNodes: []string{},
|
||||||
|
AutoRecycleStuckPods: true,
|
||||||
|
StuckPodGraceSeconds: 180,
|
||||||
|
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||||
|
VaultUnsealBreakglassTimeout: 15,
|
||||||
},
|
},
|
||||||
Shutdown: Shutdown{
|
Shutdown: Shutdown{
|
||||||
DefaultBudgetSeconds: 1380,
|
DefaultBudgetSeconds: 1380,
|
||||||
@ -362,7 +531,7 @@ func defaults() Config {
|
|||||||
TelemetryTimeoutSeconds: 90,
|
TelemetryTimeoutSeconds: 90,
|
||||||
},
|
},
|
||||||
Coordination: Coordination{
|
Coordination: Coordination{
|
||||||
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
|
||||||
PeerHosts: []string{},
|
PeerHosts: []string{},
|
||||||
FallbackLocalShutdown: true,
|
FallbackLocalShutdown: true,
|
||||||
CommandTimeoutSeconds: 25,
|
CommandTimeoutSeconds: 25,
|
||||||
@ -376,10 +545,10 @@ func defaults() Config {
|
|||||||
Path: "/metrics",
|
Path: "/metrics",
|
||||||
},
|
},
|
||||||
State: State{
|
State: State{
|
||||||
Dir: "/var/lib/hecate",
|
Dir: "/var/lib/ananke",
|
||||||
RunHistoryPath: "/var/lib/hecate/runs.json",
|
RunHistoryPath: "/var/lib/ananke/runs.json",
|
||||||
LockPath: "/var/lib/hecate/hecate.lock",
|
LockPath: "/var/lib/ananke/ananke.lock",
|
||||||
IntentPath: "/var/lib/hecate/intent.json",
|
IntentPath: "/var/lib/ananke/intent.json",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
c.applyDefaults()
|
c.applyDefaults()
|
||||||
@ -393,6 +562,9 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.IACRepoPath == "" {
|
if c.IACRepoPath == "" {
|
||||||
c.IACRepoPath = "/opt/titan-iac"
|
c.IACRepoPath = "/opt/titan-iac"
|
||||||
}
|
}
|
||||||
|
if c.ExpectedFluxSource == "" {
|
||||||
|
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
|
||||||
|
}
|
||||||
if c.Startup.APIWaitSeconds <= 0 {
|
if c.Startup.APIWaitSeconds <= 0 {
|
||||||
c.Startup.APIWaitSeconds = 1200
|
c.Startup.APIWaitSeconds = 1200
|
||||||
}
|
}
|
||||||
@ -402,6 +574,16 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
if c.Startup.ShutdownCooldownSeconds <= 0 {
|
||||||
c.Startup.ShutdownCooldownSeconds = 45
|
c.Startup.ShutdownCooldownSeconds = 45
|
||||||
}
|
}
|
||||||
|
if c.Startup.MinimumBatteryPercent <= 0 {
|
||||||
|
c.Startup.MinimumBatteryPercent = 20
|
||||||
|
}
|
||||||
|
if c.Startup.RequiredNodeLabels == nil {
|
||||||
|
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||||
|
"titan-09": {
|
||||||
|
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
if c.Startup.TimeSyncWaitSeconds <= 0 {
|
||||||
c.Startup.TimeSyncWaitSeconds = 240
|
c.Startup.TimeSyncWaitSeconds = 240
|
||||||
}
|
}
|
||||||
@ -446,12 +628,71 @@ func (c *Config) applyDefaults() {
|
|||||||
if len(c.Startup.PostStartProbes) == 0 {
|
if len(c.Startup.PostStartProbes) == 0 {
|
||||||
c.Startup.PostStartProbes = []string{
|
c.Startup.PostStartProbes = []string{
|
||||||
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||||
"https://scm.bstein.dev/user/login",
|
"https://scm.bstein.dev/api/healthz",
|
||||||
"https://metrics.bstein.dev/login",
|
"https://metrics.bstein.dev/api/health",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
|
||||||
|
c.Startup.ServiceChecklistWaitSeconds = 420
|
||||||
|
}
|
||||||
|
if c.Startup.ServiceChecklistPollSeconds <= 0 {
|
||||||
|
c.Startup.ServiceChecklistPollSeconds = 5
|
||||||
|
}
|
||||||
|
if c.Startup.ServiceChecklistStabilitySec < 0 {
|
||||||
|
c.Startup.ServiceChecklistStabilitySec = 0
|
||||||
|
}
|
||||||
|
if len(c.Startup.ServiceChecklist) == 0 {
|
||||||
|
c.Startup.ServiceChecklist = []ServiceChecklistCheck{
|
||||||
|
{
|
||||||
|
Name: "gitea-api",
|
||||||
|
URL: "https://scm.bstein.dev/api/healthz",
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
BodyContains: "pass",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "grafana-api",
|
||||||
|
URL: "https://metrics.bstein.dev/api/health",
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
BodyContains: "\"database\":\"ok\"",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i := range c.Startup.ServiceChecklist {
|
||||||
|
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||||
|
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||||
|
c.Startup.FluxHealthWaitSeconds = 900
|
||||||
|
}
|
||||||
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||||
|
c.Startup.FluxHealthPollSeconds = 5
|
||||||
|
}
|
||||||
|
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||||
|
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||||
|
}
|
||||||
|
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||||
|
c.Startup.WorkloadConvergenceWaitSeconds = 900
|
||||||
|
}
|
||||||
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||||
|
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||||
|
}
|
||||||
|
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||||
|
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||||
|
}
|
||||||
|
if c.Startup.IgnoreWorkloads == nil {
|
||||||
|
c.Startup.IgnoreWorkloads = []string{}
|
||||||
|
}
|
||||||
|
if c.Startup.IgnoreUnavailableNodes == nil {
|
||||||
|
c.Startup.IgnoreUnavailableNodes = []string{}
|
||||||
|
}
|
||||||
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
|
c.Startup.StuckPodGraceSeconds = 180
|
||||||
|
}
|
||||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||||
c.Startup.VaultUnsealKeyFile = "/var/lib/hecate/vault-unseal.key"
|
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
||||||
}
|
}
|
||||||
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
|
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
|
||||||
c.Startup.VaultUnsealBreakglassTimeout = 15
|
c.Startup.VaultUnsealBreakglassTimeout = 15
|
||||||
@ -496,7 +737,7 @@ func (c *Config) applyDefaults() {
|
|||||||
c.UPS.TelemetryTimeoutSeconds = 90
|
c.UPS.TelemetryTimeoutSeconds = 90
|
||||||
}
|
}
|
||||||
if c.Coordination.ForwardShutdownConfig == "" {
|
if c.Coordination.ForwardShutdownConfig == "" {
|
||||||
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
|
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
||||||
}
|
}
|
||||||
if c.Coordination.PeerHosts == nil {
|
if c.Coordination.PeerHosts == nil {
|
||||||
c.Coordination.PeerHosts = []string{}
|
c.Coordination.PeerHosts = []string{}
|
||||||
@ -517,15 +758,15 @@ func (c *Config) applyDefaults() {
|
|||||||
c.Metrics.Path = "/metrics"
|
c.Metrics.Path = "/metrics"
|
||||||
}
|
}
|
||||||
if c.State.Dir == "" {
|
if c.State.Dir == "" {
|
||||||
c.State.Dir = "/var/lib/hecate"
|
c.State.Dir = "/var/lib/ananke"
|
||||||
}
|
}
|
||||||
if c.State.RunHistoryPath == "" {
|
if c.State.RunHistoryPath == "" {
|
||||||
c.State.RunHistoryPath = "/var/lib/hecate/runs.json"
|
c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
|
||||||
}
|
}
|
||||||
if c.State.LockPath == "" {
|
if c.State.LockPath == "" {
|
||||||
c.State.LockPath = "/var/lib/hecate/hecate.lock"
|
c.State.LockPath = "/var/lib/ananke/ananke.lock"
|
||||||
}
|
}
|
||||||
if c.State.IntentPath == "" {
|
if c.State.IntentPath == "" {
|
||||||
c.State.IntentPath = "/var/lib/hecate/intent.json"
|
c.State.IntentPath = "/var/lib/ananke/intent.json"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import (
|
|||||||
|
|
||||||
func TestLoadAcceptsUPSTargets(t *testing.T) {
|
func TestLoadAcceptsUPSTargets(t *testing.T) {
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
cfgPath := filepath.Join(tmp, "hecate.yaml")
|
cfgPath := filepath.Join(tmp, "ananke.yaml")
|
||||||
raw := `
|
raw := `
|
||||||
control_planes: [titan-0a, titan-0b, titan-0c]
|
control_planes: [titan-0a, titan-0b, titan-0c]
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
@ -24,7 +24,7 @@ shutdown:
|
|||||||
default_budget_seconds: 300
|
default_budget_seconds: 300
|
||||||
state:
|
state:
|
||||||
run_history_path: /tmp/runs.json
|
run_history_path: /tmp/runs.json
|
||||||
lock_path: /tmp/hecate.lock
|
lock_path: /tmp/ananke.lock
|
||||||
`
|
`
|
||||||
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
||||||
t.Fatalf("write config: %v", err)
|
t.Fatalf("write config: %v", err)
|
||||||
@ -74,7 +74,7 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
|||||||
|
|
||||||
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
|
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
cfgPath := filepath.Join(tmp, "hecate.yaml")
|
cfgPath := filepath.Join(tmp, "ananke.yaml")
|
||||||
raw := `
|
raw := `
|
||||||
control_planes: [titan-0a, titan-0b, titan-0c]
|
control_planes: [titan-0a, titan-0b, titan-0c]
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
@ -85,7 +85,7 @@ ups:
|
|||||||
enabled: false
|
enabled: false
|
||||||
state:
|
state:
|
||||||
run_history_path: /tmp/runs.json
|
run_history_path: /tmp/runs.json
|
||||||
lock_path: /tmp/hecate.lock
|
lock_path: /tmp/ananke.lock
|
||||||
`
|
`
|
||||||
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
||||||
t.Fatalf("write config: %v", err)
|
t.Fatalf("write config: %v", err)
|
||||||
@ -146,3 +146,55 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
|
|||||||
t.Fatalf("expected validation error when post start probes are required but empty")
|
t.Fatalf("expected validation error when post start probes are required but empty")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.RequireServiceChecklist = true
|
||||||
|
cfg.Startup.ServiceChecklist = nil
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error when service checklist is required but empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
|
||||||
|
{
|
||||||
|
Name: "grafana",
|
||||||
|
URL: "not-a-url",
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for invalid service checklist url")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for invalid ignore_flux_kustomizations entry")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"}
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for invalid ignore_workloads entry")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||||
|
"titan-09": {
|
||||||
|
"": "true",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for invalid required_node_labels entry")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -84,41 +84,41 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
|
|||||||
|
|
||||||
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
|
b.WriteString("# HELP ananke_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
|
||||||
b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n")
|
b.WriteString("# TYPE ananke_shutdown_budget_seconds gauge\n")
|
||||||
b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
|
b.WriteString(fmt.Sprintf("ananke_shutdown_budget_seconds %d\n", e.shutdownBudgetSec))
|
||||||
b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
|
b.WriteString("# HELP ananke_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
|
||||||
b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n")
|
b.WriteString("# TYPE ananke_shutdown_triggers_total counter\n")
|
||||||
b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers))
|
b.WriteString(fmt.Sprintf("ananke_shutdown_triggers_total %d\n", e.shutdownTriggers))
|
||||||
b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
|
b.WriteString("# HELP ananke_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
|
||||||
b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n")
|
b.WriteString("# TYPE ananke_shutdown_last_trigger_timestamp_seconds gauge\n")
|
||||||
if e.lastShutdownAt.IsZero() {
|
if e.lastShutdownAt.IsZero() {
|
||||||
b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n")
|
b.WriteString("ananke_shutdown_last_trigger_timestamp_seconds 0\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
|
b.WriteString(fmt.Sprintf("ananke_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix()))
|
||||||
}
|
}
|
||||||
b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n")
|
b.WriteString("# HELP ananke_ups_on_battery Whether a UPS source is currently on battery.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_on_battery gauge\n")
|
b.WriteString("# TYPE ananke_ups_on_battery gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n")
|
b.WriteString("# HELP ananke_ups_low_battery Whether a UPS source currently reports low battery.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_low_battery gauge\n")
|
b.WriteString("# TYPE ananke_ups_low_battery gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
|
b.WriteString("# HELP ananke_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n")
|
b.WriteString("# TYPE ananke_ups_runtime_seconds gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
|
b.WriteString("# HELP ananke_ups_battery_charge_percent Battery charge percentage reported by UPS.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_battery_charge_percent gauge\n")
|
b.WriteString("# TYPE ananke_ups_battery_charge_percent gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_load_percent UPS output load percentage.\n")
|
b.WriteString("# HELP ananke_ups_load_percent UPS output load percentage.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_load_percent gauge\n")
|
b.WriteString("# TYPE ananke_ups_load_percent gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_power_nominal_watts UPS nominal power rating in watts.\n")
|
b.WriteString("# HELP ananke_ups_power_nominal_watts UPS nominal power rating in watts.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_power_nominal_watts gauge\n")
|
b.WriteString("# TYPE ananke_ups_power_nominal_watts gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
|
b.WriteString("# HELP ananke_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n")
|
b.WriteString("# TYPE ananke_ups_threshold_seconds gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
|
b.WriteString("# HELP ananke_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_trigger_active gauge\n")
|
b.WriteString("# TYPE ananke_ups_trigger_active gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n")
|
b.WriteString("# HELP ananke_ups_breach_count Current debounce breach count for this UPS source.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_breach_count gauge\n")
|
b.WriteString("# TYPE ananke_ups_breach_count gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
|
b.WriteString("# HELP ananke_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n")
|
b.WriteString("# TYPE ananke_ups_last_sample_timestamp_seconds gauge\n")
|
||||||
b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n")
|
b.WriteString("# HELP ananke_ups_error Whether the last sample had an error.\n")
|
||||||
b.WriteString("# TYPE hecate_ups_error gauge\n")
|
b.WriteString("# TYPE ananke_ups_error gauge\n")
|
||||||
|
|
||||||
names := make([]string, 0, len(e.samples))
|
names := make([]string, 0, len(e.samples))
|
||||||
for name := range e.samples {
|
for name := range e.samples {
|
||||||
@ -129,21 +129,21 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
|
|||||||
s := e.samples[name]
|
s := e.samples[name]
|
||||||
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
|
labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}",
|
||||||
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
|
safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
|
b.WriteString(fmt.Sprintf("ananke_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery)))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
|
b.WriteString(fmt.Sprintf("ananke_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery)))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
|
b.WriteString(fmt.Sprintf("ananke_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
|
b.WriteString(fmt.Sprintf("ananke_ups_battery_charge_percent%s %.2f\n", labels, s.BatteryCharge))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
|
b.WriteString(fmt.Sprintf("ananke_ups_load_percent%s %.2f\n", labels, s.LoadPercent))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
|
b.WriteString(fmt.Sprintf("ananke_ups_power_nominal_watts%s %.2f\n", labels, s.PowerNominalW))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
|
b.WriteString(fmt.Sprintf("ananke_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
|
b.WriteString(fmt.Sprintf("ananke_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger)))
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount))
|
b.WriteString(fmt.Sprintf("ananke_ups_breach_count%s %d\n", labels, s.BreachCount))
|
||||||
if s.UpdatedAt.IsZero() {
|
if s.UpdatedAt.IsZero() {
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels))
|
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s 0\n", labels))
|
||||||
} else {
|
} else {
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
|
b.WriteString(fmt.Sprintf("ananke_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix()))
|
||||||
}
|
}
|
||||||
b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
|
b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
|
||||||
}
|
}
|
||||||
|
|
||||||
_, _ = w.Write([]byte(b.String()))
|
_, _ = w.Write([]byte(b.String()))
|
||||||
|
|||||||
@ -33,14 +33,14 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
|
|||||||
body := rr.Body.String()
|
body := rr.Body.String()
|
||||||
|
|
||||||
mustContain := []string{
|
mustContain := []string{
|
||||||
"hecate_shutdown_budget_seconds 321",
|
"ananke_shutdown_budget_seconds 321",
|
||||||
"hecate_shutdown_triggers_total 1",
|
"ananke_shutdown_triggers_total 1",
|
||||||
"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
"ananke_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
"ananke_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
"ananke_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
"ananke_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
"ananke_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
"ananke_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
}
|
}
|
||||||
for _, m := range mustContain {
|
for _, m := range mustContain {
|
||||||
if !strings.Contains(body, m) {
|
if !strings.Contains(body, m) {
|
||||||
|
|||||||
@ -12,12 +12,12 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/metrics"
|
"scm.bstein.dev/bstein/ananke/internal/metrics"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/sshutil"
|
"scm.bstein.dev/bstein/ananke/internal/sshutil"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/ups"
|
"scm.bstein.dev/bstein/ananke/internal/ups"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Target struct {
|
type Target struct {
|
||||||
@ -81,7 +81,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
lastGood[t.Name] = time.Now()
|
lastGood[t.Name] = time.Now()
|
||||||
}
|
}
|
||||||
|
|
||||||
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
|
d.log.Printf("ananke daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
|
||||||
poll, debounce, telemetryTimeout, d.targetList())
|
poll, debounce, telemetryTimeout, d.targetList())
|
||||||
|
|
||||||
for {
|
for {
|
||||||
@ -198,7 +198,7 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
|||||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
|
remoteCmd := fmt.Sprintf("sudo /usr/local/bin/ananke shutdown --config %q --execute --reason %q", d.cfg.Coordination.ForwardShutdownConfig, reason)
|
||||||
if d.cfg.Shutdown.EmergencySkipEtcd {
|
if d.cfg.Shutdown.EmergencySkipEtcd {
|
||||||
remoteCmd += " --skip-etcd-snapshot"
|
remoteCmd += " --skip-etcd-snapshot"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ParseIntentOutput parses `hecate intent` CLI output from local/remote commands.
|
// ParseIntentOutput parses `ananke intent` CLI output from local/remote commands.
|
||||||
func ParseIntentOutput(raw string) (Intent, error) {
|
func ParseIntentOutput(raw string) (Intent, error) {
|
||||||
for _, line := range strings.Split(raw, "\n") {
|
for _, line := range strings.Split(raw, "\n") {
|
||||||
line = strings.TrimSpace(line)
|
line = strings.TrimSpace(line)
|
||||||
|
|||||||
@ -61,7 +61,7 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
|
func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
|
||||||
raw := `[hecate] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
|
raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
|
||||||
in, err := ParseIntentOutput(raw)
|
in, err := ParseIntentOutput(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse intent output: %v", err)
|
t.Fatalf("parse intent output: %v", err)
|
||||||
@ -81,7 +81,7 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestParseIntentOutputHandlesNone(t *testing.T) {
|
func TestParseIntentOutputHandlesNone(t *testing.T) {
|
||||||
in, err := ParseIntentOutput(`[hecate] 2026/04/05 11:24:49 intent=none`)
|
in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse none intent output: %v", err)
|
t.Fatalf("parse none intent output: %v", err)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestAcquireLockLifecycle(t *testing.T) {
|
func TestAcquireLockLifecycle(t *testing.T) {
|
||||||
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
|
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
|
||||||
unlock, err := AcquireLock(lockPath)
|
unlock, err := AcquireLock(lockPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("acquire lock: %v", err)
|
t.Fatalf("acquire lock: %v", err)
|
||||||
@ -26,7 +26,7 @@ func TestAcquireLockLifecycle(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestAcquireLockReclaimsStaleLock(t *testing.T) {
|
func TestAcquireLockReclaimsStaleLock(t *testing.T) {
|
||||||
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
|
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
|
||||||
if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
|
if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
|
||||||
t.Fatalf("write stale lock: %v", err)
|
t.Fatalf("write stale lock: %v", err)
|
||||||
}
|
}
|
||||||
@ -47,7 +47,7 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestAcquireLockRejectsActiveLock(t *testing.T) {
|
func TestAcquireLockRejectsActiveLock(t *testing.T) {
|
||||||
lockPath := filepath.Join(t.TempDir(), "hecate.lock")
|
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
|
||||||
active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
|
active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
|
||||||
if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil {
|
if err := os.WriteFile(lockPath, []byte(active), 0o600); err != nil {
|
||||||
t.Fatalf("write active lock: %v", err)
|
t.Fatalf("write active lock: %v", err)
|
||||||
|
|||||||
@ -3,7 +3,7 @@ package ups
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
func TestParseNUT(t *testing.T) {
|
func TestParseNUT(t *testing.T) {
|
||||||
raw := `battery.runtime: 384
|
raw := `battery.runtime: 384
|
||||||
battery.charge: 72
|
battery.charge: 72
|
||||||
ups.load: 19
|
ups.load: 19
|
||||||
ups.realpower.nominal: 510
|
ups.realpower.nominal: 510
|
||||||
|
|||||||
@ -2,23 +2,23 @@
|
|||||||
set -Eeuo pipefail
|
set -Eeuo pipefail
|
||||||
|
|
||||||
KUBECTL="${KUBECTL:-kubectl}"
|
KUBECTL="${KUBECTL:-kubectl}"
|
||||||
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
|
ANANKE_COORDINATOR_HOST="${ANANKE_COORDINATOR_HOST:-titan-db}"
|
||||||
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
|
ANANKE_BIN="${ANANKE_BIN:-/usr/local/bin/ananke}"
|
||||||
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
|
ANANKE_CONFIG="${ANANKE_CONFIG:-/etc/ananke/ananke.yaml}"
|
||||||
HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}"
|
ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
|
||||||
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
|
LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
|
||||||
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
|
STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
|
||||||
SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
|
SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
|
||||||
SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}"
|
SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}"
|
||||||
STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
|
STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
|
||||||
STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}"
|
STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
|
||||||
EXECUTE=0
|
EXECUTE=0
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat <<'EOF'
|
cat <<'EOF'
|
||||||
Usage:
|
Usage:
|
||||||
scripts/hecate-drills.sh list
|
scripts/ananke-drills.sh list
|
||||||
scripts/hecate-drills.sh run <drill-name> [--execute]
|
scripts/ananke-drills.sh run <drill-name> [--execute]
|
||||||
|
|
||||||
Drills:
|
Drills:
|
||||||
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
|
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
|
||||||
@ -30,7 +30,7 @@ Drills:
|
|||||||
Notes:
|
Notes:
|
||||||
- Drills are intentionally disruptive and are not part of regular `make test`.
|
- Drills are intentionally disruptive and are not part of regular `make test`.
|
||||||
- Use --execute to run live changes. Without it, this script prints planned actions only.
|
- Use --execute to run live changes. Without it, this script prints planned actions only.
|
||||||
- Optional relay: set HECATE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
|
- Optional relay: set ANANKE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -98,47 +98,47 @@ wait_ready_keycloak() {
|
|||||||
die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
|
die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
|
||||||
}
|
}
|
||||||
|
|
||||||
run_hecate_startup() {
|
run_ananke_startup() {
|
||||||
local reason="$1"
|
local reason="$1"
|
||||||
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
|
local cmd=(sudo "${ANANKE_BIN}" startup --config "${ANANKE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
|
||||||
else
|
else
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||||
fi
|
fi
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||||
# shellcheck disable=SC2086
|
# shellcheck disable=SC2086
|
||||||
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
|
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
|
||||||
else
|
else
|
||||||
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
|
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
run_hecate_shutdown() {
|
run_ananke_shutdown() {
|
||||||
local reason="$1"
|
local reason="$1"
|
||||||
local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
|
local cmd=(sudo "${ANANKE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '${cmd[*]}'"
|
||||||
else
|
else
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||||
fi
|
fi
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||||
# shellcheck disable=SC2086
|
# shellcheck disable=SC2086
|
||||||
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
|
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "${cmd[@]}"
|
||||||
else
|
else
|
||||||
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
|
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${ANANKE_COORDINATOR_HOST}" "${cmd[@]}"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
run_hecate_startup_with_retry() {
|
run_ananke_startup_with_retry() {
|
||||||
local reason="$1"
|
local reason="$1"
|
||||||
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
|
local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
|
||||||
|
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
|
log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
|
||||||
@ -161,11 +161,11 @@ run_hecate_startup_with_retry() {
|
|||||||
|
|
||||||
run_coordinator_bash() {
|
run_coordinator_bash() {
|
||||||
local script="$1"
|
local script="$1"
|
||||||
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
|
if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
|
||||||
# shellcheck disable=SC2086
|
# shellcheck disable=SC2086
|
||||||
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "bash -se"
|
printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "bash -se"
|
||||||
else
|
else
|
||||||
printf '%s\n' "${script}" | ssh "${HECATE_COORDINATOR_HOST}" "bash -se"
|
printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" "bash -se"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -283,7 +283,7 @@ write_log_header() {
|
|||||||
mkdir -p "${LOG_DIR}"
|
mkdir -p "${LOG_DIR}"
|
||||||
local f="${LOG_DIR}/${drill}-$(now_ts).log"
|
local f="${LOG_DIR}/${drill}-$(now_ts).log"
|
||||||
exec > >(tee -a "${f}") 2>&1
|
exec > >(tee -a "${f}") 2>&1
|
||||||
log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
|
log "drill=${drill} execute=${EXECUTE} coordinator=${ANANKE_COORDINATOR_HOST}"
|
||||||
}
|
}
|
||||||
|
|
||||||
run_drill_flux_gitea_deadlock() {
|
run_drill_flux_gitea_deadlock() {
|
||||||
@ -303,7 +303,7 @@ run_drill_flux_gitea_deadlock() {
|
|||||||
scale_to "$ns" "$kind" "$name" 0
|
scale_to "$ns" "$kind" "$name" 0
|
||||||
done
|
done
|
||||||
|
|
||||||
run_hecate_startup "drill-flux-gitea-deadlock"
|
run_ananke_startup "drill-flux-gitea-deadlock"
|
||||||
|
|
||||||
log "verifying recovery"
|
log "verifying recovery"
|
||||||
wait_ready flux-system deployment source-controller 240s
|
wait_ready flux-system deployment source-controller 240s
|
||||||
@ -330,7 +330,7 @@ run_drill_foundation_recovery() {
|
|||||||
scale_to "$ns" "$kind" "$name" 0
|
scale_to "$ns" "$kind" "$name" 0
|
||||||
done
|
done
|
||||||
|
|
||||||
run_hecate_startup "drill-foundation-recovery"
|
run_ananke_startup "drill-foundation-recovery"
|
||||||
|
|
||||||
log "verifying layered recovery"
|
log "verifying layered recovery"
|
||||||
wait_ready vault statefulset vault 420s
|
wait_ready vault statefulset vault 420s
|
||||||
@ -350,7 +350,7 @@ run_drill_reconciliation_resume() {
|
|||||||
set_flux_suspend_all true
|
set_flux_suspend_all true
|
||||||
scale_to flux-system deployment source-controller 0
|
scale_to flux-system deployment source-controller 0
|
||||||
|
|
||||||
run_hecate_startup "drill-reconciliation-resume"
|
run_ananke_startup "drill-reconciliation-resume"
|
||||||
|
|
||||||
log "verifying reconciliation resumed"
|
log "verifying reconciliation resumed"
|
||||||
wait_ready flux-system deployment source-controller 240s
|
wait_ready flux-system deployment source-controller 240s
|
||||||
@ -361,8 +361,8 @@ run_drill_reconciliation_resume() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_drill_startup_intent_guard() {
|
run_drill_startup_intent_guard() {
|
||||||
local intent_path="/var/lib/hecate/intent.json"
|
local intent_path="/var/lib/ananke/intent.json"
|
||||||
local backup_path="/tmp/hecate-intent-pre-drill.json"
|
local backup_path="/tmp/ananke-intent-pre-drill.json"
|
||||||
local inject_cmd="
|
local inject_cmd="
|
||||||
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
|
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
|
||||||
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
|
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
|
||||||
@ -376,12 +376,12 @@ else
|
|||||||
sudo rm -f '${intent_path}'
|
sudo rm -f '${intent_path}'
|
||||||
fi
|
fi
|
||||||
"
|
"
|
||||||
local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
|
local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"
|
||||||
|
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<inject shutdown intent>'"
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'"
|
log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<restore prior intent>'"
|
||||||
log "pass: startup-intent-guard (plan mode)"
|
log "pass: startup-intent-guard (plan mode)"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
@ -406,10 +406,10 @@ run_drill_controlled_cycle() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
log "running controlled shutdown cycle (poweroff disabled config)"
|
log "running controlled shutdown cycle (poweroff disabled config)"
|
||||||
run_hecate_shutdown "drill-controlled-cycle-shutdown"
|
run_ananke_shutdown "drill-controlled-cycle-shutdown"
|
||||||
|
|
||||||
log "running startup recovery cycle"
|
log "running startup recovery cycle"
|
||||||
run_hecate_startup_with_retry "drill-controlled-cycle-startup"
|
run_ananke_startup_with_retry "drill-controlled-cycle-startup"
|
||||||
|
|
||||||
log "verifying critical stack readiness after cycle"
|
log "verifying critical stack readiness after cycle"
|
||||||
wait_ready flux-system deployment source-controller 240s
|
wait_ready flux-system deployment source-controller 240s
|
||||||
@ -2,13 +2,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
if [[ "${EUID}" -ne 0 ]]; then
|
if [[ "${EUID}" -ne 0 ]]; then
|
||||||
echo "hecate-self-update.sh must run as root" >&2
|
echo "ananke-self-update.sh must run as root" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}"
|
REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}"
|
||||||
BRANCH="${HECATE_REPO_BRANCH:-main}"
|
BRANCH="${ANANKE_REPO_BRANCH:-main}"
|
||||||
REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}"
|
REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}"
|
||||||
|
|
||||||
mkdir -p "$(dirname "${REPO_DIR}")"
|
mkdir -p "$(dirname "${REPO_DIR}")"
|
||||||
if [[ ! -d "${REPO_DIR}/.git" ]]; then
|
if [[ ! -d "${REPO_DIR}/.git" ]]; then
|
||||||
Loading…
x
Reference in New Issue
Block a user