hecate: add layered startup drills and rename UPS peers
This commit is contained in:
parent
138f816093
commit
aa9c7b69f3
8
Makefile
8
Makefile
@ -1,4 +1,4 @@
|
|||||||
.PHONY: build test fmt tidy install
|
.PHONY: build test fmt tidy install drill-list drill-run
|
||||||
|
|
||||||
build:
|
build:
|
||||||
go build -o dist/hecate ./cmd/hecate
|
go build -o dist/hecate ./cmd/hecate
|
||||||
@ -14,3 +14,9 @@ tidy:
|
|||||||
|
|
||||||
install:
|
install:
|
||||||
sudo ./scripts/install.sh
|
sudo ./scripts/install.sh
|
||||||
|
|
||||||
|
# List the available disruptive drills (prints the drill script's usage text).
drill-list:
	./scripts/hecate-drills.sh list

# Run one drill against the live cluster: make drill-run DRILL=<drill-name>
# Fails early when DRILL is empty; otherwise the script would receive
# "run --execute" and treat "--execute" as the drill name.
drill-run:
	$(if $(strip $(DRILL)),,$(error DRILL is required: make drill-run DRILL=<drill-name>))
	./scripts/hecate-drills.sh run $(DRILL) --execute
|
||||||
|
|||||||
19
README.md
19
README.md
@ -42,9 +42,9 @@ Installer knobs (optional):
|
|||||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
|
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
|
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
|
||||||
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
|
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
|
||||||
- `HECATE_NUT_UPS_NAME` (default `atlasups`)
|
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
|
||||||
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
|
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
|
||||||
- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `atlasupsmon`)
|
- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `hecateupsmon`)
|
||||||
|
|
||||||
Bootstrap now (without reboot):
|
Bootstrap now (without reboot):
|
||||||
|
|
||||||
@ -64,8 +64,8 @@ sudo systemctl start hecate-bootstrap.service
|
|||||||
## Multi-UPS topology
|
## Multi-UPS topology
|
||||||
|
|
||||||
Recommended:
|
Recommended:
|
||||||
- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution).
|
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
||||||
- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`.
|
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
||||||
- If forwarding fails, fallback local shutdown can remain enabled.
|
- If forwarding fails, fallback local shutdown can remain enabled.
|
||||||
|
|
||||||
## Config
|
## Config
|
||||||
@ -89,3 +89,14 @@ Power metrics:
|
|||||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
||||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||||
|
|
||||||
|
## Disruptive startup drills
|
||||||
|
|
||||||
|
Hecate includes scripted disruptive drills that intentionally break critical services and verify startup recovery paths:
|
||||||
|
|
||||||
|
- `scripts/hecate-drills.sh list`
|
||||||
|
- `scripts/hecate-drills.sh run flux-gitea-deadlock --execute`
|
||||||
|
- `scripts/hecate-drills.sh run foundation-recovery --execute`
|
||||||
|
- `scripts/hecate-drills.sh run reconciliation-resume --execute`
|
||||||
|
|
||||||
|
These drills are intentionally **not** part of regular `go test ./...`.
|
||||||
|
|||||||
@ -10,6 +10,7 @@ control_planes:
|
|||||||
workers: []
|
workers: []
|
||||||
local_bootstrap_paths:
|
local_bootstrap_paths:
|
||||||
- infrastructure/core
|
- infrastructure/core
|
||||||
|
- infrastructure/flux-system
|
||||||
- infrastructure/sources/helm
|
- infrastructure/sources/helm
|
||||||
- infrastructure/metallb
|
- infrastructure/metallb
|
||||||
- infrastructure/traefik
|
- infrastructure/traefik
|
||||||
@ -42,10 +43,10 @@ shutdown:
|
|||||||
ups:
|
ups:
|
||||||
enabled: true
|
enabled: true
|
||||||
provider: nut
|
provider: nut
|
||||||
target: atlasups@localhost
|
target: pyrphoros@localhost
|
||||||
targets:
|
targets:
|
||||||
- name: db-ups
|
- name: Pyrphoros
|
||||||
target: atlasups@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.10
|
runtime_safety_factor: 1.10
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
|
|||||||
@ -34,8 +34,8 @@ ups:
|
|||||||
enabled: true
|
enabled: true
|
||||||
provider: nut
|
provider: nut
|
||||||
targets:
|
targets:
|
||||||
- name: tethys-ups
|
- name: Statera
|
||||||
target: atlasups@localhost
|
target: statera@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.10
|
runtime_safety_factor: 1.10
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
@ -54,4 +54,3 @@ state:
|
|||||||
dir: /var/lib/hecate
|
dir: /var/lib/hecate
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/hecate/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/hecate/hecate.lock
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,7 @@ control_planes:
|
|||||||
workers: []
|
workers: []
|
||||||
local_bootstrap_paths:
|
local_bootstrap_paths:
|
||||||
- infrastructure/core
|
- infrastructure/core
|
||||||
|
- infrastructure/flux-system
|
||||||
- infrastructure/sources/helm
|
- infrastructure/sources/helm
|
||||||
- infrastructure/metallb
|
- infrastructure/metallb
|
||||||
- infrastructure/traefik
|
- infrastructure/traefik
|
||||||
@ -43,8 +44,8 @@ ups:
|
|||||||
enabled: true
|
enabled: true
|
||||||
provider: nut
|
provider: nut
|
||||||
targets:
|
targets:
|
||||||
- name: db-ups
|
- name: Pyrphoros
|
||||||
target: atlasups@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.10
|
runtime_safety_factor: 1.10
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
@ -63,4 +64,3 @@ state:
|
|||||||
dir: /var/lib/hecate
|
dir: /var/lib/hecate
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/hecate/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/hecate/hecate.lock
|
||||||
|
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -34,6 +35,22 @@ type ShutdownOptions struct {
|
|||||||
Reason string
|
Reason string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// startupWorkload identifies a single Kubernetes workload by its namespace,
// resource kind (as passed to kubectl, e.g. "deployment") and name.
type startupWorkload struct {
	Namespace, Kind, Name string
}
|
||||||
|
|
||||||
|
var criticalStartupWorkloads = []startupWorkload{
|
||||||
|
{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
|
||||||
|
{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
|
||||||
|
{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
|
||||||
|
{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
|
||||||
|
{Namespace: "vault", Kind: "statefulset", Name: "vault"},
|
||||||
|
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
|
||||||
|
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
|
||||||
|
}
|
||||||
|
|
||||||
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
||||||
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
||||||
}
|
}
|
||||||
@ -74,19 +91,53 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
needsBootstrap := false
|
||||||
|
bootstrapReasons := []string{}
|
||||||
if !opts.SkipLocalBootstrap {
|
if !opts.SkipLocalBootstrap {
|
||||||
ready, readyErr := o.fluxSourceReady(ctx)
|
ready, readyErr := o.fluxSourceReady(ctx)
|
||||||
if readyErr != nil {
|
if readyErr != nil {
|
||||||
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
|
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
|
||||||
|
needsBootstrap = true
|
||||||
|
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
|
||||||
}
|
}
|
||||||
if !ready {
|
if !ready {
|
||||||
o.log.Printf("flux source not ready, applying local bootstrap path")
|
needsBootstrap = true
|
||||||
|
bootstrapReasons = append(bootstrapReasons, "flux source not ready")
|
||||||
|
}
|
||||||
|
|
||||||
|
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
|
||||||
|
if missingErr != nil {
|
||||||
|
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
|
||||||
|
needsBootstrap = true
|
||||||
|
bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed")
|
||||||
|
}
|
||||||
|
if len(missing) > 0 {
|
||||||
|
needsBootstrap = true
|
||||||
|
bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
if needsBootstrap {
|
||||||
|
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
|
||||||
if err := o.bootstrapLocal(ctx); err != nil {
|
if err := o.bootstrapLocal(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if !opts.SkipLocalBootstrap && needsBootstrap {
|
||||||
|
ready, err := o.fluxSourceReady(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("flux source readiness after bootstrap: %w", err)
|
||||||
|
}
|
||||||
|
if !ready {
|
||||||
|
return fmt.Errorf("flux source still not ready after local bootstrap")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if err := o.resumeFluxAndReconcile(ctx); err != nil {
|
if err := o.resumeFluxAndReconcile(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -419,6 +470,113 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) missingCriticalStartupWorkloads(ctx context.Context) ([]string, error) {
|
||||||
|
missing := []string{}
|
||||||
|
for _, w := range criticalStartupWorkloads {
|
||||||
|
ready, err := o.workloadReady(ctx, w)
|
||||||
|
if err != nil {
|
||||||
|
if isNotFoundErr(err) {
|
||||||
|
missing = append(missing, fmt.Sprintf("%s/%s/%s(not found)", w.Namespace, w.Kind, w.Name))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("check %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
|
||||||
|
}
|
||||||
|
if !ready {
|
||||||
|
missing = append(missing, fmt.Sprintf("%s/%s/%s", w.Namespace, w.Kind, w.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return missing, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) ensureCriticalStartupWorkloads(ctx context.Context) error {
|
||||||
|
for _, w := range criticalStartupWorkloads {
|
||||||
|
if err := o.ensureWorkloadReplicas(ctx, w, 1); err != nil {
|
||||||
|
if isNotFoundErr(err) {
|
||||||
|
o.log.Printf("warning: startup workload missing, skipping scale: %s/%s/%s", w.Namespace, w.Kind, w.Name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return fmt.Errorf("scale %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
|
||||||
|
}
|
||||||
|
if err := o.waitWorkloadReady(ctx, w); err != nil {
|
||||||
|
if isNotFoundErr(err) {
|
||||||
|
o.log.Printf("warning: startup workload missing during readiness wait: %s/%s/%s", w.Namespace, w.Kind, w.Name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWorkload, replicas int) error {
|
||||||
|
_, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
45*time.Second,
|
||||||
|
"-n",
|
||||||
|
w.Namespace,
|
||||||
|
"scale",
|
||||||
|
w.Kind,
|
||||||
|
w.Name,
|
||||||
|
fmt.Sprintf("--replicas=%d", replicas),
|
||||||
|
)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) error {
|
||||||
|
timeout := "240s"
|
||||||
|
if w.Kind == "statefulset" {
|
||||||
|
timeout = "360s"
|
||||||
|
}
|
||||||
|
_, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
7*time.Minute,
|
||||||
|
"-n",
|
||||||
|
w.Namespace,
|
||||||
|
"rollout",
|
||||||
|
"status",
|
||||||
|
fmt.Sprintf("%s/%s", w.Kind, w.Name),
|
||||||
|
"--timeout="+timeout,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("wait ready %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) workloadReady(ctx context.Context, w startupWorkload) (bool, error) {
|
||||||
|
out, err := o.kubectl(
|
||||||
|
ctx,
|
||||||
|
20*time.Second,
|
||||||
|
"-n",
|
||||||
|
w.Namespace,
|
||||||
|
"get",
|
||||||
|
w.Kind,
|
||||||
|
w.Name,
|
||||||
|
"-o",
|
||||||
|
"jsonpath={.status.readyReplicas}",
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(out)
|
||||||
|
if raw == "" || raw == "<no value>" {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(raw)
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("parse readyReplicas %q: %w", raw, err)
|
||||||
|
}
|
||||||
|
return n >= 1, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// isNotFoundErr reports whether err's message, compared case-insensitively,
// contains "not found" or "(notfound)". A nil error is never a not-found
// error.
func isNotFoundErr(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())
	for _, marker := range []string{"not found", "(notfound)"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
|
func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
|
||||||
delay := o.cfg.Shutdown.PoweroffDelaySeconds
|
delay := o.cfg.Shutdown.PoweroffDelaySeconds
|
||||||
if delay <= 0 {
|
if delay <= 0 {
|
||||||
|
|||||||
@ -131,6 +131,7 @@ func defaults() Config {
|
|||||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||||
LocalBootstrapPaths: []string{
|
LocalBootstrapPaths: []string{
|
||||||
"infrastructure/core",
|
"infrastructure/core",
|
||||||
|
"infrastructure/flux-system",
|
||||||
"infrastructure/sources/helm",
|
"infrastructure/sources/helm",
|
||||||
"infrastructure/metallb",
|
"infrastructure/metallb",
|
||||||
"infrastructure/traefik",
|
"infrastructure/traefik",
|
||||||
|
|||||||
@ -18,8 +18,8 @@ ups:
|
|||||||
enabled: true
|
enabled: true
|
||||||
provider: nut
|
provider: nut
|
||||||
targets:
|
targets:
|
||||||
- name: db
|
- name: pyrphoros
|
||||||
target: atlasups@localhost
|
target: pyrphoros@localhost
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 300
|
||||||
state:
|
state:
|
||||||
@ -34,7 +34,7 @@ state:
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("load config: %v", err)
|
t.Fatalf("load config: %v", err)
|
||||||
}
|
}
|
||||||
if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" {
|
if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "pyrphoros@localhost" {
|
||||||
t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
|
t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,8 +11,8 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
|
|||||||
e := New()
|
e := New()
|
||||||
e.UpdateBudget(321)
|
e.UpdateBudget(321)
|
||||||
e.UpdateSample(Sample{
|
e.UpdateSample(Sample{
|
||||||
Name: "db-ups",
|
Name: "Pyrphoros",
|
||||||
Target: "atlasups@localhost",
|
Target: "pyrphoros@localhost",
|
||||||
OnBattery: true,
|
OnBattery: true,
|
||||||
LowBattery: false,
|
LowBattery: false,
|
||||||
RuntimeSecond: 412,
|
RuntimeSecond: 412,
|
||||||
@ -35,12 +35,12 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
|
|||||||
mustContain := []string{
|
mustContain := []string{
|
||||||
"hecate_shutdown_budget_seconds 321",
|
"hecate_shutdown_budget_seconds 321",
|
||||||
"hecate_shutdown_triggers_total 1",
|
"hecate_shutdown_triggers_total 1",
|
||||||
"hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"",
|
"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
|
"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_battery_charge_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
|
"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_load_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
|
"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_power_nominal_watts{source=\"db-ups\",target=\"atlasups@localhost\"",
|
"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
"hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
|
"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||||
}
|
}
|
||||||
for _, m := range mustContain {
|
for _, m := range mustContain {
|
||||||
if !strings.Contains(body, m) {
|
if !strings.Contains(body, m) {
|
||||||
|
|||||||
295
scripts/hecate-drills.sh
Executable file
295
scripts/hecate-drills.sh
Executable file
@ -0,0 +1,295 @@
|
|||||||
|
#!/usr/bin/env bash
# Disruptive startup drills for Hecate: break selected workloads, trigger a
# Hecate startup on the coordinator host, and verify the cluster recovers.
#
# -E (errtrace) is required in addition to -e: the drill functions install
# rollback handlers with `trap ... ERR`, and without errtrace an ERR trap is
# not inherited by the helper functions they call, so a failure inside a
# helper would exit the script without running the rollback.
set -Eeuo pipefail

# Tunables; each can be overridden from the environment.
KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"

# 0 = only print planned actions, 1 = apply them (set by --execute).
EXECUTE=0
|
||||||
|
|
||||||
|
# Print the usage text (commands, drill names and notes) to stdout.
usage() {
|
||||||
|
cat <<'EOF'
|
||||||
|
Usage:
|
||||||
|
scripts/hecate-drills.sh list
|
||||||
|
scripts/hecate-drills.sh run <drill-name> [--execute]
|
||||||
|
|
||||||
|
Drills:
|
||||||
|
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
|
||||||
|
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
|
||||||
|
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Drills are intentionally disruptive and are not part of regular `make test`.
|
||||||
|
- Use --execute to run live changes. Without it, this script prints planned actions only.
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
# Write one line to stdout: the "[drill] " prefix followed by all arguments
# joined with spaces.
log() {
  printf '%s\n' "[drill] $*"
}
|
||||||
|
|
||||||
|
# Write "[drill] ERROR: <arguments>" to stderr and terminate the script with
# exit status 1.
die() {
  printf '%s\n' "[drill] ERROR: $*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# Abort via die unless the command named by $1 can be resolved by the shell.
need_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing required command: $1"
  fi
}
|
||||||
|
|
||||||
|
# Print the current UTC time as a compact timestamp, e.g. 20240131T235959Z.
now_ts() {
  local stamp_format='+%Y%m%dT%H%M%SZ'
  date -u "${stamp_format}"
}
|
||||||
|
|
||||||
|
# Print the snapshot lookup key for a workload: "<namespace>|<kind>|<name>",
# without a trailing newline.
resource_key() {
  local namespace="$1" workload_kind="$2" workload_name="$3"
  printf '%s' "${namespace}|${workload_kind}|${workload_name}"
}
|
||||||
|
|
||||||
|
# Print the current .spec.replicas of a workload, or nothing when it cannot
# be determined.
#
# Arguments: namespace, kind, name.
#
# A failed lookup deliberately prints nothing instead of "0": the value is
# used as the rollback target, and recording 0 for a workload that merely
# could not be queried would make the rollback scale it down to zero.
# Non-numeric output (for example partial output from a failed call) is
# discarded for the same reason. Always returns success.
get_replicas() {
  local ns="$1" kind="$2" name="$3" replicas=""
  replicas="$("${KUBECTL}" -n "$ns" get "$kind" "$name" -o jsonpath='{.spec.replicas}' 2>/dev/null)" || replicas=""
  if [[ "${replicas}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${replicas}"
  fi
}
|
||||||
|
|
||||||
|
# Scale a workload to a replica count.
#
# Arguments: namespace, kind, name, replicas.
# With EXECUTE=0 the kubectl command is only logged as a plan line; otherwise
# it is run with its stdout discarded.
scale_to() {
  local ns="$1" kind="$2" name="$3" replicas="$4"
  if [[ "${EXECUTE}" -ne 0 ]]; then
    "${KUBECTL}" -n "$ns" scale "$kind" "$name" --replicas="${replicas}" >/dev/null
  else
    log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
    return 0
  fi
}
|
||||||
|
|
||||||
|
# Wait for a workload rollout to complete.
#
# Arguments: namespace, kind, name, timeout (passed to kubectl --timeout).
# With EXECUTE=0 the kubectl command is only logged as a plan line; otherwise
# "kubectl rollout status" is run with its stdout discarded.
wait_ready() {
  local ns="$1" kind="$2" name="$3" timeout="$4"
  if [[ "${EXECUTE}" -ne 0 ]]; then
    "${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
  else
    log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
    return 0
  fi
}
|
||||||
|
|
||||||
|
# Run "hecate startup" on the coordinator host over ssh, bounded by
# STARTUP_TIMEOUT_SECONDS. With EXECUTE=0 the ssh command is only logged.
#
# Arguments: reason.
# NOTE(review): the reason argument is stored but never passed on to the
# remote command -- confirm whether it was meant to be forwarded.
run_hecate_startup() {
  local reason="$1"
  local -a remote_cmd=(
    sudo "${HECATE_BIN}" startup
    --config "${HECATE_CONFIG}"
    --execute
    --force-flux-branch main
  )
  if [[ "${EXECUTE}" -ne 0 ]]; then
    timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${remote_cmd[@]}"
  else
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${remote_cmd[*]}'"
    return 0
  fi
}
|
||||||
|
|
||||||
|
# Pre-drill state used for rollback.
# Newline-separated "namespace/name" references of the Flux objects that were
# already suspended before the drill (filled by record_flux_suspended_before).
SUSPENDED_KS_BEFORE=''
SUSPENDED_HR_BEFORE=''
# Replica count per workload, keyed by the output of resource_key.
declare -A SNAPSHOT_REPLICAS=()
|
||||||
|
|
||||||
|
# Reset SNAPSHOT_REPLICAS and record the current replica count of every given
# resource.
#
# Arguments: resource descriptors of the form "namespace|kind|name|...";
# fields after the third are ignored.
snapshot_resources() {
  local key count
  SNAPSHOT_REPLICAS=()
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    key="$(resource_key "$ns" "$kind" "$name")"
    count="$(get_replicas "$ns" "$kind" "$name")"
    SNAPSHOT_REPLICAS["${key}"]="${count}"
  done
}
|
||||||
|
|
||||||
|
# Scale every given resource back to the replica count stored in
# SNAPSHOT_REPLICAS, using 1 when no (or an empty) value was recorded.
# Scaling failures are ignored so the remaining resources are still restored.
#
# Arguments: resource descriptors of the form "namespace|kind|name|...".
restore_resources() {
  local key replicas
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    key="$(resource_key "$ns" "$kind" "$name")"
    replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    scale_to "$ns" "$kind" "$name" "$replicas" || true
  done
}
|
||||||
|
|
||||||
|
# Remember which Flux kustomizations and helmreleases (all namespaces) are
# currently suspended, as newline-separated "namespace/name" references in
# SUSPENDED_KS_BEFORE and SUSPENDED_HR_BEFORE. A failed query is tolerated.
record_flux_suspended_before() {
  local suspended_refs='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'
  SUSPENDED_KS_BEFORE="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${suspended_refs}" || true)"
  SUSPENDED_HR_BEFORE="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${suspended_refs}" || true)"
}
|
||||||
|
|
||||||
|
# Set spec.suspend on every Flux kustomization and helmrelease in all
# namespaces.
#
# Arguments: value ("true" or "false"), inserted verbatim into the JSON patch.
# With EXECUTE=0 the action is only logged.
#
# Kustomizations are listed across all namespaces (-A), the same scope that
# record_flux_suspended_before and verify_flux_unsuspended use. Individual
# patch failures are ignored (best effort). A failed listing is reported as a
# warning instead of being silently skipped, and does not abort the function
# because it is also used during rollback.
set_flux_suspend_all() {
  local value="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi

  local patch refs_path ks_refs hr_refs ref
  patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"
  refs_path='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}'

  ks_refs="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${refs_path}")" || {
    log "warning: unable to list Flux kustomizations; none patched" >&2
    ks_refs=""
  }
  hr_refs="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${refs_path}")" || {
    log "warning: unable to list Flux helmreleases; none patched" >&2
    hr_refs=""
  }

  while read -r ref; do
    [[ -z "${ref}" ]] && continue
    "${KUBECTL}" -n "${ref%%/*}" patch kustomization "${ref##*/}" --type=merge -p "${patch}" >/dev/null || true
  done <<<"${ks_refs}"

  while read -r ref; do
    [[ -z "${ref}" ]] && continue
    "${KUBECTL}" -n "${ref%%/*}" patch helmrelease "${ref##*/}" --type=merge -p "${patch}" >/dev/null || true
  done <<<"${hr_refs}"
}
|
||||||
|
|
||||||
|
# Roll Flux suspension back to its pre-drill state: un-suspend everything via
# set_flux_suspend_all, then re-suspend the kustomizations and helmreleases
# listed in SUSPENDED_KS_BEFORE / SUSPENDED_HR_BEFORE. Patch failures are
# ignored. With EXECUTE=0 only the plan line from set_flux_suspend_all is
# produced.
restore_flux_suspended_before() {
  set_flux_suspend_all false
  if [[ "${EXECUTE}" -eq 0 ]]; then
    return 0
  fi

  local patch='{"spec":{"suspend":true}}'
  local flux_kind refs
  for flux_kind in kustomization helmrelease; do
    if [[ "${flux_kind}" == "kustomization" ]]; then
      refs="${SUSPENDED_KS_BEFORE}"
    else
      refs="${SUSPENDED_HR_BEFORE}"
    fi
    while read -r ref; do
      [[ -z "${ref}" ]] && continue
      local ns="${ref%%/*}"
      local name="${ref##*/}"
      "${KUBECTL}" -n "${ns}" patch "${flux_kind}" "${name}" --type=merge -p "${patch}" >/dev/null || true
    done <<<"${refs}"
  done
}
|
||||||
|
|
||||||
|
# Fail the drill (via die) if any Flux kustomization or helmrelease in any
# namespace still has spec.suspend set to true. With EXECUTE=0 the check is
# only logged.
verify_flux_unsuspended() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify no Flux kustomizations/helmreleases remain suspended"
    return 0
  fi

  local suspended_names='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}'
  local ks_count hr_count
  # Count non-empty output lines; tr strips the padding some wc builds emit.
  ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${suspended_names}" | sed '/^$/d' | wc -l | tr -d ' ')"
  hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${suspended_names}" | sed '/^$/d' | wc -l | tr -d ' ')"

  if [[ "${ks_count}" != "0" ]]; then
    die "kustomizations still suspended: ${ks_count}"
  fi
  if [[ "${hr_count}" != "0" ]]; then
    die "helmreleases still suspended: ${hr_count}"
  fi
}
|
||||||
|
|
||||||
|
# Start logging for a drill run: create LOG_DIR, redirect the script's stdout
# and stderr through tee into "<LOG_DIR>/<drill>-<timestamp>.log" (appending),
# and log a header line with the run parameters.
#
# Arguments: drill name.
write_log_header() {
  local drill_name="$1" stamp log_file
  mkdir -p "${LOG_DIR}"
  stamp="$(now_ts)"
  log_file="${LOG_DIR}/${drill_name}-${stamp}.log"
  exec > >(tee -a "${log_file}") 2>&1
  log "drill=${drill_name} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}
|
||||||
|
|
||||||
|
# Drill: scale the four Flux controllers and gitea to zero, run a Hecate
# startup, then require every one of them to roll out again. While the drill
# is in progress an ERR trap scales the resources back to their snapshot.
run_drill_flux_gitea_deadlock() {
  local resources=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  local wait_timeout

  snapshot_resources "${resources[@]}"
  trap 'restore_resources "${resources[@]}"' ERR

  log "injecting outage: flux controllers + gitea"
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-flux-gitea-deadlock"

  log "verifying recovery"
  # Same order as the outage list; gitea gets a longer rollout timeout.
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    case "${ns}" in
      gitea) wait_timeout=300s ;;
      *) wait_timeout=240s ;;
    esac
    wait_ready "$ns" "$kind" "$name" "${wait_timeout}"
  done

  log "pass: flux-gitea-deadlock"
  trap - ERR
}
|
||||||
|
|
||||||
|
# Drill: scale vault, postgres and gitea to zero, run a Hecate startup, then
# require all three to roll out again. While the drill is in progress an ERR
# trap scales the resources back to their snapshot.
run_drill_foundation_recovery() {
  local resources=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  local wait_timeout

  snapshot_resources "${resources[@]}"
  trap 'restore_resources "${resources[@]}"' ERR

  log "injecting outage: vault + postgres + gitea"
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-foundation-recovery"

  log "verifying layered recovery"
  # Same order as the outage list; statefulsets get a longer rollout timeout.
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    case "${kind}" in
      statefulset) wait_timeout=420s ;;
      *) wait_timeout=300s ;;
    esac
    wait_ready "$ns" "$kind" "$name" "${wait_timeout}"
  done

  log "pass: foundation-recovery"
  trap - ERR
}
|
||||||
|
|
||||||
|
# Drill: suspend all Flux objects and stop source-controller, run a Hecate
# startup, then require source-controller to roll out and no Flux object to
# remain suspended. While the drill is in progress an ERR trap restores the
# previous suspension state and replica counts.
run_drill_reconciliation_resume() {
  local resources=(
    "flux-system|deployment|source-controller|1"
  )

  # Capture the pre-drill state before installing the rollback handler.
  snapshot_resources "${resources[@]}"
  record_flux_suspended_before
  trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR

  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to flux-system deployment source-controller 0

  run_hecate_startup "drill-reconciliation-resume"

  log "verifying reconciliation resumed"
  wait_ready flux-system deployment source-controller 240s
  verify_flux_unsuspended

  log "pass: reconciliation-resume"
  trap - ERR
}
|
||||||
|
|
||||||
|
# Entry point: parse the command line and dispatch.
#
#   list                         print the usage text and exit 0
#   run <drill-name> [--execute] run one drill (plan only without --execute)
#   anything else                print the usage text and exit 2
#
# The drill name is validated before any log file is created, and the
# external tools are only required for "run", so "list" and the usage text
# work on hosts without kubectl/ssh/timeout.
main() {
  local cmd="${1:-}"
  local drill=""
  case "${cmd}" in
    list)
      usage
      exit 0
      ;;
    run)
      shift || true
      drill="${1:-}"
      [[ -n "${drill}" ]] || die "missing drill name"
      shift || true
      while [[ $# -gt 0 ]]; do
        case "$1" in
          --execute) EXECUTE=1 ;;
          *) die "unknown option: $1" ;;
        esac
        shift
      done
      ;;
    *)
      usage
      exit 2
      ;;
  esac

  # Resolve the drill to its implementation before touching the filesystem.
  local drill_fn
  case "${drill}" in
    flux-gitea-deadlock) drill_fn=run_drill_flux_gitea_deadlock ;;
    foundation-recovery) drill_fn=run_drill_foundation_recovery ;;
    reconciliation-resume) drill_fn=run_drill_reconciliation_resume ;;
    *) die "unknown drill: ${drill}" ;;
  esac

  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout

  write_log_header "${drill}"
  "${drill_fn}"
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
@ -16,11 +16,11 @@ START_NOW=1
|
|||||||
INSTALL_DEPS=1
|
INSTALL_DEPS=1
|
||||||
ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-auto}"
|
ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-auto}"
|
||||||
MANAGE_NUT="${HECATE_MANAGE_NUT:-1}"
|
MANAGE_NUT="${HECATE_MANAGE_NUT:-1}"
|
||||||
NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-atlasups}"
|
NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-}"
|
||||||
NUT_VENDOR_ID="${HECATE_NUT_VENDOR_ID:-0764}"
|
NUT_VENDOR_ID="${HECATE_NUT_VENDOR_ID:-0764}"
|
||||||
NUT_PRODUCT_ID="${HECATE_NUT_PRODUCT_ID:-0601}"
|
NUT_PRODUCT_ID="${HECATE_NUT_PRODUCT_ID:-0601}"
|
||||||
NUT_MONITOR_USER="${HECATE_NUT_MONITOR_USER:-monuser}"
|
NUT_MONITOR_USER="${HECATE_NUT_MONITOR_USER:-monuser}"
|
||||||
NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-atlasupsmon}"
|
NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-hecateupsmon}"
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@ -39,6 +39,25 @@ while [[ $# -gt 0 ]]; do
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Decide the NUT UPS name and store it in NUT_UPS_NAME.
#
# Precedence:
#   1. a non-empty NUT_UPS_NAME is kept as is;
#   2. otherwise the name part of the first "target: <name>@localhost" entry
#      in ${CONF_DIR}/hecate.yaml;
#   3. otherwise the default "pyrphoros".
#
# The lookup pipeline is guarded with `|| true`: grep exits non-zero when the
# config has no matching target, and if the script runs with errexit/pipefail
# that would abort the installer instead of reaching the default.
resolve_nut_ups_name() {
  if [[ -n "${NUT_UPS_NAME}" ]]; then
    return 0
  fi

  if [[ -f "${CONF_DIR}/hecate.yaml" ]]; then
    local target=""
    target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/hecate.yaml" | head -n 1 | awk '{print $2}' || true)"
    if [[ -n "${target}" ]]; then
      NUT_UPS_NAME="${target%@localhost}"
      echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
      return 0
    fi
  fi

  NUT_UPS_NAME="pyrphoros"
  echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
|
||||||
|
|
||||||
ensure_apt_packages() {
|
ensure_apt_packages() {
|
||||||
local missing=()
|
local missing=()
|
||||||
for pkg in "$@"; do
|
for pkg in "$@"; do
|
||||||
@ -179,6 +198,7 @@ install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-upda
|
|||||||
install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer"
|
install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer"
|
||||||
install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh"
|
install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh"
|
||||||
|
|
||||||
|
resolve_nut_ups_name
|
||||||
configure_nut
|
configure_nut
|
||||||
|
|
||||||
systemctl daemon-reload
|
systemctl daemon-reload
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user