From aa9c7b69f3c4c958851ebb6039dc2b84552f25eb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 4 Apr 2026 05:50:38 -0300 Subject: [PATCH] hecate: add layered startup drills and rename UPS peers --- Makefile | 8 +- README.md | 19 +- configs/hecate.example.yaml | 7 +- configs/hecate.tethys.yaml | 5 +- configs/hecate.titan-db.yaml | 6 +- internal/cluster/orchestrator.go | 160 +++++++++++++++- internal/config/config.go | 1 + internal/config/config_test.go | 6 +- internal/metrics/exporter_test.go | 16 +- scripts/hecate-drills.sh | 295 ++++++++++++++++++++++++++++++ scripts/install.sh | 24 ++- 11 files changed, 519 insertions(+), 28 deletions(-) create mode 100755 scripts/hecate-drills.sh diff --git a/Makefile b/Makefile index 996496d..4fb47f9 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: build test fmt tidy install +.PHONY: build test fmt tidy install drill-list drill-run build: go build -o dist/hecate ./cmd/hecate @@ -14,3 +14,9 @@ tidy: install: sudo ./scripts/install.sh + +drill-list: + ./scripts/hecate-drills.sh list + +drill-run: + ./scripts/hecate-drills.sh run $(DRILL) --execute diff --git a/README.md b/README.md index e4867c1..8f68841 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ Installer knobs (optional): - `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host. - `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state. - `HECATE_MANAGE_NUT=0` skips writing NUT/udev files. -- `HECATE_NUT_UPS_NAME` (default `atlasups`) +- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`) - `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`) -- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `atlasupsmon`) +- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `hecateupsmon`) Bootstrap now (without reboot): @@ -64,8 +64,8 @@ sudo systemctl start hecate-bootstrap.service ## Multi-UPS topology Recommended: -- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution). -- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`. +- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`). +- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`. - If forwarding fails, fallback local shutdown can remain enabled. ## Config @@ -89,3 +89,14 @@ Power metrics: - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically. - `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts). - `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively. + +## Disruptive startup drills + +Hecate includes scripted disruptive drills that intentionally break critical services and verify startup recovery paths: + +- `scripts/hecate-drills.sh list` +- `scripts/hecate-drills.sh run flux-gitea-deadlock --execute` +- `scripts/hecate-drills.sh run foundation-recovery --execute` +- `scripts/hecate-drills.sh run reconciliation-resume --execute` + +These drills are intentionally **not** part of regular `go test ./...`. 
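As a quick post-install sanity check of the renamed peers, NUT's `upsc` client can query each UPS by its new name (a sketch, assuming `upsd` is running locally on each host with the defaults from this patch):

```sh
# On titan-db: the coordinator's UPS, renamed to pyrphoros
upsc pyrphoros@localhost ups.status
upsc pyrphoros@localhost battery.runtime

# On tethys: the peer's UPS, renamed to statera
upsc statera@localhost ups.status
```
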
diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index 1288ea3..fbcbbbd 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -10,6 +10,7 @@ control_planes: workers: [] local_bootstrap_paths: - infrastructure/core + - infrastructure/flux-system - infrastructure/sources/helm - infrastructure/metallb - infrastructure/traefik @@ -42,10 +43,10 @@ shutdown: ups: enabled: true provider: nut - target: atlasups@localhost + target: pyrphoros@localhost targets: - - name: db-ups - target: atlasups@localhost + - name: Pyrphoros + target: pyrphoros@localhost poll_seconds: 5 runtime_safety_factor: 1.10 debounce_count: 3 diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml index ecee181..519076e 100644 --- a/configs/hecate.tethys.yaml +++ b/configs/hecate.tethys.yaml @@ -34,8 +34,8 @@ ups: enabled: true provider: nut targets: - - name: tethys-ups - target: atlasups@localhost + - name: Statera + target: statera@localhost poll_seconds: 5 runtime_safety_factor: 1.10 debounce_count: 3 @@ -54,4 +54,3 @@ state: dir: /var/lib/hecate run_history_path: /var/lib/hecate/runs.json lock_path: /var/lib/hecate/hecate.lock - diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml index 67c40bd..3ed41d2 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/hecate.titan-db.yaml @@ -10,6 +10,7 @@ control_planes: workers: [] local_bootstrap_paths: - infrastructure/core + - infrastructure/flux-system - infrastructure/sources/helm - infrastructure/metallb - infrastructure/traefik @@ -43,8 +44,8 @@ ups: enabled: true provider: nut targets: - - name: db-ups - target: atlasups@localhost + - name: Pyrphoros + target: pyrphoros@localhost poll_seconds: 5 runtime_safety_factor: 1.10 debounce_count: 3 @@ -63,4 +64,3 @@ state: dir: /var/lib/hecate run_history_path: /var/lib/hecate/runs.json lock_path: /var/lib/hecate/hecate.lock - diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 02a7194..226bb8f 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "sort" + "strconv" "strings" "time" @@ -34,6 +35,22 @@ type ShutdownOptions struct { Reason string } +type startupWorkload struct { + Namespace string + Kind string + Name string +} + +var criticalStartupWorkloads = []startupWorkload{ + {Namespace: "flux-system", Kind: "deployment", Name: "source-controller"}, + {Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"}, + {Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"}, + {Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"}, + {Namespace: "vault", Kind: "statefulset", Name: "vault"}, + {Namespace: "postgres", Kind: "statefulset", Name: "postgres"}, + {Namespace: "gitea", Kind: "deployment", Name: "gitea"}, +} + func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator { return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger} } @@ -74,19 +91,53 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er } } + needsBootstrap := false + bootstrapReasons := []string{} if !opts.SkipLocalBootstrap { ready, readyErr := o.fluxSourceReady(ctx) if readyErr != nil { o.log.Printf("warning: unable to read flux source readiness: %v", readyErr) + needsBootstrap = true + bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed") } if !ready { - o.log.Printf("flux source not ready, 
applying local bootstrap path") + needsBootstrap = true + bootstrapReasons = append(bootstrapReasons, "flux source not ready") + } + + missing, missingErr := o.missingCriticalStartupWorkloads(ctx) + if missingErr != nil { + o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr) + needsBootstrap = true + bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed") + } + if len(missing) > 0 { + needsBootstrap = true + bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", ")) + } + + if needsBootstrap { + o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; ")) if err := o.bootstrapLocal(ctx); err != nil { return err } } } + if err := o.ensureCriticalStartupWorkloads(ctx); err != nil { + return err + } + + if !opts.SkipLocalBootstrap && needsBootstrap { + ready, err := o.fluxSourceReady(ctx) + if err != nil { + return fmt.Errorf("flux source readiness after bootstrap: %w", err) + } + if !ready { + return fmt.Errorf("flux source still not ready after local bootstrap") + } + } + if err := o.resumeFluxAndReconcile(ctx); err != nil { return err } @@ -419,6 +470,113 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) { } } +func (o *Orchestrator) missingCriticalStartupWorkloads(ctx context.Context) ([]string, error) { + missing := []string{} + for _, w := range criticalStartupWorkloads { + ready, err := o.workloadReady(ctx, w) + if err != nil { + if isNotFoundErr(err) { + missing = append(missing, fmt.Sprintf("%s/%s/%s(not found)", w.Namespace, w.Kind, w.Name)) + continue + } + return nil, fmt.Errorf("check %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err) + } + if !ready { + missing = append(missing, fmt.Sprintf("%s/%s/%s", w.Namespace, w.Kind, w.Name)) + } + } + return missing, nil +} + +func (o *Orchestrator) ensureCriticalStartupWorkloads(ctx context.Context) error { + for _, w := range criticalStartupWorkloads { + if err := o.ensureWorkloadReplicas(ctx, w, 1); err != nil { + if isNotFoundErr(err) { + o.log.Printf("warning: startup workload missing, skipping scale: %s/%s/%s", w.Namespace, w.Kind, w.Name) + continue + } + return fmt.Errorf("scale %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err) + } + if err := o.waitWorkloadReady(ctx, w); err != nil { + if isNotFoundErr(err) { + o.log.Printf("warning: startup workload missing during readiness wait: %s/%s/%s", w.Namespace, w.Kind, w.Name) + continue + } + return err + } + } + return nil +} + +func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWorkload, replicas int) error { + _, err := o.kubectl( + ctx, + 45*time.Second, + "-n", + w.Namespace, + "scale", + w.Kind, + w.Name, + fmt.Sprintf("--replicas=%d", replicas), + ) + return err +} + +func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) error { + timeout := "240s" + if w.Kind == "statefulset" { + timeout = "360s" + } + _, err := o.kubectl( + ctx, + 7*time.Minute, + "-n", + w.Namespace, + "rollout", + "status", + fmt.Sprintf("%s/%s", w.Kind, w.Name), + "--timeout="+timeout, + ) + if err != nil { + return fmt.Errorf("wait ready %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err) + } + return nil +} + +func (o *Orchestrator) workloadReady(ctx context.Context, w startupWorkload) (bool, error) { + out, err := o.kubectl( + ctx, + 20*time.Second, + "-n", + w.Namespace, + "get", + w.Kind, + w.Name, + "-o", + "jsonpath={.status.readyReplicas}", + ) + if err != nil { + return false, err + } + raw := 
strings.TrimSpace(out)
+	if raw == "" {
+		return false, nil
+	}
+	n, err := strconv.Atoi(raw)
+	if err != nil {
+		return false, fmt.Errorf("parse readyReplicas %q: %w", raw, err)
+	}
+	return n >= 1, nil
+}
+
+func isNotFoundErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	msg := strings.ToLower(err.Error())
+	return strings.Contains(msg, "not found") || strings.Contains(msg, "(notfound)")
+}
+
 func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
 	delay := o.cfg.Shutdown.PoweroffDelaySeconds
 	if delay <= 0 {
diff --git a/internal/config/config.go b/internal/config/config.go
index 4b33084..26d75a1 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -131,6 +131,7 @@ func defaults() Config {
 		ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
 		LocalBootstrapPaths: []string{
 			"infrastructure/core",
+			"infrastructure/flux-system",
 			"infrastructure/sources/helm",
 			"infrastructure/metallb",
 			"infrastructure/traefik",
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index c10e12b..29e8e3b 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -18,8 +18,8 @@ ups:
   enabled: true
   provider: nut
   targets:
-    - name: db
-      target: atlasups@localhost
+    - name: pyrphoros
+      target: pyrphoros@localhost
 shutdown:
   default_budget_seconds: 300
 state:
@@ -34,7 +34,7 @@ state:
 	if err != nil {
 		t.Fatalf("load config: %v", err)
 	}
-	if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" {
+	if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "pyrphoros@localhost" {
 		t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
 	}
 }
diff --git a/internal/metrics/exporter_test.go b/internal/metrics/exporter_test.go
index f0f3269..9c30c7f 100644
--- a/internal/metrics/exporter_test.go
+++ b/internal/metrics/exporter_test.go
@@ -11,8 +11,8 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
 	e := New()
 	e.UpdateBudget(321)
 	e.UpdateSample(Sample{
-		Name:          "db-ups",
-		Target:        "atlasups@localhost",
+		Name:          "Pyrphoros",
+		Target:        "pyrphoros@localhost",
 		OnBattery:     true,
 		LowBattery:    false,
 		RuntimeSecond: 412,
@@ -35,12 +35,12 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
 	mustContain := []string{
 		"hecate_shutdown_budget_seconds 321",
 		"hecate_shutdown_triggers_total 1",
-		"hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"",
-		"hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
-		"hecate_ups_battery_charge_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
-		"hecate_ups_load_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
-		"hecate_ups_power_nominal_watts{source=\"db-ups\",target=\"atlasups@localhost\"",
-		"hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
+		"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
+		"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
+		"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
+		"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
+		"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
+		"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
 	}
 	for _, m := range mustContain {
 		if !strings.Contains(body, m) {
diff --git a/scripts/hecate-drills.sh b/scripts/hecate-drills.sh
new file mode 100755
index 0000000..f9865e9
--- /dev/null
+++ b/scripts/hecate-drills.sh
@@ 
-0,0 +1,295 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+KUBECTL="${KUBECTL:-kubectl}"
+HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
+HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
+HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
+LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
+STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
+EXECUTE=0
+
+usage() {
+  cat <<'EOF'
+Usage:
+  scripts/hecate-drills.sh list
+  scripts/hecate-drills.sh run <drill> [--execute]
+
+Drills:
+  flux-gitea-deadlock    Simulate Flux controllers + gitea outage and require startup recovery.
+  foundation-recovery    Simulate vault/postgres/gitea outage and require layered restore.
+  reconciliation-resume  Simulate global Flux suspend + source-controller down and require resume.
+
+Notes:
+  - Drills are intentionally disruptive and are not part of regular `make test`.
+  - Use --execute to run live changes. Without it, this script prints planned actions only.
+EOF
+}
+
+log() {
+  printf '[drill] %s\n' "$*"
+}
+
+die() {
+  printf '[drill] ERROR: %s\n' "$*" >&2
+  exit 1
+}
+
+need_cmd() {
+  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
+}
+
+now_ts() {
+  date -u +%Y%m%dT%H%M%SZ
+}
+
+resource_key() {
+  local ns="$1" kind="$2" name="$3"
+  printf '%s|%s|%s' "$ns" "$kind" "$name"
+}
+
+get_replicas() {
+  local ns="$1" kind="$2" name="$3"
+  "${KUBECTL}" -n "$ns" get "$kind" "$name" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0"
+}
+
+scale_to() {
+  local ns="$1" kind="$2" name="$3" replicas="$4"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
+    return 0
+  fi
+  "${KUBECTL}" -n "$ns" scale "$kind" "$name" --replicas="${replicas}" >/dev/null
+}
+
+wait_ready() {
+  local ns="$1" kind="$2" name="$3" timeout="$4"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
+    return 0
+  fi
+  "${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
+}
+
+run_hecate_startup() {
+  local reason="$1"
+  local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main)
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
+    return 0
+  fi
+  timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
+}
+
+declare -A SNAPSHOT_REPLICAS=()
+SUSPENDED_KS_BEFORE=""
+SUSPENDED_HR_BEFORE=""
+
+snapshot_resources() {
+  local resources=("$@")
+  SNAPSHOT_REPLICAS=()
+  for res in "${resources[@]}"; do
+    IFS='|' read -r ns kind name _ <<<"${res}"
+    SNAPSHOT_REPLICAS["$(resource_key "$ns" "$kind" "$name")"]="$(get_replicas "$ns" "$kind" "$name")"
+  done
+}
+
+restore_resources() {
+  local resources=("$@")
+  for res in "${resources[@]}"; do
+    IFS='|' read -r ns kind name _ <<<"${res}"
+    local key
+    key="$(resource_key "$ns" "$kind" "$name")"
+    local replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
+    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
+    scale_to "$ns" "$kind" "$name" "$replicas" || true
+  done
+}
+
+record_flux_suspended_before() {
+  SUSPENDED_KS_BEFORE="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
+  SUSPENDED_HR_BEFORE="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || 
true)" +} + +set_flux_suspend_all() { + local value="$1" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "plan: patch all Flux kustomizations + helmreleases suspend=${value}" + return 0 + fi + local patch + patch="$(printf '{"spec":{"suspend":%s}}' "${value}")" + while read -r ks; do + [[ -z "${ks}" ]] && continue + "${KUBECTL}" -n flux-system patch kustomization "${ks}" --type=merge -p "${patch}" >/dev/null || true + done < <("${KUBECTL}" -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') + + while read -r hr; do + [[ -z "${hr}" ]] && continue + local ns="${hr%%/*}" + local name="${hr##*/}" + "${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true + done < <("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}') +} + +restore_flux_suspended_before() { + set_flux_suspend_all false + if [[ "${EXECUTE}" -eq 0 ]]; then + return 0 + fi + local patch='{"spec":{"suspend":true}}' + while read -r ref; do + [[ -z "${ref}" ]] && continue + local ns="${ref%%/*}" + local name="${ref##*/}" + "${KUBECTL}" -n "${ns}" patch kustomization "${name}" --type=merge -p "${patch}" >/dev/null || true + done <<<"${SUSPENDED_KS_BEFORE}" + while read -r ref; do + [[ -z "${ref}" ]] && continue + local ns="${ref%%/*}" + local name="${ref##*/}" + "${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true + done <<<"${SUSPENDED_HR_BEFORE}" +} + +verify_flux_unsuspended() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "plan: verify no Flux kustomizations/helmreleases remain suspended" + return 0 + fi + local ks_count hr_count + ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')" + hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')" + [[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}" + [[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}" +} + +write_log_header() { + local drill="$1" + mkdir -p "${LOG_DIR}" + local f="${LOG_DIR}/${drill}-$(now_ts).log" + exec > >(tee -a "${f}") 2>&1 + log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}" +} + +run_drill_flux_gitea_deadlock() { + local resources=( + "flux-system|deployment|source-controller|1" + "flux-system|deployment|kustomize-controller|1" + "flux-system|deployment|helm-controller|1" + "flux-system|deployment|notification-controller|1" + "gitea|deployment|gitea|1" + ) + snapshot_resources "${resources[@]}" + trap 'restore_resources "${resources[@]}"' ERR + + log "injecting outage: flux controllers + gitea" + for res in "${resources[@]}"; do + IFS='|' read -r ns kind name _ <<<"${res}" + scale_to "$ns" "$kind" "$name" 0 + done + + run_hecate_startup "drill-flux-gitea-deadlock" + + log "verifying recovery" + wait_ready flux-system deployment source-controller 240s + wait_ready flux-system deployment kustomize-controller 240s + wait_ready flux-system deployment helm-controller 240s + wait_ready flux-system deployment notification-controller 240s + wait_ready gitea deployment gitea 300s + log "pass: flux-gitea-deadlock" + trap - ERR +} + +run_drill_foundation_recovery() { + local resources=( + 
"vault|statefulset|vault|1" + "postgres|statefulset|postgres|1" + "gitea|deployment|gitea|1" + ) + snapshot_resources "${resources[@]}" + trap 'restore_resources "${resources[@]}"' ERR + + log "injecting outage: vault + postgres + gitea" + for res in "${resources[@]}"; do + IFS='|' read -r ns kind name _ <<<"${res}" + scale_to "$ns" "$kind" "$name" 0 + done + + run_hecate_startup "drill-foundation-recovery" + + log "verifying layered recovery" + wait_ready vault statefulset vault 420s + wait_ready postgres statefulset postgres 420s + wait_ready gitea deployment gitea 300s + log "pass: foundation-recovery" + trap - ERR +} + +run_drill_reconciliation_resume() { + local resources=("flux-system|deployment|source-controller|1") + snapshot_resources "${resources[@]}" + record_flux_suspended_before + trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR + + log "injecting outage: suspend all Flux objects + stop source-controller" + set_flux_suspend_all true + scale_to flux-system deployment source-controller 0 + + run_hecate_startup "drill-reconciliation-resume" + + log "verifying reconciliation resumed" + wait_ready flux-system deployment source-controller 240s + verify_flux_unsuspended + log "pass: reconciliation-resume" + trap - ERR +} + +main() { + need_cmd "${KUBECTL}" + need_cmd ssh + need_cmd timeout + + local cmd="${1:-}" + case "${cmd}" in + list) + usage + exit 0 + ;; + run) + shift || true + local drill="${1:-}" + [[ -n "${drill}" ]] || die "missing drill name" + shift || true + while [[ $# -gt 0 ]]; do + case "$1" in + --execute) EXECUTE=1 ;; + *) die "unknown option: $1" ;; + esac + shift + done + write_log_header "${drill}" + ;; + *) + usage + exit 2 + ;; + esac + + case "${drill}" in + flux-gitea-deadlock) + run_drill_flux_gitea_deadlock + ;; + foundation-recovery) + run_drill_foundation_recovery + ;; + reconciliation-resume) + run_drill_reconciliation_resume + ;; + *) + die "unknown drill: ${drill}" + ;; + esac +} + +main "$@" diff --git a/scripts/install.sh b/scripts/install.sh index 3eec6c4..f973251 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -16,11 +16,11 @@ START_NOW=1 INSTALL_DEPS=1 ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-auto}" MANAGE_NUT="${HECATE_MANAGE_NUT:-1}" -NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-atlasups}" +NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-}" NUT_VENDOR_ID="${HECATE_NUT_VENDOR_ID:-0764}" NUT_PRODUCT_ID="${HECATE_NUT_PRODUCT_ID:-0601}" NUT_MONITOR_USER="${HECATE_NUT_MONITOR_USER:-monuser}" -NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-atlasupsmon}" +NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-hecateupsmon}" while [[ $# -gt 0 ]]; do case "$1" in @@ -39,6 +39,25 @@ while [[ $# -gt 0 ]]; do esac done +resolve_nut_ups_name() { + if [[ -n "${NUT_UPS_NAME}" ]]; then + return 0 + fi + + if [[ -f "${CONF_DIR}/hecate.yaml" ]]; then + local target="" + target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/hecate.yaml" | head -n 1 | awk '{print $2}')" + if [[ -n "${target}" ]]; then + NUT_UPS_NAME="${target%@localhost}" + echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}" + return 0 + fi + fi + + NUT_UPS_NAME="pyrphoros" + echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}" +} + ensure_apt_packages() { local missing=() for pkg in "$@"; do @@ -179,6 +198,7 @@ install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-upda install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer" install -m 0755 
scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh" +resolve_nut_ups_name configure_nut systemctl daemon-reload
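For reference, running a drill without `--execute` only prints the planned actions (the `snapshot_resources` reads still require cluster access via `kubectl`). Based on the script above, a dry run might look roughly like this (a sketch reconstructed from the script's `plan:` logging, not captured output):

```sh
$ ./scripts/hecate-drills.sh run flux-gitea-deadlock
[drill] drill=flux-gitea-deadlock execute=0 coordinator=titan-db
[drill] injecting outage: flux controllers + gitea
[drill] plan: kubectl -n flux-system scale deployment source-controller --replicas=0
[drill] plan: kubectl -n flux-system scale deployment kustomize-controller --replicas=0
[drill] plan: kubectl -n flux-system scale deployment helm-controller --replicas=0
[drill] plan: kubectl -n flux-system scale deployment notification-controller --replicas=0
[drill] plan: kubectl -n gitea scale deployment gitea --replicas=0
[drill] plan: ssh titan-db 'sudo /usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main'
[drill] verifying recovery
[drill] plan: kubectl -n flux-system rollout status deployment/source-controller --timeout=240s
...
[drill] pass: flux-gitea-deadlock
```

In a live run, the `trap ... ERR` handlers restore the pre-drill replica counts (and any previously suspended Flux objects) if a drill fails partway, so an aborted drill does not leave the outage injected.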