hecate: add layered startup drills and rename UPS peers

Brad Stein 2026-04-04 05:50:38 -03:00
parent 138f816093
commit aa9c7b69f3
11 changed files with 519 additions and 28 deletions

View File

@ -1,4 +1,4 @@
-.PHONY: build test fmt tidy install
+.PHONY: build test fmt tidy install drill-list drill-run
build:
go build -o dist/hecate ./cmd/hecate
@ -14,3 +14,9 @@ tidy:
install:
sudo ./scripts/install.sh
drill-list:
./scripts/hecate-drills.sh list
drill-run:
./scripts/hecate-drills.sh run $(DRILL) --execute
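The new targets are thin wrappers around the drill script; as a sketch (drill names are defined in `scripts/hecate-drills.sh` below), a typical invocation looks like:

```bash
# List the available drills, then run one.
# Note: drill-run always passes --execute, so this makes live changes.
make drill-list
make drill-run DRILL=flux-gitea-deadlock
```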

View File

@ -42,9 +42,9 @@ Installer knobs (optional):
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
-- `HECATE_NUT_UPS_NAME` (default `atlasups`)
+- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
-- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `atlasupsmon`)
+- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `hecateupsmon`)
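Since these knobs are ordinary environment variables read by `scripts/install.sh`, an install on a non-coordinator peer could look roughly like this (values are illustrative, not prescribed defaults):

```bash
# Illustrative only: install on a peer host with its own UPS name,
# keeping the bootstrap unit disabled.
sudo env \
  HECATE_NUT_UPS_NAME=statera \
  HECATE_NUT_MONITOR_PASSWORD=hecateupsmon \
  HECATE_ENABLE_BOOTSTRAP=0 \
  ./scripts/install.sh
```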
Bootstrap now (without reboot):
@ -64,8 +64,8 @@ sudo systemctl start hecate-bootstrap.service
## Multi-UPS topology
Recommended:
-- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution).
+- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
-- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`.
+- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
- If forwarding fails, fallback local shutdown can remain enabled.
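To sanity-check the renamed peers, assuming NUT's standard `upsc` client is available on each host, something like this should answer from the local `upsd`:

```bash
# On titan-db (coordinator): query the UPS named pyrphoros.
upsc pyrphoros@localhost ups.status

# On tethys (peer): query the UPS named statera.
upsc statera@localhost ups.status
```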
## Config
@ -89,3 +89,14 @@ Power metrics:
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
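A quick way to confirm both units on a host, assuming a standard systemd setup as written by `scripts/install.sh`:

```bash
# Is staged startup armed for the next boot, and when does self-update fire next?
systemctl is-enabled hecate-bootstrap.service
systemctl list-timers hecate-update.timer
```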
## Disruptive startup drills
Hecate includes scripted disruptive drills that intentionally break critical services and verify startup recovery paths:
- `scripts/hecate-drills.sh list`
- `scripts/hecate-drills.sh run flux-gitea-deadlock --execute`
- `scripts/hecate-drills.sh run foundation-recovery --execute`
- `scripts/hecate-drills.sh run reconciliation-resume --execute`
These drills are intentionally **not** part of regular `go test ./...`.
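Per the script's usage notes, omitting `--execute` is a dry run, so a safe first pass might be:

```bash
# Prints the planned kubectl/ssh actions without changing anything in the cluster.
./scripts/hecate-drills.sh run foundation-recovery
```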

View File

@ -10,6 +10,7 @@ control_planes:
workers: []
local_bootstrap_paths:
- infrastructure/core
- infrastructure/flux-system
- infrastructure/sources/helm
- infrastructure/metallb
- infrastructure/traefik
@ -42,10 +43,10 @@ shutdown:
ups:
enabled: true
provider: nut
-target: atlasups@localhost
+target: pyrphoros@localhost
targets:
-- name: db-ups
+- name: Pyrphoros
-target: atlasups@localhost
+target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
debounce_count: 3

View File

@ -34,8 +34,8 @@ ups:
enabled: true
provider: nut
targets:
-- name: tethys-ups
+- name: Statera
-target: atlasups@localhost
+target: statera@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
debounce_count: 3
@ -54,4 +54,3 @@ state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock

View File

@ -10,6 +10,7 @@ control_planes:
workers: []
local_bootstrap_paths:
- infrastructure/core
- infrastructure/flux-system
- infrastructure/sources/helm
- infrastructure/metallb
- infrastructure/traefik
@ -43,8 +44,8 @@ ups:
enabled: true
provider: nut
targets:
-- name: db-ups
+- name: Pyrphoros
-target: atlasups@localhost
+target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
debounce_count: 3
@ -63,4 +64,3 @@ state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock

View File

@ -7,6 +7,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"sort" "sort"
"strconv"
"strings" "strings"
"time" "time"
@ -34,6 +35,22 @@ type ShutdownOptions struct {
Reason string
}
type startupWorkload struct {
Namespace string
Kind string
Name string
}
var criticalStartupWorkloads = []startupWorkload{
{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
{Namespace: "vault", Kind: "statefulset", Name: "vault"},
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
}
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
}
@ -74,19 +91,53 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
}
needsBootstrap := false
bootstrapReasons := []string{}
if !opts.SkipLocalBootstrap {
ready, readyErr := o.fluxSourceReady(ctx)
if readyErr != nil {
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
needsBootstrap = true
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
}
if !ready {
-o.log.Printf("flux source not ready, applying local bootstrap path")
+needsBootstrap = true
bootstrapReasons = append(bootstrapReasons, "flux source not ready")
}
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
if missingErr != nil {
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
needsBootstrap = true
bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed")
}
if len(missing) > 0 {
needsBootstrap = true
bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", "))
}
if needsBootstrap {
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
if err := o.bootstrapLocal(ctx); err != nil {
return err
}
}
}
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
return err
}
if !opts.SkipLocalBootstrap && needsBootstrap {
ready, err := o.fluxSourceReady(ctx)
if err != nil {
return fmt.Errorf("flux source readiness after bootstrap: %w", err)
}
if !ready {
return fmt.Errorf("flux source still not ready after local bootstrap")
}
}
if err := o.resumeFluxAndReconcile(ctx); err != nil {
return err
}
@ -419,6 +470,113 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) {
}
}
func (o *Orchestrator) missingCriticalStartupWorkloads(ctx context.Context) ([]string, error) {
missing := []string{}
for _, w := range criticalStartupWorkloads {
ready, err := o.workloadReady(ctx, w)
if err != nil {
if isNotFoundErr(err) {
missing = append(missing, fmt.Sprintf("%s/%s/%s(not found)", w.Namespace, w.Kind, w.Name))
continue
}
return nil, fmt.Errorf("check %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
}
if !ready {
missing = append(missing, fmt.Sprintf("%s/%s/%s", w.Namespace, w.Kind, w.Name))
}
}
return missing, nil
}
func (o *Orchestrator) ensureCriticalStartupWorkloads(ctx context.Context) error {
for _, w := range criticalStartupWorkloads {
if err := o.ensureWorkloadReplicas(ctx, w, 1); err != nil {
if isNotFoundErr(err) {
o.log.Printf("warning: startup workload missing, skipping scale: %s/%s/%s", w.Namespace, w.Kind, w.Name)
continue
}
return fmt.Errorf("scale %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
}
if err := o.waitWorkloadReady(ctx, w); err != nil {
if isNotFoundErr(err) {
o.log.Printf("warning: startup workload missing during readiness wait: %s/%s/%s", w.Namespace, w.Kind, w.Name)
continue
}
return err
}
}
return nil
}
func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWorkload, replicas int) error {
_, err := o.kubectl(
ctx,
45*time.Second,
"-n",
w.Namespace,
"scale",
w.Kind,
w.Name,
fmt.Sprintf("--replicas=%d", replicas),
)
return err
}
func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) error {
timeout := "240s"
if w.Kind == "statefulset" {
timeout = "360s"
}
_, err := o.kubectl(
ctx,
7*time.Minute,
"-n",
w.Namespace,
"rollout",
"status",
fmt.Sprintf("%s/%s", w.Kind, w.Name),
"--timeout="+timeout,
)
if err != nil {
return fmt.Errorf("wait ready %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
}
return nil
}
func (o *Orchestrator) workloadReady(ctx context.Context, w startupWorkload) (bool, error) {
out, err := o.kubectl(
ctx,
20*time.Second,
"-n",
w.Namespace,
"get",
w.Kind,
w.Name,
"-o",
"jsonpath={.status.readyReplicas}",
)
if err != nil {
return false, err
}
raw := strings.TrimSpace(out)
if raw == "" || raw == "<no value>" {
return false, nil
}
n, err := strconv.Atoi(raw)
if err != nil {
return false, fmt.Errorf("parse readyReplicas %q: %w", raw, err)
}
return n >= 1, nil
}
func isNotFoundErr(err error) bool {
if err == nil {
return false
}
msg := strings.ToLower(err.Error())
return strings.Contains(msg, "not found") || strings.Contains(msg, "(notfound)")
}
func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
delay := o.cfg.Shutdown.PoweroffDelaySeconds
if delay <= 0 {
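The startup-workload helpers above shell out to `kubectl`; roughly, the manual equivalents for one of the listed workloads would be (a sketch mirroring the arguments used in `workloadReady`, `ensureWorkloadReplicas`, and `waitWorkloadReady`):

```bash
# Readiness check used to decide whether bootstrap is needed (readyReplicas >= 1).
kubectl -n vault get statefulset vault -o 'jsonpath={.status.readyReplicas}'

# Force the workload back up and wait for rollout, as ensureCriticalStartupWorkloads does.
kubectl -n vault scale statefulset vault --replicas=1
kubectl -n vault rollout status statefulset/vault --timeout=360s
```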

View File

@ -131,6 +131,7 @@ func defaults() Config {
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
LocalBootstrapPaths: []string{
"infrastructure/core",
"infrastructure/flux-system",
"infrastructure/sources/helm",
"infrastructure/metallb",
"infrastructure/traefik",

View File

@ -18,8 +18,8 @@ ups:
enabled: true
provider: nut
targets:
-- name: db
+- name: pyrphoros
-target: atlasups@localhost
+target: pyrphoros@localhost
shutdown:
default_budget_seconds: 300
state:
@ -34,7 +34,7 @@ state:
if err != nil {
t.Fatalf("load config: %v", err)
}
-if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" {
+if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "pyrphoros@localhost" {
t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
}
}

View File

@ -11,8 +11,8 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
e := New()
e.UpdateBudget(321)
e.UpdateSample(Sample{
-Name: "db-ups",
+Name: "Pyrphoros",
-Target: "atlasups@localhost",
+Target: "pyrphoros@localhost",
OnBattery: true,
LowBattery: false,
RuntimeSecond: 412,
@ -35,12 +35,12 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
mustContain := []string{
"hecate_shutdown_budget_seconds 321",
"hecate_shutdown_triggers_total 1",
-"hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"",
+"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
-"hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
+"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
-"hecate_ups_battery_charge_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
+"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
-"hecate_ups_load_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
+"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
-"hecate_ups_power_nominal_watts{source=\"db-ups\",target=\"atlasups@localhost\"",
+"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
-"hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
+"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
}
for _, m := range mustContain {
if !strings.Contains(body, m) {

scripts/hecate-drills.sh (new executable file, 295 lines)
View File

@ -0,0 +1,295 @@
#!/usr/bin/env bash
set -euo pipefail
KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
EXECUTE=0
usage() {
cat <<'EOF'
Usage:
scripts/hecate-drills.sh list
scripts/hecate-drills.sh run <drill-name> [--execute]
Drills:
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
Notes:
- Drills are intentionally disruptive and are not part of regular `make test`.
- Use --execute to run live changes. Without it, this script prints planned actions only.
EOF
}
log() {
printf '[drill] %s\n' "$*"
}
die() {
printf '[drill] ERROR: %s\n' "$*" >&2
exit 1
}
need_cmd() {
command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}
now_ts() {
date -u +%Y%m%dT%H%M%SZ
}
resource_key() {
local ns="$1" kind="$2" name="$3"
printf '%s|%s|%s' "$ns" "$kind" "$name"
}
get_replicas() {
local ns="$1" kind="$2" name="$3"
"${KUBECTL}" -n "$ns" get "$kind" "$name" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0"
}
scale_to() {
local ns="$1" kind="$2" name="$3" replicas="$4"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
return 0
fi
"${KUBECTL}" -n "$ns" scale "$kind" "$name" --replicas="${replicas}" >/dev/null
}
wait_ready() {
local ns="$1" kind="$2" name="$3" timeout="$4"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
return 0
fi
"${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
}
run_hecate_startup() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main)
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
return 0
fi
timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
}
declare -A SNAPSHOT_REPLICAS=()
SUSPENDED_KS_BEFORE=""
SUSPENDED_HR_BEFORE=""
snapshot_resources() {
local resources=("$@")
SNAPSHOT_REPLICAS=()
for res in "${resources[@]}"; do
IFS='|' read -r ns kind name _ <<<"${res}"
SNAPSHOT_REPLICAS["$(resource_key "$ns" "$kind" "$name")"]="$(get_replicas "$ns" "$kind" "$name")"
done
}
restore_resources() {
local resources=("$@")
for res in "${resources[@]}"; do
IFS='|' read -r ns kind name _ <<<"${res}"
local key
key="$(resource_key "$ns" "$kind" "$name")"
local replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
scale_to "$ns" "$kind" "$name" "$replicas" || true
done
}
record_flux_suspended_before() {
SUSPENDED_KS_BEFORE="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
SUSPENDED_HR_BEFORE="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
}
set_flux_suspend_all() {
local value="$1"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
return 0
fi
local patch
patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"
while read -r ks; do
[[ -z "${ks}" ]] && continue
"${KUBECTL}" -n flux-system patch kustomization "${ks}" --type=merge -p "${patch}" >/dev/null || true
done < <("${KUBECTL}" -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
while read -r hr; do
[[ -z "${hr}" ]] && continue
local ns="${hr%%/*}"
local name="${hr##*/}"
"${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true
done < <("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}')
}
restore_flux_suspended_before() {
set_flux_suspend_all false
if [[ "${EXECUTE}" -eq 0 ]]; then
return 0
fi
local patch='{"spec":{"suspend":true}}'
while read -r ref; do
[[ -z "${ref}" ]] && continue
local ns="${ref%%/*}"
local name="${ref##*/}"
"${KUBECTL}" -n "${ns}" patch kustomization "${name}" --type=merge -p "${patch}" >/dev/null || true
done <<<"${SUSPENDED_KS_BEFORE}"
while read -r ref; do
[[ -z "${ref}" ]] && continue
local ns="${ref%%/*}"
local name="${ref##*/}"
"${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true
done <<<"${SUSPENDED_HR_BEFORE}"
}
verify_flux_unsuspended() {
if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: verify no Flux kustomizations/helmreleases remain suspended"
return 0
fi
local ks_count hr_count
ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
[[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}"
[[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}"
}
write_log_header() {
local drill="$1"
mkdir -p "${LOG_DIR}"
local f="${LOG_DIR}/${drill}-$(now_ts).log"
exec > >(tee -a "${f}") 2>&1
log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}
run_drill_flux_gitea_deadlock() {
local resources=(
"flux-system|deployment|source-controller|1"
"flux-system|deployment|kustomize-controller|1"
"flux-system|deployment|helm-controller|1"
"flux-system|deployment|notification-controller|1"
"gitea|deployment|gitea|1"
)
snapshot_resources "${resources[@]}"
trap 'restore_resources "${resources[@]}"' ERR
log "injecting outage: flux controllers + gitea"
for res in "${resources[@]}"; do
IFS='|' read -r ns kind name _ <<<"${res}"
scale_to "$ns" "$kind" "$name" 0
done
run_hecate_startup "drill-flux-gitea-deadlock"
log "verifying recovery"
wait_ready flux-system deployment source-controller 240s
wait_ready flux-system deployment kustomize-controller 240s
wait_ready flux-system deployment helm-controller 240s
wait_ready flux-system deployment notification-controller 240s
wait_ready gitea deployment gitea 300s
log "pass: flux-gitea-deadlock"
trap - ERR
}
run_drill_foundation_recovery() {
local resources=(
"vault|statefulset|vault|1"
"postgres|statefulset|postgres|1"
"gitea|deployment|gitea|1"
)
snapshot_resources "${resources[@]}"
trap 'restore_resources "${resources[@]}"' ERR
log "injecting outage: vault + postgres + gitea"
for res in "${resources[@]}"; do
IFS='|' read -r ns kind name _ <<<"${res}"
scale_to "$ns" "$kind" "$name" 0
done
run_hecate_startup "drill-foundation-recovery"
log "verifying layered recovery"
wait_ready vault statefulset vault 420s
wait_ready postgres statefulset postgres 420s
wait_ready gitea deployment gitea 300s
log "pass: foundation-recovery"
trap - ERR
}
run_drill_reconciliation_resume() {
local resources=("flux-system|deployment|source-controller|1")
snapshot_resources "${resources[@]}"
record_flux_suspended_before
trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR
log "injecting outage: suspend all Flux objects + stop source-controller"
set_flux_suspend_all true
scale_to flux-system deployment source-controller 0
run_hecate_startup "drill-reconciliation-resume"
log "verifying reconciliation resumed"
wait_ready flux-system deployment source-controller 240s
verify_flux_unsuspended
log "pass: reconciliation-resume"
trap - ERR
}
main() {
need_cmd "${KUBECTL}"
need_cmd ssh
need_cmd timeout
local cmd="${1:-}"
case "${cmd}" in
list)
usage
exit 0
;;
run)
shift || true
local drill="${1:-}"
[[ -n "${drill}" ]] || die "missing drill name"
shift || true
while [[ $# -gt 0 ]]; do
case "$1" in
--execute) EXECUTE=1 ;;
*) die "unknown option: $1" ;;
esac
shift
done
write_log_header "${drill}"
;;
*)
usage
exit 2
;;
esac
case "${drill}" in
flux-gitea-deadlock)
run_drill_flux_gitea_deadlock
;;
foundation-recovery)
run_drill_foundation_recovery
;;
reconciliation-resume)
run_drill_reconciliation_resume
;;
*)
die "unknown drill: ${drill}"
;;
esac
}
main "$@"

View File

@ -16,11 +16,11 @@ START_NOW=1
INSTALL_DEPS=1
ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-auto}"
MANAGE_NUT="${HECATE_MANAGE_NUT:-1}"
-NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-atlasups}"
+NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-}"
NUT_VENDOR_ID="${HECATE_NUT_VENDOR_ID:-0764}"
NUT_PRODUCT_ID="${HECATE_NUT_PRODUCT_ID:-0601}"
NUT_MONITOR_USER="${HECATE_NUT_MONITOR_USER:-monuser}"
-NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-atlasupsmon}"
+NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-hecateupsmon}"
while [[ $# -gt 0 ]]; do
case "$1" in
@ -39,6 +39,25 @@ while [[ $# -gt 0 ]]; do
esac
done
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/hecate.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/hecate.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
@ -179,6 +198,7 @@ install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-upda
install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer"
install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh"
resolve_nut_ups_name
configure_nut
systemctl daemon-reload
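For reference, `resolve_nut_ups_name` only inspects the first `*@localhost` target in the installed config; with the titan-db config from this commit, the inference reduces to roughly:

```bash
# Assuming /etc/hecate/hecate.yaml contains "target: pyrphoros@localhost",
# the first match is extracted and the "@localhost" suffix stripped.
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' /etc/hecate/hecate.yaml | head -n 1 | awk '{print $2}')"
echo "${target%@localhost}"   # -> pyrphoros
```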