hecate: add layered startup drills and rename UPS peers
This commit is contained in:
parent
138f816093
commit
aa9c7b69f3
8
Makefile
8
Makefile
@ -1,4 +1,4 @@
|
||||
.PHONY: build test fmt tidy install
|
||||
.PHONY: build test fmt tidy install drill-list drill-run
|
||||
|
||||
build:
|
||||
go build -o dist/hecate ./cmd/hecate
|
||||
@ -14,3 +14,9 @@ tidy:
|
||||
|
||||
install:
|
||||
sudo ./scripts/install.sh
|
||||
|
||||
drill-list:
|
||||
./scripts/hecate-drills.sh list
|
||||
|
||||
drill-run:
|
||||
./scripts/hecate-drills.sh run $(DRILL) --execute
|
||||
|
||||
19
README.md
19
README.md
@ -42,9 +42,9 @@ Installer knobs (optional):
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
|
||||
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
|
||||
- `HECATE_NUT_UPS_NAME` (default `atlasups`)
|
||||
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
|
||||
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
|
||||
- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `atlasupsmon`)
|
||||
- `HECATE_NUT_MONITOR_USER` / `HECATE_NUT_MONITOR_PASSWORD` (defaults `monuser` / `hecateupsmon`)
|
||||
|
||||
Bootstrap now (without reboot):
|
||||
|
||||
@ -64,8 +64,8 @@ sudo systemctl start hecate-bootstrap.service
|
||||
## Multi-UPS topology
|
||||
|
||||
Recommended:
|
||||
- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution).
|
||||
- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`.
|
||||
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
||||
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
||||
- If forwarding fails, fallback local shutdown can remain enabled.
|
||||
|
||||
## Config
|
||||
@ -89,3 +89,14 @@ Power metrics:
|
||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||
|
||||
## Disruptive startup drills
|
||||
|
||||
Hecate includes scripted disruptive drills that intentionally break critical services and verify startup recovery paths:
|
||||
|
||||
- `scripts/hecate-drills.sh list`
|
||||
- `scripts/hecate-drills.sh run flux-gitea-deadlock --execute`
|
||||
- `scripts/hecate-drills.sh run foundation-recovery --execute`
|
||||
- `scripts/hecate-drills.sh run reconciliation-resume --execute`
|
||||
|
||||
These drills are intentionally **not** part of regular `go test ./...`.
|
||||
|
||||
@ -10,6 +10,7 @@ control_planes:
|
||||
workers: []
|
||||
local_bootstrap_paths:
|
||||
- infrastructure/core
|
||||
- infrastructure/flux-system
|
||||
- infrastructure/sources/helm
|
||||
- infrastructure/metallb
|
||||
- infrastructure/traefik
|
||||
@ -42,10 +43,10 @@ shutdown:
|
||||
ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
target: atlasups@localhost
|
||||
target: pyrphoros@localhost
|
||||
targets:
|
||||
- name: db-ups
|
||||
target: atlasups@localhost
|
||||
- name: Pyrphoros
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
debounce_count: 3
|
||||
|
||||
@ -34,8 +34,8 @@ ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
targets:
|
||||
- name: tethys-ups
|
||||
target: atlasups@localhost
|
||||
- name: Statera
|
||||
target: statera@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
debounce_count: 3
|
||||
@ -54,4 +54,3 @@ state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ control_planes:
|
||||
workers: []
|
||||
local_bootstrap_paths:
|
||||
- infrastructure/core
|
||||
- infrastructure/flux-system
|
||||
- infrastructure/sources/helm
|
||||
- infrastructure/metallb
|
||||
- infrastructure/traefik
|
||||
@ -43,8 +44,8 @@ ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
targets:
|
||||
- name: db-ups
|
||||
target: atlasups@localhost
|
||||
- name: Pyrphoros
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
debounce_count: 3
|
||||
@ -63,4 +64,3 @@ state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -34,6 +35,22 @@ type ShutdownOptions struct {
|
||||
Reason string
|
||||
}
|
||||
|
||||
// startupWorkload identifies one Kubernetes workload by the values handed to
// kubectl: the namespace (-n), the resource kind, and the resource name.
type startupWorkload struct {
	Namespace, Kind, Name string
}

// criticalStartupWorkloads is the list of workloads that
// missingCriticalStartupWorkloads inspects and that
// ensureCriticalStartupWorkloads scales and waits on, one at a time, in the
// order written here.
var criticalStartupWorkloads = []startupWorkload{
	// Flux controllers.
	{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
	{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
	{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
	{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},

	// Stateful services, followed by gitea.
	{Namespace: "vault", Kind: "statefulset", Name: "vault"},
	{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
}
|
||||
|
||||
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
|
||||
return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
|
||||
}
|
||||
@ -74,19 +91,53 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
}
|
||||
}
|
||||
|
||||
needsBootstrap := false
|
||||
bootstrapReasons := []string{}
|
||||
if !opts.SkipLocalBootstrap {
|
||||
ready, readyErr := o.fluxSourceReady(ctx)
|
||||
if readyErr != nil {
|
||||
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
|
||||
needsBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
|
||||
}
|
||||
if !ready {
|
||||
o.log.Printf("flux source not ready, applying local bootstrap path")
|
||||
needsBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "flux source not ready")
|
||||
}
|
||||
|
||||
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
|
||||
if missingErr != nil {
|
||||
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
|
||||
needsBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
needsBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", "))
|
||||
}
|
||||
|
||||
if needsBootstrap {
|
||||
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
|
||||
if err := o.bootstrapLocal(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !opts.SkipLocalBootstrap && needsBootstrap {
|
||||
ready, err := o.fluxSourceReady(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("flux source readiness after bootstrap: %w", err)
|
||||
}
|
||||
if !ready {
|
||||
return fmt.Errorf("flux source still not ready after local bootstrap")
|
||||
}
|
||||
}
|
||||
|
||||
if err := o.resumeFluxAndReconcile(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -419,6 +470,113 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) {
|
||||
}
|
||||
}
|
||||
|
||||
func (o *Orchestrator) missingCriticalStartupWorkloads(ctx context.Context) ([]string, error) {
|
||||
missing := []string{}
|
||||
for _, w := range criticalStartupWorkloads {
|
||||
ready, err := o.workloadReady(ctx, w)
|
||||
if err != nil {
|
||||
if isNotFoundErr(err) {
|
||||
missing = append(missing, fmt.Sprintf("%s/%s/%s(not found)", w.Namespace, w.Kind, w.Name))
|
||||
continue
|
||||
}
|
||||
return nil, fmt.Errorf("check %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
|
||||
}
|
||||
if !ready {
|
||||
missing = append(missing, fmt.Sprintf("%s/%s/%s", w.Namespace, w.Kind, w.Name))
|
||||
}
|
||||
}
|
||||
return missing, nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) ensureCriticalStartupWorkloads(ctx context.Context) error {
|
||||
for _, w := range criticalStartupWorkloads {
|
||||
if err := o.ensureWorkloadReplicas(ctx, w, 1); err != nil {
|
||||
if isNotFoundErr(err) {
|
||||
o.log.Printf("warning: startup workload missing, skipping scale: %s/%s/%s", w.Namespace, w.Kind, w.Name)
|
||||
continue
|
||||
}
|
||||
return fmt.Errorf("scale %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
|
||||
}
|
||||
if err := o.waitWorkloadReady(ctx, w); err != nil {
|
||||
if isNotFoundErr(err) {
|
||||
o.log.Printf("warning: startup workload missing during readiness wait: %s/%s/%s", w.Namespace, w.Kind, w.Name)
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) ensureWorkloadReplicas(ctx context.Context, w startupWorkload, replicas int) error {
|
||||
_, err := o.kubectl(
|
||||
ctx,
|
||||
45*time.Second,
|
||||
"-n",
|
||||
w.Namespace,
|
||||
"scale",
|
||||
w.Kind,
|
||||
w.Name,
|
||||
fmt.Sprintf("--replicas=%d", replicas),
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func (o *Orchestrator) waitWorkloadReady(ctx context.Context, w startupWorkload) error {
|
||||
timeout := "240s"
|
||||
if w.Kind == "statefulset" {
|
||||
timeout = "360s"
|
||||
}
|
||||
_, err := o.kubectl(
|
||||
ctx,
|
||||
7*time.Minute,
|
||||
"-n",
|
||||
w.Namespace,
|
||||
"rollout",
|
||||
"status",
|
||||
fmt.Sprintf("%s/%s", w.Kind, w.Name),
|
||||
"--timeout="+timeout,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("wait ready %s/%s/%s: %w", w.Namespace, w.Kind, w.Name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) workloadReady(ctx context.Context, w startupWorkload) (bool, error) {
|
||||
out, err := o.kubectl(
|
||||
ctx,
|
||||
20*time.Second,
|
||||
"-n",
|
||||
w.Namespace,
|
||||
"get",
|
||||
w.Kind,
|
||||
w.Name,
|
||||
"-o",
|
||||
"jsonpath={.status.readyReplicas}",
|
||||
)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
raw := strings.TrimSpace(out)
|
||||
if raw == "" || raw == "<no value>" {
|
||||
return false, nil
|
||||
}
|
||||
n, err := strconv.Atoi(raw)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("parse readyReplicas %q: %w", raw, err)
|
||||
}
|
||||
return n >= 1, nil
|
||||
}
|
||||
|
||||
// isNotFoundErr reports whether err looks like a "resource not found" failure,
// judged by its message text (case-insensitive).
//
// Messages containing "not found" or "(notfound)" match, except when they
// describe a missing executable ("executable file not found", "command not
// found"): a missing kubectl binary must surface as a real error rather than
// be mistaken for a missing workload. A nil error never matches.
func isNotFoundErr(err error) bool {
	if err == nil {
		return false
	}
	msg := strings.ToLower(err.Error())
	// Rule out missing-binary failures before the broad "not found" match.
	for _, marker := range []string{"executable file not found", "command not found"} {
		if strings.Contains(msg, marker) {
			return false
		}
	}
	return strings.Contains(msg, "not found") || strings.Contains(msg, "(notfound)")
}
|
||||
|
||||
func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
|
||||
delay := o.cfg.Shutdown.PoweroffDelaySeconds
|
||||
if delay <= 0 {
|
||||
|
||||
@ -131,6 +131,7 @@ func defaults() Config {
|
||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||
LocalBootstrapPaths: []string{
|
||||
"infrastructure/core",
|
||||
"infrastructure/flux-system",
|
||||
"infrastructure/sources/helm",
|
||||
"infrastructure/metallb",
|
||||
"infrastructure/traefik",
|
||||
|
||||
@ -18,8 +18,8 @@ ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
targets:
|
||||
- name: db
|
||||
target: atlasups@localhost
|
||||
- name: pyrphoros
|
||||
target: pyrphoros@localhost
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
state:
|
||||
@ -34,7 +34,7 @@ state:
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" {
|
||||
if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "pyrphoros@localhost" {
|
||||
t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
|
||||
}
|
||||
}
|
||||
|
||||
@ -11,8 +11,8 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
|
||||
e := New()
|
||||
e.UpdateBudget(321)
|
||||
e.UpdateSample(Sample{
|
||||
Name: "db-ups",
|
||||
Target: "atlasups@localhost",
|
||||
Name: "Pyrphoros",
|
||||
Target: "pyrphoros@localhost",
|
||||
OnBattery: true,
|
||||
LowBattery: false,
|
||||
RuntimeSecond: 412,
|
||||
@ -35,12 +35,12 @@ func TestExporterEmitsCoreMetrics(t *testing.T) {
|
||||
mustContain := []string{
|
||||
"hecate_shutdown_budget_seconds 321",
|
||||
"hecate_shutdown_triggers_total 1",
|
||||
"hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_battery_charge_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_load_percent{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_power_nominal_watts{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_on_battery{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_runtime_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_battery_charge_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_load_percent{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_power_nominal_watts{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
"hecate_ups_threshold_seconds{source=\"Pyrphoros\",target=\"pyrphoros@localhost\"",
|
||||
}
|
||||
for _, m := range mustContain {
|
||||
if !strings.Contains(body, m) {
|
||||
|
||||
295
scripts/hecate-drills.sh
Executable file
295
scripts/hecate-drills.sh
Executable file
@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env bash
# -E (errtrace) makes the ERR rollback traps installed by the drill functions
# apply inside the helper functions they call. Without it, errexit terminates
# the script from inside a helper and the rollback never runs.
set -Eeuo pipefail

# Runtime settings; each one can be overridden from the environment.
KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
# 0 = print planned actions only; set to 1 by the --execute flag.
EXECUTE=0
|
||||
|
||||
# usage prints the command synopsis, the available drills, and usage notes to
# stdout.
usage() {
  printf '%s\n' \
    'Usage:' \
    'scripts/hecate-drills.sh list' \
    'scripts/hecate-drills.sh run <drill-name> [--execute]' \
    '' \
    'Drills:' \
    'flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.' \
    'foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.' \
    'reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.' \
    '' \
    'Notes:' \
    '- Drills are intentionally disruptive and are not part of regular `make test`.' \
    '- Use --execute to run live changes. Without it, this script prints planned actions only.'
}
|
||||
|
||||
# log writes all of its arguments, joined into one line and prefixed with
# "[drill] ", to stdout.
log() {
  printf '%s\n' "[drill] $*"
}
|
||||
|
||||
# die prints its arguments as an error line on stderr and terminates the
# script with exit status 1.
die() {
  {
    printf '%s\n' "[drill] ERROR: $*"
  } >&2
  exit 1
}
|
||||
|
||||
# need_cmd aborts via die unless the command named by $1 can be resolved by
# the shell.
need_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing required command: $1"
  fi
}
|
||||
|
||||
# now_ts prints the current UTC time in compact form: YYYYMMDDTHHMMSSZ.
now_ts() {
  local fmt='+%Y%m%dT%H%M%SZ'
  date -u "${fmt}"
}
|
||||
|
||||
# resource_key prints "<namespace>|<kind>|<name>" (no trailing newline), the
# key format used for SNAPSHOT_REPLICAS.
resource_key() {
  printf '%s' "${1}|${2}|${3}"
}
|
||||
|
||||
# get_replicas prints the .spec.replicas value of the given workload.
#   $1 namespace, $2 kind, $3 name
# When the lookup fails nothing is printed on stdout (a warning goes to
# stderr) and the function still returns 0. An empty value lets
# restore_resources fall back to its default instead of treating a failed
# lookup as "the workload was running 0 replicas" and rolling back to zero.
get_replicas() {
  local ns="$1" kind="$2" name="$3"
  if ! "${KUBECTL}" -n "$ns" get "$kind" "$name" -o jsonpath='{.spec.replicas}' 2>/dev/null; then
    printf '[drill] WARN: could not read replicas for %s/%s/%s\n' "$ns" "$kind" "$name" >&2
  fi
}
|
||||
|
||||
# scale_to sets a workload's replica count.
#   $1 namespace, $2 kind, $3 name, $4 replicas
# With EXECUTE=0 it only logs the planned kubectl command. Otherwise it runs
# kubectl scale (stdout discarded) and returns kubectl's status.
scale_to() {
  local ns="$1" kind="$2" name="$3" replicas="$4"
  if (( EXECUTE != 0 )); then
    "${KUBECTL}" -n "$ns" scale "$kind" "$name" --replicas="${replicas}" >/dev/null
    return
  fi
  log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
}
|
||||
|
||||
# wait_ready waits for a workload's rollout to complete.
#   $1 namespace, $2 kind, $3 name, $4 timeout (passed to --timeout)
# With EXECUTE=0 it only logs the planned kubectl command. Otherwise it runs
# kubectl rollout status (stdout discarded) and returns kubectl's status.
wait_ready() {
  local ns="$1" kind="$2" name="$3" timeout="$4"
  if (( EXECUTE != 0 )); then
    "${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
    return
  fi
  log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
}
|
||||
|
||||
# run_hecate_startup runs `hecate startup` on the coordinator host over ssh,
# bounded by STARTUP_TIMEOUT_SECONDS.
#   $1 reason: free-form label for this invocation; it is only written to the
#      drill log and is not passed to hecate.
# With EXECUTE=0 the ssh command is logged instead of executed.
run_hecate_startup() {
  local reason="$1"
  local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main)
  log "hecate startup requested: reason=${reason}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
    return 0
  fi
  # NOTE(review): ssh joins these words with spaces for the remote shell, so
  # HECATE_BIN / HECATE_CONFIG values containing whitespace would be re-split.
  timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
}
|
||||
|
||||
# Replica counts captured by snapshot_resources, keyed by resource_key output.
declare -A SNAPSHOT_REPLICAS
SNAPSHOT_REPLICAS=()
# Newline-separated "namespace/name" refs of the Flux objects that were
# already suspended when record_flux_suspended_before ran.
SUSPENDED_KS_BEFORE=
SUSPENDED_HR_BEFORE=
|
||||
|
||||
# snapshot_resources resets SNAPSHOT_REPLICAS and records the current replica
# count (via get_replicas) of every resource passed in.
#   $@ entries of the form "namespace|kind|name|<ignored>"
snapshot_resources() {
  local resources=("$@")
  local key
  SNAPSHOT_REPLICAS=()
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    key="$(resource_key "$ns" "$kind" "$name")"
    SNAPSHOT_REPLICAS["${key}"]="$(get_replicas "$ns" "$kind" "$name")"
  done
}
|
||||
|
||||
# restore_resources scales every given resource back to the replica count
# stored in SNAPSHOT_REPLICAS; a missing or empty entry is treated as 1.
# Failures of individual scale calls are ignored so the remaining resources
# are still processed.
#   $@ entries of the form "namespace|kind|name|<ignored>"
restore_resources() {
  local resources=("$@")
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    local key
    key="$(resource_key "$ns" "$kind" "$name")"
    local replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    if ! scale_to "$ns" "$kind" "$name" "$replicas"; then
      :
    fi
  done
}
|
||||
|
||||
# record_flux_suspended_before stores, as newline-separated "namespace/name"
# refs, the Flux kustomizations and helmreleases (all namespaces) that have
# spec.suspend == true right now. A failing kubectl call leaves the
# corresponding variable with whatever output was produced.
record_flux_suspended_before() {
  local suspended_refs='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'
  SUSPENDED_KS_BEFORE="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath="${suspended_refs}" || true)"
  SUSPENDED_HR_BEFORE="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="${suspended_refs}" || true)"
}
|
||||
|
||||
# set_flux_suspend_all patches spec.suspend on every Flux kustomization and
# helmrelease in all namespaces.
#   $1 value: true or false (inserted verbatim into the JSON merge patch)
# Both kinds are enumerated with -A so the scope matches
# record_flux_suspended_before and verify_flux_unsuspended. Individual patch
# failures are ignored so the remaining objects are still processed.
# With EXECUTE=0 only the plan is logged.
set_flux_suspend_all() {
  local value="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi
  local patch
  patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"
  local list_refs='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}'
  local crd ref ns name
  for crd in kustomizations.kustomize.toolkit.fluxcd.io helmreleases.helm.toolkit.fluxcd.io; do
    while read -r ref; do
      [[ -z "${ref}" ]] && continue
      ns="${ref%%/*}"
      name="${ref##*/}"
      "${KUBECTL}" -n "${ns}" patch "${crd}" "${name}" --type=merge -p "${patch}" >/dev/null || true
    done < <("${KUBECTL}" get "${crd}" -A -o jsonpath="${list_refs}")
  done
}
|
||||
|
||||
# restore_flux_suspended_before first un-suspends everything through
# set_flux_suspend_all, then re-suspends the kustomizations and helmreleases
# listed in SUSPENDED_KS_BEFORE / SUSPENDED_HR_BEFORE. Individual patch
# failures are ignored. With EXECUTE=0 nothing beyond the plan logged by
# set_flux_suspend_all happens.
restore_flux_suspended_before() {
  set_flux_suspend_all false
  if (( EXECUTE == 0 )); then
    return 0
  fi
  local patch='{"spec":{"suspend":true}}'
  local kind refs
  for kind in kustomization helmrelease; do
    case "${kind}" in
      kustomization) refs="${SUSPENDED_KS_BEFORE}" ;;
      helmrelease) refs="${SUSPENDED_HR_BEFORE}" ;;
    esac
    while read -r ref; do
      if [[ -n "${ref}" ]]; then
        local ns="${ref%%/*}"
        local name="${ref##*/}"
        "${KUBECTL}" -n "${ns}" patch "${kind}" "${name}" --type=merge -p "${patch}" >/dev/null || true
      fi
    done <<<"${refs}"
  done
}
|
||||
|
||||
# verify_flux_unsuspended checks that no Flux kustomization or helmrelease in
# any namespace still has spec.suspend == true.
# Returns 0 when none remain. Otherwise it prints an error on stderr and
# returns 1 (rather than exiting) so that the calling drill's ERR trap can run
# its rollback before the script stops. With EXECUTE=0 only the plan is logged.
verify_flux_unsuspended() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify no Flux kustomizations/helmreleases remain suspended"
    return 0
  fi
  local ks_count hr_count
  ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
  hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
  if [[ "${ks_count}" != "0" ]]; then
    printf '[drill] ERROR: %s\n' "kustomizations still suspended: ${ks_count}" >&2
    return 1
  fi
  if [[ "${hr_count}" != "0" ]]; then
    printf '[drill] ERROR: %s\n' "helmreleases still suspended: ${hr_count}" >&2
    return 1
  fi
}
|
||||
|
||||
# write_log_header creates LOG_DIR if needed, redirects the script's stdout
# and stderr through tee into "<LOG_DIR>/<drill>-<timestamp>.log" (appending),
# and logs a header line describing the run.
#   $1 drill name
write_log_header() {
  local drill="$1"
  mkdir -p "${LOG_DIR}"
  local logfile="${LOG_DIR}/${drill}-$(now_ts).log"
  exec > >(tee -a "${logfile}") 2>&1
  log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}
|
||||
|
||||
# run_drill_flux_gitea_deadlock scales the four Flux controllers and gitea to
# zero, runs hecate startup, and then waits for each of them to roll out again
# (240s for the Flux controllers, 300s for gitea). While the drill is running
# an ERR trap restores the snapshotted replica counts.
run_drill_flux_gitea_deadlock() {
  local resources=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${resources[@]}"
  trap 'restore_resources "${resources[@]}"' ERR

  log "injecting outage: flux controllers + gitea"
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-flux-gitea-deadlock"

  log "verifying recovery"
  # Same order as the resources list above; only gitea gets the longer wait.
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    case "${ns}" in
      gitea) wait_ready "$ns" "$kind" "$name" 300s ;;
      *) wait_ready "$ns" "$kind" "$name" 240s ;;
    esac
  done
  log "pass: flux-gitea-deadlock"
  trap - ERR
}
|
||||
|
||||
# run_drill_foundation_recovery scales vault, postgres, and gitea to zero,
# runs hecate startup, and then waits for each of them to roll out again
# (420s for the statefulsets, 300s for gitea). While the drill is running an
# ERR trap restores the snapshotted replica counts.
run_drill_foundation_recovery() {
  local resources=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${resources[@]}"
  trap 'restore_resources "${resources[@]}"' ERR

  log "injecting outage: vault + postgres + gitea"
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-foundation-recovery"

  log "verifying layered recovery"
  # Same order as the resources list above.
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    case "${kind}" in
      statefulset) wait_ready "$ns" "$kind" "$name" 420s ;;
      *) wait_ready "$ns" "$kind" "$name" 300s ;;
    esac
  done
  log "pass: foundation-recovery"
  trap - ERR
}
|
||||
|
||||
# run_drill_reconciliation_resume suspends all Flux objects, scales
# source-controller to zero, runs hecate startup, and then requires
# source-controller to roll out again and no Flux object to remain suspended.
# While the drill is running an ERR trap restores the earlier suspend state
# and the snapshotted replica count.
run_drill_reconciliation_resume() {
  local resources=(
    "flux-system|deployment|source-controller|1"
  )
  snapshot_resources "${resources[@]}"
  record_flux_suspended_before
  trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR

  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to flux-system deployment source-controller 0

  run_hecate_startup "drill-reconciliation-resume"

  log "verifying reconciliation resumed"
  wait_ready flux-system deployment source-controller 240s
  verify_flux_unsuspended
  log "pass: reconciliation-resume"
  trap - ERR
}
|
||||
|
||||
# main parses the command line and dispatches.
#   list                        print usage and exit 0
#   run <drill> [--execute]     run the named drill (plan-only without --execute)
#   anything else               print usage and exit 2
# The drill name and options are validated before the log file is created, so
# a typo or a missing name does not leave a stray log behind. The external
# tools are required only for `run`, so `list` works on any machine.
main() {
  local cmd="${1:-}"
  case "${cmd}" in
    list)
      usage
      exit 0
      ;;
    run)
      shift || true
      ;;
    *)
      usage
      exit 2
      ;;
  esac

  local drill="${1:-}"
  [[ -n "${drill}" ]] || die "missing drill name"
  shift || true
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --execute) EXECUTE=1 ;;
      *) die "unknown option: $1" ;;
    esac
    shift
  done

  # Resolve the drill to its function up front; unknown names stop here.
  local runner
  case "${drill}" in
    flux-gitea-deadlock) runner=run_drill_flux_gitea_deadlock ;;
    foundation-recovery) runner=run_drill_foundation_recovery ;;
    reconciliation-resume) runner=run_drill_reconciliation_resume ;;
    *) die "unknown drill: ${drill}" ;;
  esac

  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout

  write_log_header "${drill}"
  "${runner}"
}
|
||||
|
||||
main "$@"
|
||||
@ -16,11 +16,11 @@ START_NOW=1
|
||||
INSTALL_DEPS=1
|
||||
ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-auto}"
|
||||
MANAGE_NUT="${HECATE_MANAGE_NUT:-1}"
|
||||
NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-atlasups}"
|
||||
NUT_UPS_NAME="${HECATE_NUT_UPS_NAME:-}"
|
||||
NUT_VENDOR_ID="${HECATE_NUT_VENDOR_ID:-0764}"
|
||||
NUT_PRODUCT_ID="${HECATE_NUT_PRODUCT_ID:-0601}"
|
||||
NUT_MONITOR_USER="${HECATE_NUT_MONITOR_USER:-monuser}"
|
||||
NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-atlasupsmon}"
|
||||
NUT_MONITOR_PASSWORD="${HECATE_NUT_MONITOR_PASSWORD:-hecateupsmon}"
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
@ -39,6 +39,25 @@ while [[ $# -gt 0 ]]; do
|
||||
esac
|
||||
done
|
||||
|
||||
# resolve_nut_ups_name makes sure NUT_UPS_NAME is set.
# Resolution order:
#   1. a non-empty NUT_UPS_NAME is kept as is;
#   2. otherwise the first "target: <name>@localhost" occurrence in
#      ${CONF_DIR}/hecate.yaml supplies <name>;
#   3. otherwise the name defaults to "pyrphoros".
resolve_nut_ups_name() {
  if [[ -n "${NUT_UPS_NAME}" ]]; then
    return 0
  fi

  local conf="${CONF_DIR}/hecate.yaml"
  if [[ -f "${conf}" ]]; then
    local target=""
    # grep exits 1 when nothing matches; `|| true` keeps that from aborting the
    # caller when errexit/pipefail are active, so the default below is reached.
    target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${conf}" | head -n 1 | awk '{print $2}' || true)"
    if [[ -n "${target}" ]]; then
      NUT_UPS_NAME="${target%@localhost}"
      echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
      return 0
    fi
  fi

  NUT_UPS_NAME="pyrphoros"
  echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
|
||||
|
||||
ensure_apt_packages() {
|
||||
local missing=()
|
||||
for pkg in "$@"; do
|
||||
@ -179,6 +198,7 @@ install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-upda
|
||||
install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer"
|
||||
install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh"
|
||||
|
||||
resolve_nut_ups_name
|
||||
configure_nut
|
||||
|
||||
systemctl daemon-reload
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user