Compare commits


No commits in common. "main" and "codex/ananke-gate-platform-metrics" have entirely different histories.

64 changed files with 1157 additions and 4539 deletions

2 .gitignore vendored
View File

@@ -1,6 +1,4 @@
/bin/
/build/
/dist/
internal/state/.corrupt-*
*.log
*.tmp

201 Jenkinsfile vendored
View File

@@ -1,59 +1,25 @@
pipeline {
agent {
kubernetes {
label 'ananke-quality'
defaultContainer 'go-tester'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-06
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
jenkins/jenkins-jenkins-agent: "true"
containers:
- name: go-tester
image: registry.bstein.dev/bstein/golang:1.25-bookworm
image: golang:1.25-bookworm
command: ["cat"]
tty: true
volumeMounts:
- name: workspace-volume
mountPath: /home/jenkins/agent
- name: publisher
image: registry.bstein.dev/bstein/python:3.12-slim
command: ["cat"]
tty: true
volumeMounts:
- name: workspace-volume
mountPath: /home/jenkins/agent
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
image: python:3.12-slim
command: ["cat"]
tty: true
volumeMounts:
@@ -69,13 +35,7 @@ spec:
environment {
SUITE_NAME = 'ananke'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'ananke'
SONARQUBE_TOKEN = credentials('sonarqube-token')
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
@@ -97,27 +57,6 @@ spec:
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage.out ] && args+=("-Dsonar.go.coverage.reportPaths=build/coverage.out")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
container('publisher') {
sh '''
set -eu
@@ -156,34 +95,6 @@ PY
stage('Collect Supply Chain evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
status=ok
compliant=true
if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
status=failed
compliant=false
fi
jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
'{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
'''
}
container('publisher') {
sh '''
set -eu
@@ -241,25 +152,13 @@ PY
failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
[ -n "${ok_runs}" ] || ok_runs=0
[ -n "${failed_runs}" ] || failed_runs=0
coverage_percent="$(python3 - <<'PY'
import re
from pathlib import Path
log_path = Path("build/quality-gate.out")
text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
print(values[-1] if values else 0.0)
PY
)"
printf '%s\n' "${coverage_percent}" > build/coverage-percent.txt
python3 scripts/publish_quality_metrics.py \
--pushgateway-url "${PUSHGATEWAY_URL}" \
--job-name platform-quality-ci \
--suite "${SUITE_NAME}" \
--trigger jenkins \
--local-ok "${ok_runs}" \
--local-failed "${failed_runs}" \
--coverage-percent-file build/coverage-percent.txt
--local-failed "${failed_runs}"
'''
}
}
@@ -270,95 +169,7 @@ PY
container('publisher') {
sh '''
set -eu
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
'''
}
}
@@ -367,7 +178,7 @@ PY
post {
always {
archiveArtifacts artifacts: 'build/*.json,build/*.out,build/*.rc,build/*.txt,build/*.xml', allowEmptyArchive: true, fingerprint: true
archiveArtifacts artifacts: 'build/quality-gate.out,build/quality-gate.rc', allowEmptyArchive: true, fingerprint: true
}
}
}
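
For context on the Jenkinsfile changes above: the simplified gate stage now only asserts that build/quality-gate.rc is zero, while the removed logic also read build/sonarqube-quality-gate.json and build/ironbank-compliance.json before deciding to fail the build. The sketch below restates that removed decision in Go. It is illustrative only and not code from this repository; the file paths and JSON fields (status, projectStatus.status, compliant) come from the removed shell/Python above, and everything else (package layout, error handling style) is assumed.

// gatecheck.go: minimal sketch of the combined gate decision the removed
// "Enforce quality gate" logic made in shell + Python. Illustrative only.
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

// sonarStatus reads the SonarQube quality-gate report and returns its status,
// or "missing"/"error" when the file is absent or unparseable.
func sonarStatus(path string) string {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "missing"
	}
	var payload struct {
		Status        string `json:"status"`
		ProjectStatus struct {
			Status string `json:"status"`
		} `json:"projectStatus"`
	}
	if err := json.Unmarshal(raw, &payload); err != nil {
		return "error"
	}
	s := payload.Status
	if s == "" {
		s = payload.ProjectStatus.Status
	}
	if s = strings.ToLower(strings.TrimSpace(s)); s == "" {
		return "missing"
	}
	return s
}

// ironbankCompliant reports whether the Trivy-derived compliance report
// marked the scanned tree as compliant.
func ironbankCompliant(path string) bool {
	raw, err := os.ReadFile(path)
	if err != nil {
		return false
	}
	var payload struct {
		Compliant bool `json:"compliant"`
	}
	return json.Unmarshal(raw, &payload) == nil && payload.Compliant
}

func main() {
	fail := false
	switch sonarStatus("build/sonarqube-quality-gate.json") {
	case "ok", "pass", "passed", "success":
		// SonarQube gate passed.
	default:
		fmt.Fprintln(os.Stderr, "sonarqube gate not passing")
		fail = true
	}
	if !ironbankCompliant("build/ironbank-compliance.json") {
		fmt.Fprintln(os.Stderr, "supply chain gate not passing")
		fail = true
	}
	if fail {
		os.Exit(1)
	}
}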

View File

@@ -97,15 +97,10 @@ Primary config path:
Keep these fields accurate:
- `expected_flux_source_url`
- `expected_flux_branch`
- `startup.service_checklist_explicit_only`
- `startup.service_checklist`
- `startup.critical_service_endpoints`
- `startup.require_ingress_checklist`
- `startup.require_node_inventory_reachability`
- `startup.node_inventory_reachability_required_nodes`
- `startup.node_ssh_auth_required_nodes`
- `startup.flux_health_required_kustomizations`
- `startup.workload_convergence_required_namespaces`
- `startup.ignore_unavailable_nodes`
- `coordination.role`
- `coordination.peer_hosts`
@@ -139,10 +134,9 @@ Installer behavior:
When adding nodes or services:
1. Update inventory and node mapping in config.
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
4. Add/adjust ingress expectations for exposed services.
5. Use temporary ignores only when truly intentional, then remove them.
6. Run `scripts/quality_gate.sh` before host deployment.
2. Add/adjust service checklist entries for anything user-facing or critical.
3. Add/adjust ingress expectations for exposed services.
4. Use temporary ignores only when truly intentional, then remove them.
5. Run `scripts/quality_gate.sh` before host deployment.
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
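
Step 2 above boils down to registering a named health URL, like the gitea-api entries visible in the config diffs below. The following Go sketch shows what probing one checklist entry amounts to; it is illustrative only, and the checklistEntry type, probe function, and timeout are hypothetical rather than ananke's actual implementation.

// checklistprobe.go: illustrative sketch of a service-checklist health probe.
// Type and function names are hypothetical, not ananke's real code.
package main

import (
	"fmt"
	"net/http"
	"time"
)

type checklistEntry struct {
	Name string // e.g. "gitea-api"
	URL  string // e.g. "https://scm.bstein.dev/api/healthz"
}

// probe returns nil when the endpoint answers with a 2xx status within the timeout.
func probe(e checklistEntry, timeout time.Duration) error {
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(e.URL)
	if err != nil {
		return fmt.Errorf("%s: %w", e.Name, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("%s: unexpected status %d", e.Name, resp.StatusCode)
	}
	return nil
}

func main() {
	err := probe(checklistEntry{Name: "gitea-api", URL: "https://scm.bstein.dev/api/healthz"}, 10*time.Second)
	fmt.Println("gitea-api healthy:", err == nil)
}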

View File

@@ -51,7 +51,6 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes: []
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@@ -91,7 +90,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: false
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -136,26 +134,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes: []
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations: []
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces: []
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: false
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15
@@ -180,7 +170,6 @@ ups:
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@@ -117,52 +117,8 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes:
- titan-0a
- titan-0b
- titan-0c
required_node_labels:
titan-04:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-05:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-06:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-07:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-08:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-11:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-12:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-13:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-14:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-15:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-17:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-18:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-19:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-09:
node-role.kubernetes.io/worker: "true"
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes:
- titan-0a
- titan-0b
- titan-0c
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations:
- flux-system/core
- flux-system/helm
- flux-system/traefik
- flux-system/cert-manager
- flux-system/longhorn
- flux-system/vault-csi
- flux-system/vault-injector
- flux-system/postgres
- flux-system/vault
- flux-system/keycloak
- flux-system/oauth2-proxy
- flux-system/gitea
- flux-system/monitoring
- flux-system/harbor
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces:
- vault
- postgres
- sso
- gitea
- monitoring
- harbor
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: true
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
target: statera@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@@ -117,52 +117,8 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes:
- titan-0a
- titan-0b
- titan-0c
required_node_labels:
titan-04:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-05:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-06:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-07:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-08:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-11:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-12:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-13:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-14:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-15:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-17:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-18:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-19:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-09:
node-role.kubernetes.io/worker: "true"
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes:
- titan-0a
- titan-0b
- titan-0c
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations:
- flux-system/core
- flux-system/helm
- flux-system/traefik
- flux-system/cert-manager
- flux-system/longhorn
- flux-system/vault-csi
- flux-system/vault-injector
- flux-system/postgres
- flux-system/vault
- flux-system/keycloak
- flux-system/oauth2-proxy
- flux-system/gitea
- flux-system/monitoring
- flux-system/harbor
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces:
- vault
- postgres
- sso
- gitea
- monitoring
- harbor
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: true
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
seen := map[string]struct{}{}
targets := make([]string, 0, len(nodes))
for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue

View File

@@ -1,288 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
)
type nodeReadyList struct {
Items []struct {
Metadata struct {
Name string `json:"name"`
} `json:"metadata"`
Status struct {
Conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
} `json:"conditions"`
} `json:"status"`
} `json:"items"`
}
type podDeleteList struct {
Items []struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
} `json:"spec"`
} `json:"items"`
}
// RunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
return o.postStartAutoHeal(ctx)
}
// postStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
errs := []string{}
requestReconcile := false
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
}
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
} else if vaultRecovered {
requestReconcile = true
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
}
}
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
} else if cleaned > 0 {
requestReconcile = true
}
if requestReconcile {
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
return o.requestFluxReconcile(ctx)
})
}
if len(errs) > 0 {
return errors.New(strings.Join(errs, "; "))
}
return nil
}
// autoRecoverSealedVault runs one orchestration or CLI step.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
if o.runner.DryRun {
return false, nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return false, nil
}
return false, fmt.Errorf("vault pod phase check failed: %w", err)
}
if strings.TrimSpace(phase) != "Running" {
return false, nil
}
sealed, err := o.vaultSealed(ctx)
if err != nil {
return false, err
}
if !sealed {
return false, nil
}
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
if err := o.ensureVaultUnsealed(ctx); err != nil {
return false, err
}
return true, nil
}
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "vault",
"create", "job",
"--from=cronjob/vault-k8s-auth-config",
jobName,
); err != nil {
return fmt.Errorf("create job %s: %w", jobName, err)
}
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
return nil
}
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
if o.runner.DryRun {
return 0, nil
}
unavailable, err := o.unavailableNodeSet(ctx)
if err != nil {
return 0, err
}
if len(unavailable) == 0 {
return 0, nil
}
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return 0, fmt.Errorf("query pods: %w", err)
}
var pods podDeleteList
if err := json.Unmarshal([]byte(out), &pods); err != nil {
return 0, fmt.Errorf("decode pods: %w", err)
}
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
now := time.Now()
count := 0
for _, item := range pods.Items {
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
continue
}
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
continue
}
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
continue
}
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
if _, err := o.kubectl(
ctx,
20*time.Second,
"-n", item.Metadata.Namespace,
"delete", "pod", item.Metadata.Name,
"--grace-period=0",
"--force",
"--wait=false",
); err != nil && !isNotFoundErr(err) {
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
}
count++
}
if count > 0 {
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
}
return count, nil
}
// unavailableNodeSet runs one orchestration or CLI step.
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
if err != nil {
return nil, fmt.Errorf("query nodes: %w", err)
}
var nodes nodeReadyList
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
return nil, fmt.Errorf("decode nodes: %w", err)
}
unavailable := map[string]struct{}{}
for _, item := range nodes.Items {
ready := ""
for _, cond := range item.Status.Conditions {
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
ready = strings.TrimSpace(cond.Status)
break
}
}
if ready != "True" {
unavailable[item.Metadata.Name] = struct{}{}
}
}
return unavailable, nil
}
// requestFluxReconcile runs one orchestration or CLI step.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
now := time.Now().UTC().Format(time.RFC3339)
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate", "gitrepository", "flux-system",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux source reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate",
"kustomizations.kustomize.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"annotate",
"--all-namespaces",
"helmreleases.helm.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
}
if o.runOverride == nil && o.runner.CommandExists("flux") {
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
}
}
return nil
}

View File

@@ -1,296 +0,0 @@
package cluster
import (
"context"
"errors"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil || count != 0 {
t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
}
})
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[` +
`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("pod old-stale not found"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil {
t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
}
if count != 1 {
t.Fatalf("expected one cleaned pod, got %d", count)
}
})
t.Run("query and decode errors surface", func(t *testing.T) {
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("nodes failed"),
},
})
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
t.Fatalf("expected node query error, got %v", err)
}
decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{bad json`,
},
})
if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
t.Fatalf("expected pod decode error, got %v", err)
}
})
t.Run("delete hard error surfaces", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("delete failed"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
}
})
}
// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
t.Run("decode error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
})
if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
t.Fatalf("expected decode error, got %v", err)
}
})
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
})
nodes, err := orch.unavailableNodeSet(context.Background())
if err != nil {
t.Fatalf("unavailableNodeSet failed: %v", err)
}
if _, ok := nodes["titan-22"]; !ok {
t.Fatalf("expected titan-22 to be treated as unavailable")
}
if _, ok := nodes["titan-07"]; ok {
t.Fatalf("did not expect titan-07 to be treated as unavailable")
}
})
}
// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
}
})
t.Run("git source annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
t.Fatalf("expected gitrepository annotate error, got %v", err)
}
})
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
out: "",
},
{
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
t.Fatalf("expected kustomization annotate error, got %v", err)
}
})
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"case \"$*\" in\n" +
" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
"esac\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
logText := string(calls)
if !strings.Contains(logText, "annotate gitrepository flux-system") {
t.Fatalf("expected gitrepository annotate call, got %q", logText)
}
if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
t.Fatalf("expected kustomization annotate call, got %q", logText)
}
if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected flux reconcile command, got %q", logText)
}
})
t.Run("flux command failure is tolerated", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 1\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
}
})
}

View File

@@ -1,382 +0,0 @@
package cluster
import (
"context"
"encoding/base64"
"errors"
"io"
"log"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
unsealCalls := 0
jobCreated := false
reconciled := false
deleted := map[string]bool{}
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
return "", nil
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
if unsealCalls == 0 {
return `{"initialized":true,"sealed":true}`, nil
}
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
deleted["maintenance/stale-pod"] = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 1 {
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
}
if !jobCreated {
t.Fatalf("expected vault k8s auth config job to be created")
}
if !deleted["maintenance/stale-pod"] {
t.Fatalf("expected stale unavailable-node pod to be deleted")
}
if !reconciled {
t.Fatalf("expected flux reconcile request after repairs")
}
if deleted["logging/healthy-node-pod"] {
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
}
}
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
unsealCalls := 0
jobCreated := false
reconciled := false
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[]}`, nil
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 0 {
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
}
if jobCreated {
t.Fatalf("did not expect vault auth config job creation")
}
if reconciled {
t.Fatalf("did not expect flux reconcile request for healthy cluster")
}
}
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
}
}
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
err: errors.New("label failed"),
},
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault phase failed"),
},
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("node query failed"),
},
})
err := orch.postStartAutoHeal(context.Background())
if err == nil {
t.Fatalf("expected aggregated error")
}
msg := err.Error()
for _, want := range []string{
"required node labels:",
"vault auto-recovery:",
"dead-node terminating pod cleanup:",
} {
if !strings.Contains(msg, want) {
t.Fatalf("expected %q in %q", want, msg)
}
}
}
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("pod missing is quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault-0 not found"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("phase check error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("phase check failed"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("non-running pod defers", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Pending",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("status parse failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: "garbage",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("already unsealed stays quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":false}`,
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("unseal failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":true}`,
},
{
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
},
{
match: matchContains("kubectl", "vault operator unseal"),
err: errors.New("exec boom"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
}
})
}
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
}
})
t.Run("create error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
err: errors.New("create failed"),
},
})
err := orch.rerunVaultK8sAuthConfigJob(context.Background())
if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
t.Fatalf("expected create-job error, got %v", err)
}
})
}

View File

@@ -227,31 +227,6 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
}
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
// Why: lets startup defer vault unseal until the pod is actually runnable, while
// keeping the direct unseal helper strict for explicit recovery paths and tests.
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
if o.runner.DryRun {
return false, "", nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
}
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
}
trimmedPhase := strings.TrimSpace(phase)
if trimmedPhase != "Running" {
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
}
return false, "", o.ensureVaultUnsealed(ctx)
}
// ensureVaultUnsealed runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@@ -143,8 +143,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
}
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
required := o.startupRequiredFluxKustomizations()
requiredSeen := map[string]struct{}{}
notReady := []string{}
for _, ks := range list.Items {
ns := strings.TrimSpace(ks.Metadata.Namespace)
@@ -156,12 +154,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
if ks.Spec.Suspend {
continue
}
if len(required) > 0 {
if _, ok := required[full]; !ok {
continue
}
requiredSeen[full] = struct{}{}
}
if _, ok := ignored[full]; ok {
continue
}
@@ -181,25 +173,10 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
}
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
}
if len(required) > 0 {
missing := []string{}
for full := range required {
if _, ok := requiredSeen[full]; !ok {
missing = append(missing, full+"(missing)")
}
}
if len(missing) > 0 {
sort.Strings(missing)
notReady = append(notReady, missing...)
}
}
if len(notReady) > 0 {
sort.Strings(notReady)
return false, "not ready: " + joinLimited(notReady, 6), nil
}
if len(required) > 0 {
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
}
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
}

View File

@@ -19,7 +19,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
return nil
}
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
for node := range o.cfg.Startup.RequiredNodeLabels {
node = strings.TrimSpace(node)
@@ -29,10 +28,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
}
sort.Strings(nodes)
for _, node := range nodes {
if _, skip := ignored[node]; skip {
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
continue
}
labels := o.cfg.Startup.RequiredNodeLabels[node]
if len(labels) == 0 {
continue
@@ -60,11 +55,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
continue
}
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
continue
}
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
}
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))

View File

@@ -37,7 +37,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return invErr
}
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
o.maybeRunEarlyVaultUnseal(ctx)
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
@@ -180,9 +179,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
}
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
return err
}
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
return err
}
@@ -480,3 +476,18 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
o.log.Printf("shutdown flow complete")
return nil
}
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}
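
Aside (illustrative sketch, not part of the diff): a quick usage check for the mode normalization above. The behavior follows directly from the switch shown; only the sample inputs are invented.

package main

import (
	"fmt"
	"strings"
)

// normalizeShutdownMode is copied from the snippet above so the example runs standalone.
func normalizeShutdownMode(raw string) (string, error) {
	switch strings.TrimSpace(raw) {
	case "", "config", "cluster-only":
		return "cluster-only", nil
	case "poweroff":
		return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
	default:
		return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
	}
}

func main() {
	for _, raw := range []string{"", "config", "cluster-only", "poweroff", "yolo"} {
		mode, err := normalizeShutdownMode(raw)
		fmt.Printf("%-14q -> mode=%q err=%v\n", raw, mode, err)
	}
	// "", "config", and "cluster-only" all normalize to "cluster-only";
	// "poweroff" and unknown values return errors.
}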

View File

@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
seen := map[string]struct{}{}
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
for _, node := range o.inventoryNodesForValidation() {
node = strings.TrimSpace(node)
if node == "" {
continue

View File

@ -1,261 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
)
// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: a non-core workload that cannot schedule can emit enough warning events to
// thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
return
}
now := time.Now()
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
return
}
if lastAttempt != nil {
*lastAttempt = now
}
o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
return o.quarantineSchedulingStormWorkloads(ctx)
})
}
// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query pods for scheduling storm scan: %w", err)
}
var pods podList
if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
}
rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
}
var rsList replicaSetList
if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
}
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query events for scheduling storm scan: %w", err)
}
var events eventList
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
return fmt.Errorf("decode events for scheduling storm scan: %w", err)
}
workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
}
var workloads workloadList
if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
if eventThreshold <= 0 {
eventThreshold = 30
}
window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
if window <= 0 {
window = 3 * time.Minute
}
podsByKey := map[string]podResource{}
for _, pod := range pods.Items {
ns := strings.TrimSpace(pod.Metadata.Namespace)
name := strings.TrimSpace(pod.Metadata.Name)
if ns == "" || name == "" {
continue
}
podsByKey[ns+"/"+name] = pod
}
rsOwners := map[string]ownerReference{}
for _, rs := range rsList.Items {
ns := strings.TrimSpace(rs.Metadata.Namespace)
name := strings.TrimSpace(rs.Metadata.Name)
if ns == "" || name == "" {
continue
}
for _, owner := range rs.Metadata.OwnerReferences {
kind := strings.TrimSpace(owner.Kind)
ownerName := strings.TrimSpace(owner.Name)
if kind == "" || ownerName == "" {
continue
}
rsOwners[ns+"/"+name] = owner
break
}
}
workloadDesired := map[string]int32{}
for _, item := range workloads.Items {
kind := strings.ToLower(strings.TrimSpace(item.Kind))
ns := strings.TrimSpace(item.Metadata.Namespace)
name := strings.TrimSpace(item.Metadata.Name)
if kind == "" || ns == "" || name == "" {
continue
}
desired, _, ok := desiredReady(item)
if !ok {
continue
}
workloadDesired[ns+"/"+kind+"/"+name] = desired
}
quarantined := []string{}
seen := map[string]struct{}{}
now := time.Now()
for _, event := range events.Items {
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
continue
}
if strings.TrimSpace(event.Reason) != "FailedScheduling" {
continue
}
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
continue
}
lastSeen := eventLastObservedAt(event)
if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
continue
}
count := eventObservationCount(event)
if count < eventThreshold {
continue
}
podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
pod, ok := podsByKey[podKey]
if !ok {
continue
}
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
continue
}
ns := strings.TrimSpace(pod.Metadata.Namespace)
if _, ok := requiredNamespaces[ns]; ok {
continue
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}
if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
continue
}
if podTargetsIgnoredNode(pod, ignoredNodes) {
continue
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
continue
}
if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
continue
}
workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
if _, done := seen[workloadKey]; done {
continue
}
desired := workloadDesired[workloadKey]
if desired <= 0 {
continue
}
if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
}
seen[workloadKey] = struct{}{}
quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
}
if len(quarantined) == 0 {
return nil
}
sort.Strings(quarantined)
detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
o.log.Printf("%s", detail)
o.noteStartupAutoHeal(detail)
return nil
}
// schedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
ns := strings.TrimSpace(pod.Metadata.Namespace)
for _, owner := range pod.Metadata.OwnerReferences {
switch strings.TrimSpace(owner.Kind) {
case "StatefulSet":
if name := strings.TrimSpace(owner.Name); name != "" {
return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
}
case "ReplicaSet":
rsName := strings.TrimSpace(owner.Name)
if rsName == "" {
continue
}
rsOwner, ok := rsOwners[ns+"/"+rsName]
if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
continue
}
return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
}
}
return startupWorkload{}, false
}
// eventObservationCount runs one orchestration or CLI step.
// Signature: eventObservationCount(event eventResource) int.
// Why: event count can live either on the root event or in the series payload;
// using the max keeps detection stable across Kubernetes versions.
func eventObservationCount(event eventResource) int {
count := event.Count
if event.Series.Count > count {
count = event.Series.Count
}
if count < 1 {
return 1
}
return count
}
// eventLastObservedAt runs one orchestration or CLI step.
// Signature: eventLastObservedAt(event eventResource) time.Time.
// Why: event recency fields vary by cluster version; prefer the newest explicit
// observation time and fall back to creation time when needed.
func eventLastObservedAt(event eventResource) time.Time {
switch {
case !event.Series.LastObservedTime.IsZero():
return event.Series.LastObservedTime
case !event.LastTimestamp.IsZero():
return event.LastTimestamp
case !event.EventTime.IsZero():
return event.EventTime
default:
return event.Metadata.CreationTimestamp
}
}
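
Aside (illustrative sketch, not part of the diff): the two event helpers above normalize fields that differ across Kubernetes event versions. A compact sketch of the same count and recency fallbacks using simplified local types rather than the orchestrator's real structs.

package main

import (
	"fmt"
	"time"
)

type series struct {
	Count            int
	LastObservedTime time.Time
}

type event struct {
	Count         int
	EventTime     time.Time
	LastTimestamp time.Time
	Created       time.Time
	Series        series
}

// observationCount prefers whichever of event.count / series.count is larger
// and never reports fewer than one observation.
func observationCount(e event) int {
	n := e.Count
	if e.Series.Count > n {
		n = e.Series.Count
	}
	if n < 1 {
		return 1
	}
	return n
}

// lastObservedAt walks the recency fields from most to least specific.
func lastObservedAt(e event) time.Time {
	switch {
	case !e.Series.LastObservedTime.IsZero():
		return e.Series.LastObservedTime
	case !e.LastTimestamp.IsZero():
		return e.LastTimestamp
	case !e.EventTime.IsZero():
		return e.EventTime
	default:
		return e.Created
	}
}

func main() {
	created := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)
	e := event{Count: 3, Series: series{Count: 40}, Created: created}
	fmt.Println(observationCount(e)) // 40 — the series count wins when it is larger
	fmt.Println(lastObservedAt(e))   // falls back to the creation timestamp
}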

View File

@ -1,21 +0,0 @@
package cluster
import (
"fmt"
"strings"
)
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}

View File

@ -1,81 +0,0 @@
package cluster
import "strings"
// startupRequiredNodes runs one orchestration or CLI step.
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
// Why: lets startup enforce a smaller core node set during outage recovery
// without losing the stricter all-nodes behavior when no override is configured.
func startupRequiredNodes(nodes []string, required []string) []string {
requiredSet := makeStringSet(required)
if len(requiredSet) == 0 {
return nodes
}
filtered := make([]string, 0, len(nodes))
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue
}
if _, ok := requiredSet[node]; ok {
filtered = append(filtered, node)
}
}
return filtered
}
// startupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
// Why: absent or broken non-core nodes should not block recovery-only actions
// like label reconciliation once the operator has narrowed startup to core nodes.
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
node = strings.TrimSpace(node)
if node == "" {
return false
}
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
return true
}
for _, controlPlane := range o.cfg.ControlPlanes {
if strings.TrimSpace(controlPlane) == node {
return true
}
}
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
return true
}
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
}
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
// optional stacks free to converge after bootstrap succeeds.
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
}
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: keeps workload readiness scoped to core namespaces during recovery while
// preserving broad convergence checks when no explicit core list is configured.
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
}
// containsNode runs one orchestration or CLI step.
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
func containsNode(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}
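
Aside (illustrative sketch, not part of the diff): the scoping helpers above narrow startup gates to a declared core node set. A standalone sketch of the filtering behavior, with an invented inventory; it mirrors startupRequiredNodes as shown rather than adding anything new.

package main

import (
	"fmt"
	"strings"
)

// filterRequired mirrors startupRequiredNodes above: an empty required list
// keeps the stricter all-nodes behavior, while a non-empty list narrows
// startup gating to the declared core set.
func filterRequired(nodes, required []string) []string {
	req := map[string]struct{}{}
	for _, r := range required {
		if r = strings.TrimSpace(r); r != "" {
			req[r] = struct{}{}
		}
	}
	if len(req) == 0 {
		return nodes
	}
	out := []string{}
	for _, n := range nodes {
		if _, ok := req[strings.TrimSpace(n)]; ok {
			out = append(out, n)
		}
	}
	return out
}

func main() {
	inventory := []string{"titan-0a", "titan-09", "titan-22"}
	fmt.Println(filterRequired(inventory, nil))                  // all nodes still gate startup
	fmt.Println(filterRequired(inventory, []string{"titan-0a"})) // outage recovery: only the core node gates
}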

View File

@ -1,52 +0,0 @@
package cluster
import (
"context"
"fmt"
"time"
)
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
// Why: gives startup a best-effort Vault recovery path when the API is already
// live, without consuming the hard startup failure path before workloads recover.
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
return
}
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.log.Printf("warning: early vault unseal deferred: %v", err)
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
return
}
if deferred {
o.log.Printf("vault early unseal deferred: %s", detail)
o.noteStartupAutoHeal(detail)
return
}
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
}
// runStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
// defer cleanly until critical workload recovery when the pod is not runnable yet.
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.noteStartupCheck("vault-unseal", false, err.Error())
return err
}
if deferred {
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
o.noteStartupAutoHeal(detail)
o.noteStartupCheck("vault-unseal", true, detail)
return nil
}
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
return nil
}
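
Aside (illustrative sketch, not part of the diff): both helpers above fold the three outcomes of ensureVaultUnsealedWhenRunnable (error, deferred, unsealed) into startup bookkeeping. A simplified decision sketch; the check recording is replaced by plain prints and the names are illustrative only.

package main

import (
	"errors"
	"fmt"
)

// gateResult captures the three outcomes the startup vault gate distinguishes.
type gateResult struct {
	deferred bool
	detail   string
	err      error
}

// runGate mirrors the control flow above: an error fails the gate, a deferral
// records a note but lets startup continue, and success marks the check passed.
func runGate(r gateResult) error {
	if r.err != nil {
		fmt.Println("check vault-unseal: FAIL:", r.err)
		return r.err
	}
	if r.deferred {
		fmt.Println("check vault-unseal: OK (deferred):", r.detail)
		return nil
	}
	fmt.Println("check vault-unseal: OK: vault is unsealed")
	return nil
}

func main() {
	_ = runGate(gateResult{})                                                 // already unsealed
	_ = runGate(gateResult{deferred: true, detail: "vault pod not runnable"}) // deferred until workload recovery
	_ = runGate(gateResult{err: errors.New("unseal key file missing")})       // hard failure
}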

View File

@ -177,46 +177,6 @@ type jobConditionRef struct {
Status string `json:"status"`
}
type eventList struct {
Items []eventResource `json:"items"`
}
type eventResource struct {
Metadata struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
} `json:"metadata"`
InvolvedObject struct {
Kind string `json:"kind"`
Namespace string `json:"namespace"`
Name string `json:"name"`
} `json:"involvedObject"`
Type string `json:"type"`
Reason string `json:"reason"`
Message string `json:"message"`
Count int `json:"count"`
EventTime time.Time `json:"eventTime"`
LastTimestamp time.Time `json:"lastTimestamp"`
Series eventSeries `json:"series"`
}
type eventSeries struct {
Count int `json:"count"`
LastObservedTime time.Time `json:"lastObservedTime"`
}
type replicaSetList struct {
Items []replicaSetResource `json:"items"`
}
type replicaSetResource struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
OwnerReferences []ownerReference `json:"ownerReferences"`
} `json:"metadata"`
}
type workloadResource struct {
Kind string `json:"kind"`
Metadata struct {
@ -261,7 +221,6 @@ type podResource struct {
type ownerReference struct {
Kind string `json:"kind"`
Name string `json:"name"`
}
type podContainerStatus struct {

View File

@ -26,12 +26,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
lastSchedulingStormHeal := time.Time{}
for {
prevFailure := lastFailure
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
o.maybeAutoQuarantineSchedulingStorms(ctx, &lastSchedulingStormHeal)
ready, detail, err := o.workloadConvergenceReady(ctx)
if err != nil {
lastFailure = err.Error()
@ -73,7 +71,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if err := json.Unmarshal([]byte(out), &list); err != nil {
return false, "", fmt.Errorf("decode controllers: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
@ -87,11 +84,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if kind == "" || ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}
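
Aside (illustrative sketch, not part of the diff): the hunk above shows how convergence scoping layers a required-namespace allowlist ahead of the ignore lists. A tiny predicate sketch of that ordering with invented namespaces.

package main

import "fmt"

// shouldGate reproduces the scoping order visible above: a non-empty required
// namespace set keeps only core namespaces, then explicit ignores drop the rest.
func shouldGate(ns string, required, ignored map[string]struct{}) bool {
	if len(required) > 0 {
		if _, ok := required[ns]; !ok {
			return false
		}
	}
	if _, ok := ignored[ns]; ok {
		return false
	}
	return true
}

func main() {
	required := map[string]struct{}{"monitoring": {}, "vault": {}}
	ignored := map[string]struct{}{"media": {}}
	for _, ns := range []string{"monitoring", "media", "apps"} {
		fmt.Printf("%s gates convergence: %t\n", ns, shouldGate(ns, required, ignored))
	}
	// monitoring gates convergence: true; media and apps do not.
}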

View File

@ -116,7 +116,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
return nil, fmt.Errorf("decode pods: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
stuckReasons := map[string]struct{}{
@ -139,11 +138,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
if ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}

View File

@ -1,88 +0,0 @@
package cluster
import (
"context"
"fmt"
"strings"
"time"
)
// TestHookMaybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: exposes the scheduling-storm trigger guard to the split top-level test module.
func (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
o.maybeAutoQuarantineSchedulingStorms(ctx, lastAttempt)
}
// TestHookQuarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: exposes the scheduling-storm auto-heal body to the split top-level test module.
func (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error {
return o.quarantineSchedulingStormWorkloads(ctx)
}
// TestHookSchedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormOwnerWorkload(namespace string, ownerKind string, ownerName string, rsOwnerKind string, rsOwnerName string) (string, bool).
// Why: exposes owner-resolution behavior without leaking internal workload types.
func TestHookSchedulingStormOwnerWorkload(
namespace string,
ownerKind string,
ownerName string,
rsOwnerKind string,
rsOwnerName string,
) (string, bool) {
var pod podResource
pod.Metadata.Namespace = strings.TrimSpace(namespace)
pod.Metadata.OwnerReferences = []ownerReference{{
Kind: strings.TrimSpace(ownerKind),
Name: strings.TrimSpace(ownerName),
}}
rsOwners := map[string]ownerReference{}
if rsName := strings.TrimSpace(ownerName); rsName != "" && strings.TrimSpace(ownerKind) == "ReplicaSet" {
rsOwners[pod.Metadata.Namespace+"/"+rsName] = ownerReference{
Kind: strings.TrimSpace(rsOwnerKind),
Name: strings.TrimSpace(rsOwnerName),
}
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
return "", false
}
return fmt.Sprintf("%s/%s/%s", workload.Namespace, workload.Kind, workload.Name), true
}
// TestHookEventObservationCount runs one orchestration or CLI step.
// Signature: TestHookEventObservationCount(count int, seriesCount int) int.
// Why: exposes event-count normalization used by scheduling-storm detection.
func TestHookEventObservationCount(count int, seriesCount int) int {
return eventObservationCount(eventResource{
Count: count,
Series: eventSeries{
Count: seriesCount,
},
})
}
// TestHookEventLastObservedAt runs one orchestration or CLI step.
// Signature: TestHookEventLastObservedAt(seriesLastObserved time.Time, lastTimestamp time.Time, eventTime time.Time, creationTimestamp time.Time) time.Time.
// Why: exposes event-time fallback behavior used by scheduling-storm detection.
func TestHookEventLastObservedAt(
seriesLastObserved time.Time,
lastTimestamp time.Time,
eventTime time.Time,
creationTimestamp time.Time,
) time.Time {
return eventLastObservedAt(eventResource{
LastTimestamp: lastTimestamp,
EventTime: eventTime,
Series: eventSeries{
LastObservedTime: seriesLastObserved,
},
Metadata: struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
}{
CreationTimestamp: creationTimestamp,
},
})
}

View File

@ -1,55 +0,0 @@
package cluster
import "context"
// TestHookStartupRequiredNodes runs one orchestration or CLI step.
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
return startupRequiredNodes(nodes, required)
}
// TestHookContainsNode runs one orchestration or CLI step.
// Signature: TestHookContainsNode(entries []string, needle string) bool.
// Why: exposes the small startup-scope membership helper to top-level tests.
func TestHookContainsNode(entries []string, needle string) bool {
return containsNode(entries, needle)
}
// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
// non-core nodes stop blocking bootstrap.
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
return o.startupNodeStrictlyRequired(node)
}
// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
// Why: exposes flux startup scoping so top-level tests can confirm only core
// kustomizations block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
return o.startupRequiredFluxKustomizations()
}
// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: exposes workload namespace startup scoping so top-level tests can
// confirm only core workloads block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
return o.startupRequiredWorkloadNamespaces()
}
// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
// Why: exposes the early startup Vault deferral helper to top-level tests.
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
o.maybeRunEarlyVaultUnseal(ctx)
}
// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
// Why: exposes the startup Vault gate helper to top-level tests.
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
return o.runStartupVaultUnsealGate(ctx)
}

View File

@ -33,9 +33,6 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
c.Startup.NodeInventoryReachPollSeconds = 5
}
if c.Startup.NodeInventoryReachRequiredNodes == nil {
c.Startup.NodeInventoryReachRequiredNodes = []string{}
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
@ -124,11 +121,7 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
}
if c.Startup.ServiceChecklistExplicitOnly {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
} else {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
}
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
@ -159,18 +152,12 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
c.Startup.NodeSSHAuthPollSeconds = 5
}
if c.Startup.NodeSSHAuthRequiredNodes == nil {
c.Startup.NodeSSHAuthRequiredNodes = []string{}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.FluxHealthRequiredKustomizations == nil {
c.Startup.FluxHealthRequiredKustomizations = []string{}
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
@ -180,9 +167,6 @@ func (c *Config) applyDefaults() {
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
@ -195,12 +179,6 @@ func (c *Config) applyDefaults() {
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
if c.Startup.PostStartAutoHealSeconds <= 0 {
c.Startup.PostStartAutoHealSeconds = 60
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
c.Startup.DeadNodeCleanupGraceSeconds = 300
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
}
@ -243,12 +221,6 @@ func (c *Config) applyDefaults() {
if c.UPS.TelemetryTimeoutSeconds <= 0 {
c.UPS.TelemetryTimeoutSeconds = 90
}
if c.Startup.SchedulingStormEventThreshold <= 0 {
c.Startup.SchedulingStormEventThreshold = 30
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
c.Startup.SchedulingStormWindowSeconds = 180
}
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
}

View File

@ -39,25 +39,24 @@ func defaults() Config {
"maintenance",
},
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5,
NodeInventoryReachRequiredNodes: []string{},
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5,
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
StorageCriticalPVCs: []string{
"vault/data-vault-0",
"postgres/postgres-data-postgres-0",
@ -92,36 +91,33 @@ func defaults() Config {
AdminSecretUsernameKey: "username",
AdminSecretPasswordKey: "password",
},
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
NodeSSHAuthRequiredNodes: []string{},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
FluxHealthRequiredKustomizations: []string{},
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
WorkloadConvergenceRequiredNamespaces: []string{},
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,

View File

@ -51,41 +51,3 @@ startup:
t.Fatalf("expected validation failure")
}
}
// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step.
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
// Why: host recovery configs must be able to keep a narrow, explicit checklist
// without silently inheriting the full default service catalog.
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
raw := `
control_planes: [titan-0a]
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
iac_repo_path: /opt/titan-iac
startup:
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
ups:
enabled: false
`
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
t.Fatalf("write config: %v", err)
}
cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}
if len(cfg.Startup.ServiceChecklist) != 1 {
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
}
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
}
}

View File

@ -27,75 +27,65 @@ type Config struct {
}
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
AutoQuarantineSchedulingStorms bool `yaml:"auto_quarantine_scheduling_storms"`
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
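
Aside (illustrative sketch, not part of the diff): the yaml tags above are the contract between ananke.yaml and the Startup struct. A minimal sketch of that mapping with just a few of the fields, assuming gopkg.in/yaml.v3 as the decoder — the diff does not show which YAML library the repository actually uses.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// startup carries a small subset of the fields above, with the same yaml keys.
type startup struct {
	APIWaitSeconds         int      `yaml:"api_wait_seconds"`
	RequireFluxHealth      bool     `yaml:"require_flux_health"`
	IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
}

func main() {
	raw := "api_wait_seconds: 1200\nrequire_flux_health: true\nignore_unavailable_nodes: [titan-22]\n"
	var s startup
	if err := yaml.Unmarshal([]byte(raw), &s); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", s)
	// {APIWaitSeconds:1200 RequireFluxHealth:true IgnoreUnavailableNodes:[titan-22]}
}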
type ServiceChecklistCheck struct {
@ -146,7 +136,6 @@ type UPS struct {
Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
}

View File

@ -61,11 +61,6 @@ func (c Config) Validate() error {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
}
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
}
}
for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
@ -238,46 +233,21 @@ func (c Config) Validate() error {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
}
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
}
if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
}
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
}
}
if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
}
if c.Startup.PostStartAutoHealSeconds <= 0 {
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
}
for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
@ -307,16 +277,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
}
}
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
}
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
@ -332,9 +292,6 @@ func (c Config) Validate() error {
if c.UPS.Provider == "" {
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
}
if c.UPS.OnBatteryGraceSeconds < 0 {
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
}
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
}
@ -349,14 +306,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
}
}
if c.Startup.AutoQuarantineSchedulingStorms {
if c.Startup.SchedulingStormEventThreshold <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_event_threshold must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_window_seconds must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
}
for _, peer := range c.Coordination.PeerHosts {
if strings.TrimSpace(peer) == "" {
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
@ -379,20 +328,3 @@ func (c Config) Validate() error {
}
return nil
}
// containsTrimmed runs one orchestration or CLI step.
// Signature: containsTrimmed(entries []string, needle string) bool.
// Why: startup config now supports both required and ignored recovery scopes, so
// validation needs a single normalized overlap check for those lists.
func containsTrimmed(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}
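
Aside (illustrative sketch, not part of the diff): containsTrimmed backs the overlap checks earlier in Validate, which reject a kustomization or namespace that is both required and ignored. A tiny standalone check of that rule; the function body is copied from the snippet above, the sample lists are invented.

package main

import (
	"fmt"
	"strings"
)

// containsTrimmed is copied from the snippet above so the example runs standalone.
func containsTrimmed(entries []string, needle string) bool {
	needle = strings.TrimSpace(needle)
	if needle == "" {
		return false
	}
	for _, entry := range entries {
		if strings.TrimSpace(entry) == needle {
			return true
		}
	}
	return false
}

func main() {
	required := []string{"flux-system/core"}
	ignored := []string{" flux-system/core "}
	for _, item := range required {
		if containsTrimmed(ignored, item) {
			fmt.Printf("invalid config: %q is both required and ignored\n", strings.TrimSpace(item))
		}
	}
}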

View File

@ -30,7 +30,6 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
@ -69,42 +68,19 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
}},
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
}},
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
{"bad_scheduling_storm_threshold", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormEventThreshold = 0
}},
{"bad_scheduling_storm_window", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormWindowSeconds = 0
}},
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
{"bad_ups_targets_item_empty", func(c *Config) {
c.UPS.Enabled = true
@ -145,13 +121,6 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup defaults to be set")
}
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
t.Fatalf("expected post-start auto-heal defaults to be set")
}
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
t.Fatalf("expected startup recovery scope slices to be initialized")
}
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
t.Fatalf("expected critical service endpoint timing defaults to be set")
}

View File

@ -32,8 +32,6 @@ type Daemon struct {
targets []Target
log *log.Logger
exporter *metrics.Exporter
postStartAutoHealOverride func(context.Context) error
}
var sshConfigCandidates = []string{
@ -94,9 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood := map[string]time.Time{}
lastOnBattery := map[string]bool{}
onBatterySince := map[string]time.Time{}
breachCount := map[string]int{}
lastAutoHeal := time.Time{}
for _, t := range d.targets {
lastGood[t.Name] = time.Now()
}
@ -111,16 +107,12 @@ func (d *Daemon) Run(ctx context.Context) error {
case <-t.C:
budget := d.orch.EstimatedEmergencyShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
anyOnBattery := false
d.exporter.UpdateBudget(budget)
for _, target := range d.targets {
sample, err := target.Provider.Read(ctx)
if err != nil {
if lastOnBattery[target.Name] {
anyOnBattery = true
}
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
@ -139,45 +131,17 @@ func (d *Daemon) Run(ctx context.Context) error {
}
lastGood[target.Name] = time.Now()
if sample.OnBattery {
anyOnBattery = true
}
wasOnBattery := lastOnBattery[target.Name]
if sample.OnBattery {
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
onBatterySince[target.Name] = time.Now()
}
} else {
onBatterySince[target.Name] = time.Time{}
}
lastOnBattery[target.Name] = sample.OnBattery
onBatteryElapsed := 0
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
}
trigger := false
triggerReason := ""
switch {
case sample.LowBattery:
trigger = true
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
trigger = true
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
trigger = true
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
}
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
if trigger {
breachCount[target.Name]++
} else {
breachCount[target.Name] = 0
}
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
@ -196,54 +160,14 @@ func (d *Daemon) Run(ctx context.Context) error {
})
if breachCount[target.Name] >= debounce {
return d.triggerShutdown(ctx, triggerReason)
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
return d.triggerShutdown(ctx, reason)
}
}
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
}
}
}
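
Aside (illustrative sketch, not part of the diff): the daemon loop above decides to shut down from up to three signals — a low-battery flag, a runtime estimate at or below ceil(budget × safety factor), and, on one side of this diff, a sustained on-battery window — debounced across consecutive polls. A condensed sketch of that decision with simplified types; field and function names are illustrative.

package main

import (
	"fmt"
	"math"
)

type sample struct {
	OnBattery        bool
	LowBattery       bool
	RuntimeSeconds   int
	OnBatteryElapsed int // seconds since the target first reported on-battery
}

// shouldTrigger mirrors the trigger conditions visible above. graceSeconds <= 0
// disables the sustained-on-battery path.
func shouldTrigger(s sample, budget int, safety float64, graceSeconds int) (bool, string) {
	threshold := int(math.Ceil(float64(budget) * safety))
	switch {
	case s.LowBattery:
		return true, "ups-low-battery"
	case s.OnBattery && s.RuntimeSeconds > 0 && s.RuntimeSeconds <= threshold:
		return true, fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds", s.RuntimeSeconds, threshold)
	case s.OnBattery && graceSeconds > 0 && s.OnBatteryElapsed >= graceSeconds:
		return true, fmt.Sprintf("ups-on-battery elapsed=%ds grace=%ds", s.OnBatteryElapsed, graceSeconds)
	default:
		return false, ""
	}
}

func main() {
	breach, debounce := 0, 2
	samples := []sample{
		{OnBattery: true, RuntimeSeconds: 900}, // plenty of runtime: no trigger
		{OnBattery: true, RuntimeSeconds: 120}, // at threshold: breach 1
		{OnBattery: true, RuntimeSeconds: 110}, // below threshold: breach 2 -> shut down
	}
	for _, s := range samples {
		ok, reason := shouldTrigger(s, 100, 1.2, 300)
		if ok {
			breach++
		} else {
			breach = 0
		}
		fmt.Printf("trigger=%t breach=%d reason=%s\n", ok, breach, reason)
		if breach >= debounce {
			fmt.Println("initiating emergency shutdown:", reason)
			break
		}
	}
}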
// maybeRunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
// Why: gives the long-running daemon a bounded path to repair post-start drift
// like a later Vault reseal or stale dead-node deletions without waiting for a
// fresh bootstrap run.
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
if interval <= 0 || anyOnBattery {
return
}
if d.orch == nil && d.postStartAutoHealOverride == nil {
return
}
now := time.Now()
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
return
}
if lastRun != nil {
*lastRun = now
}
if err := d.runPostStartAutoHeal(ctx); err != nil {
d.log.Printf("warning: post-start auto-heal: %v", err)
}
}
// runPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
// Why: keeps the daemon loop readable while allowing unit tests to inject a
// deterministic repair hook without a live cluster.
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
if d.postStartAutoHealOverride != nil {
return d.postStartAutoHealOverride(ctx)
}
if d.orch == nil {
return nil
}
return d.orch.RunPostStartAutoHeal(ctx)
}
// triggerShutdown runs one orchestration or CLI step.
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -165,50 +165,6 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
}
}
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace runs one orchestration or CLI step.
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
// the only path to a graceful shutdown during abrupt power loss.
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
stateDir := t.TempDir()
orch := newDaemonTestOrchestrator(t, stateDir)
d := &Daemon{
cfg: config.Config{
UPS: config.UPS{
Enabled: true,
PollSeconds: 1,
DebounceCount: 1,
RuntimeSafetyFactor: 1.0,
OnBatteryGraceSeconds: 1,
},
State: config.State{
IntentPath: filepath.Join(stateDir, "intent.json"),
},
Shutdown: config.Shutdown{
EmergencySkipDrain: true,
EmergencySkipEtcd: true,
},
},
orch: orch,
targets: []Target{
{
Name: "Pyrphoros",
Target: "pyrphoros@localhost",
Provider: &daemonFakeProvider{
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
},
},
},
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := d.Run(ctx); err != nil {
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
}
}
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
// Why: covers forward-shutdown SSH execution path.

View File

@ -1,51 +0,0 @@
package service
import (
"context"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestDaemonMaybeRunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
// Why: covers the daemon-side interval and on-battery guards for the new
// post-start repair loop.
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
calls := 0
d := &Daemon{
cfg: config.Config{
Startup: config.Startup{
PostStartAutoHealSeconds: 10,
},
},
postStartAutoHealOverride: func(context.Context) error {
calls++
return nil
},
}
var last time.Time
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected first auto-heal invocation, got %d", calls)
}
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
if calls != 1 {
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 2 {
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
}
}

View File

@ -22,23 +22,12 @@ type Intent struct {
UpdatedAt time.Time `json:"updated_at"`
}
var (
readIntentImpl = readIntentDefault
writeIntentImpl = writeIntentDefault
)
var writeIntentImpl = writeIntentDefault
// ReadIntent runs one orchestration or CLI step.
// Signature: ReadIntent(path string) (Intent, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func ReadIntent(path string) (Intent, error) {
return readIntentImpl(path)
}
// readIntentDefault runs one orchestration or CLI step.
// Signature: readIntentDefault(path string) (Intent, error).
// Why: keeps production read behavior available while tests can override intent
// reads deterministically without racing background file mutations.
func readIntentDefault(path string) (Intent, error) {
b, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {

View File

@ -22,34 +22,6 @@ func TestHookWriteIntentDefault(path string, in Intent) error {
return writeIntentDefault(path, in)
}
// TestHookReadIntentDefault runs one orchestration or CLI step.
// Signature: TestHookReadIntentDefault(path string) (Intent, error).
// Why: lets top-level tests delegate to production ReadIntent behavior while
// selectively forcing deterministic read sequences for lifecycle branches.
func TestHookReadIntentDefault(path string) (Intent, error) {
return readIntentDefault(path)
}
// TestHookSetReadIntentOverride runs one orchestration or CLI step.
// Signature: TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()).
// Why: enables deterministic intent-read failure injection without sleeping
// goroutines that race slower CI agents.
func TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()) {
testHookOverrideMu.Lock()
prev := readIntentImpl
if fn == nil {
readIntentImpl = readIntentDefault
} else {
readIntentImpl = fn
}
testHookOverrideMu.Unlock()
return func() {
testHookOverrideMu.Lock()
readIntentImpl = prev
testHookOverrideMu.Unlock()
}
}
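A hypothetical caller of the hook above, sketched to show the intended pattern: swap in a failing reader, exercise the branch under test, and restore the default on exit. The test name and injected error are illustrative, not part of this change; the sketch assumes the standard errors and testing imports and lives in the same package as Intent.

func TestLifecycleSurvivesIntentReadFailure(t *testing.T) {
	restore := TestHookSetReadIntentOverride(func(path string) (Intent, error) {
		// Deterministically fail every read instead of racing background file mutations.
		return Intent{}, errors.New("injected intent read failure")
	})
	defer restore()
	// ... exercise the code path that calls ReadIntent and assert it degrades gracefully ...
}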
// TestHookSetWriteIntentOverride runs one orchestration or CLI step.
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
// Why: enables deterministic intent-write failure injection from the top-level

View File

@ -1,116 +0,0 @@
# Binary, config template, and systemd artifact helpers for the installer.
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
install_systemd_units() {
local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}

View File

@ -1,334 +0,0 @@
# Config migration helpers for the Ananke host installer.
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
install_cluster_inventory_defaults "${role_hint}" && changed=1
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
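The service_checklist entries added by the migration above carry four fields per probe: url, accepted_statuses, body_contains, and timeout_seconds. As a hedged illustration of how one such entry could be evaluated (this is not the daemon's implementation, which is not shown in this diff; the function name and exact semantics are assumptions; assumed imports: "context", "io", "net/http", "strings", "time"):

func probeChecklistEntry(ctx context.Context, url string, acceptedStatuses []int, bodyContains string, timeout time.Duration) (bool, error) {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return false, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return false, err
	}
	// Accept only the configured status codes, then require the body marker.
	statusOK := false
	for _, code := range acceptedStatuses {
		if resp.StatusCode == code {
			statusOK = true
			break
		}
	}
	return statusOK && strings.Contains(string(body), bodyContains), nil
}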
install_cluster_inventory_defaults() {
local role="$1"
local changed=0
local inventory_block=""
local managed_block=""
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
install_peer_inventory_defaults && changed=1
fi
[[ "${changed}" -eq 1 ]]
}
install_peer_inventory_defaults() {
local changed=0
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
[[ "${changed}" -eq 1 ]]
}
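The hydration above fills three YAML shapes: ssh_node_hosts (node name to IP), ssh_managed_nodes (a flat node list), and workers (the ordering used for startup/shutdown orchestration). A hedged sketch of the Go types those shapes would naturally unmarshal into; the struct name, fields, and yaml tags are assumptions for illustration, not copied from internal/config:

// Illustrative only; not the daemon's actual config structs.
type inventorySketch struct {
	SSHNodeHosts    map[string]string `yaml:"ssh_node_hosts"`    // e.g. "titan-0a" -> "192.168.22.11"
	SSHManagedNodes []string          `yaml:"ssh_managed_nodes"` // nodes eligible for SSH orchestration
	Workers         []string          `yaml:"workers"`           // worker nodes for startup/shutdown ordering
}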
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# If a legacy migration bug appended root-level node entries after
# ssh_managed_nodes, drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}

View File

@ -1,239 +0,0 @@
# Host bootstrap helpers for the Ananke installer.
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
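resolve_nut_ups_name leans on the daemon's target: <ups>@localhost convention, the same shape the tests use (pyrphoros@localhost). A one-line sketch of that split on the Go side, assuming strings.Cut; the helper name is illustrative:

// Split "pyrphoros@localhost" into ("pyrphoros", "localhost", true).
func splitUPSTarget(target string) (upsName, host string, ok bool) {
	return strings.Cut(target, "@")
}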
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
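configure_nut provisions the standalone NUT stack that the daemon polls. As a hedged sketch only (the daemon's real provider is not part of this diff), standard NUT status tokens such as OL, OB, and LB map naturally onto the sample fields the tests exercise; assumed import: "strings".

// Sketch: derive the test-visible sample fields from NUT's ups.status string
// and battery.runtime value. Field names mirror the tests above.
type nutSampleSketch struct {
	OnBattery      bool
	LowBattery     bool
	RuntimeSeconds int
	RawStatus      string
}

func nutSampleFromStatus(status string, runtimeSeconds int) nutSampleSketch {
	s := nutSampleSketch{RuntimeSeconds: runtimeSeconds, RawStatus: status}
	for _, flag := range strings.Fields(status) { // e.g. "OB LB" -> ["OB", "LB"]
		switch flag {
		case "OB":
			s.OnBattery = true
		case "LB":
			s.LowBattery = true
		}
	}
	return s
}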

View File

@ -1,98 +0,0 @@
# Legacy Hecate migration helpers for the Ananke installer.
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}

View File

@ -41,10 +41,829 @@ while [[ $# -gt 0 ]]; do
esac
done
source "${REPO_DIR}/scripts/install-config-migration.sh"
source "${REPO_DIR}/scripts/install-host-bootstrap.sh"
source "${REPO_DIR}/scripts/install-legacy-migration.sh"
source "${REPO_DIR}/scripts/install-artifacts.sh"
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
local role
role="$(read_ananke_role)"
local inventory_block
local managed_block
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
elif [[ "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]]; then
if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# Legacy migration bug guard:
# If root-level "- node" entries were accidentally appended after ssh_managed_nodes,
# drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
install_systemd_units() {
local source_map
local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
ensure_dependencies
migrate_legacy_hecate_install

View File

@ -6,28 +6,9 @@ cd "${REPO_DIR}"
export PATH="$(go env GOPATH)/bin:${PATH}"
STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
run_with_retry() {
local attempts="$1"
shift
local try=1
local delay=3
local rc=0
while true; do
"$@" && return 0
rc=$?
if [[ "${try}" -ge "${attempts}" ]]; then
return "${rc}"
fi
echo "[lint] retry ${try}/${attempts} after rc=${rc}: $*" >&2
sleep "${delay}"
delay=$((delay * 2))
try=$((try + 1))
done
}
if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
run_with_retry 4 go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
fi
echo "[lint] go vet"

View File

@ -77,17 +77,6 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
return 0.0
def _series_exists(pushgateway_url: str, metric: str, labels: dict[str, str], timeout_seconds: float) -> bool:
"""Return whether Pushgateway already has a series for this build."""
text = _read_http(f"{pushgateway_url.rstrip('/')}/metrics", timeout_seconds)
for line in text.splitlines():
if not line.startswith(metric + "{"):
continue
if all(f'{key}="{value}"' in line for key, value in labels.items()):
return True
return False
def _build_payload(
suite: str,
trigger: str,
@ -100,25 +89,9 @@ def _build_payload(
tests_skipped: int,
test_cases: list[tuple[str, str]],
coverage_percent: float,
source_files_total: int,
source_lines_over_500: int,
branch: str,
build_number: str,
jenkins_job: str,
checks: dict[str, str],
) -> str:
build_labels = {
"suite": suite,
"branch": branch,
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
}
test_case_base_labels = {
"suite": suite,
"branch": branch,
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
}
lines = [
"# TYPE platform_quality_gate_runs_total counter",
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
@ -132,30 +105,21 @@ def _build_payload(
f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
"# TYPE platform_quality_gate_source_files_total gauge",
f'platform_quality_gate_source_files_total{{suite="{suite}"}} {source_files_total}',
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
"# TYPE platform_quality_gate_build_info gauge",
f"platform_quality_gate_build_info{_label_str(build_labels)} 1",
"# TYPE platform_quality_gate_test_case_result gauge",
"# TYPE ananke_quality_gate_checks_total gauge",
"# TYPE ananke_quality_gate_publish_info gauge",
f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
]
lines.extend(
f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
for test_name, test_status in test_cases
)
lines.extend(
f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
for check_name, check_status in checks.items()
)
lines.append("# TYPE platform_quality_gate_test_case_result gauge")
if test_cases:
lines.extend(
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
for test_name, test_status in test_cases
)
else:
lines.append(
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
)
return "\n".join(lines) + "\n"
@ -172,7 +136,8 @@ def _read_coverage_percent(path: str) -> float:
return 0.0
def _iter_source_files(repo_root: Path):
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
count = 0
for rel_root in SOURCE_SCAN_ROOTS:
base = repo_root / rel_root
if not base.exists():
@ -182,37 +147,12 @@ def _iter_source_files(repo_root: Path):
continue
if path.suffix not in SOURCE_EXTENSIONS:
continue
if path.name.endswith("_test.go") or path.name.endswith(".test.py"):
continue
yield path
def _count_source_files(repo_root: Path) -> int:
return sum(1 for _ in _iter_source_files(repo_root))
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
count = 0
for path in _iter_source_files(repo_root):
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
if lines > max_lines:
count += 1
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
if lines > max_lines:
count += 1
return count
def _unit_tests_failed(output_path: Path, coverage_percent: float) -> bool:
if coverage_percent <= 0 or not output_path.exists():
return True
text = output_path.read_text(encoding="utf-8", errors="ignore")
start_marker = "[quality] unit tests + workspace coverage profile"
end_marker = "[quality] hygiene: doc contracts"
if start_marker in text:
text = text.split(start_marker, 1)[1]
if end_marker in text:
text = text.split(end_marker, 1)[0]
return bool(re.search(r"^(--- FAIL:|FAIL\b)", text, flags=re.M))
def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
if not output_path.exists():
return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
@ -226,37 +166,14 @@ def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
"""Parse per-test status records from go test output text."""
if not output_path.exists():
return []
text = output_path.read_text(encoding="utf-8", errors="ignore")
cases: list[tuple[str, str]] = []
patterns = {
"passed": re.compile(r"^--- PASS: ([^\s(]+)", flags=re.M),
"failed": re.compile(r"^--- FAIL: ([^\s(]+)", flags=re.M),
"skipped": re.compile(r"^--- SKIP: ([^\s(]+)", flags=re.M),
}
for status, pattern in patterns.items():
for test_name in pattern.findall(text):
cleaned = str(test_name).strip()
if cleaned:
cases.append((cleaned, status))
if cases:
return cases
# Fallback for non-verbose `go test` output where individual test names are absent.
package_cases: list[tuple[str, str]] = []
for package_name in re.findall(r"^ok\s+([^\s]+)", text, flags=re.M):
cleaned = str(package_name).strip()
if cleaned:
package_cases.append((f"package::{cleaned}", "passed"))
for package_name in re.findall(r"^FAIL\s+([^\s]+)", text, flags=re.M):
cleaned = str(package_name).strip()
if cleaned:
package_cases.append((f"package::{cleaned}", "failed"))
if package_cases:
deduped = list(dict.fromkeys(package_cases))
return deduped
for match in re.finditer(r"^---\s+(PASS|FAIL|SKIP):\s+(\S+)", text, flags=re.M):
raw_status, test_name = match.groups()
status = {"PASS": "passed", "FAIL": "failed", "SKIP": "skipped"}.get(raw_status, "error")
cases.append((test_name.strip(), status))
return cases
@ -307,23 +224,17 @@ def _sonarqube_check_status(build_dir: Path) -> str:
def _supply_chain_check_status(build_dir: Path) -> str:
required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
if not report:
return "failed" if required else "not_applicable"
return "not_applicable"
compliant = report.get("compliant")
if isinstance(compliant, bool):
return "ok" if compliant else "failed"
status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
for value in status_candidates:
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in QUALITY_SUCCESS_STATES:
return "ok"
if normalized in {"n/a", "na", "not_applicable", "not-applicable", "skipped", "skip"}:
return "failed" if required else "not_applicable"
return "failed" if required else "not_applicable"
return "failed" if required else "not_applicable"
return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
return "failed"
def parse_args(argv: list[str]) -> argparse.Namespace:
@ -367,19 +278,10 @@ def main(argv: list[str] | None = None) -> int:
args = parse_args(argv or sys.argv[1:])
repo_root = Path(__file__).resolve().parents[1]
build_dir = repo_root / "build"
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
current_ok = 1 if gate_rc == 0 else 0
current_failed = 0 if gate_rc == 0 else 1
branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
if branch.startswith("origin/"):
branch = branch[len("origin/") :]
build_number = os.getenv("BUILD_NUMBER", "")
jenkins_job = os.getenv("JOB_NAME", "ananke")
remote_ok = 0
remote_failed = 0
remote_error = ""
already_recorded = False
try:
remote_ok = int(
_fetch_existing_counter(
@ -397,39 +299,21 @@ def main(argv: list[str] | None = None) -> int:
args.timeout_seconds,
)
)
already_recorded = bool(build_number) and _series_exists(
args.pushgateway_url,
"platform_quality_gate_build_info",
{
"job": args.job_name,
"suite": args.suite,
"branch": branch or "unknown",
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
},
args.timeout_seconds,
)
except Exception as exc:
remote_error = str(exc)
resolved_ok = remote_ok
resolved_failed = remote_failed
if remote_error:
resolved_ok = args.local_ok
resolved_failed = args.local_failed
elif not already_recorded:
resolved_ok += current_ok
resolved_failed += current_failed
resolved_ok = max(args.local_ok, remote_ok)
resolved_failed = max(args.local_failed, remote_failed)
coverage_percent = _read_coverage_percent(args.coverage_percent_file)
source_files_total = _count_source_files(repo_root)
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
tests = _parse_go_test_counts(quality_output)
test_cases = _parse_go_test_cases(quality_output)
test_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
tests = _parse_go_test_counts(test_output)
test_cases = _parse_go_test_cases(test_output)
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
gate_failed = gate_rc != 0
checks = {
"tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
"tests": "failed" if gate_failed or tests["failed"] > 0 else "ok",
"coverage": "ok" if coverage_percent >= 95.0 else "failed",
"loc": "ok" if source_lines_over_500 == 0 else "failed",
"docs_naming": docs_status,
@ -448,11 +332,7 @@ def main(argv: list[str] | None = None) -> int:
tests_skipped=tests["skipped"],
test_cases=test_cases,
coverage_percent=coverage_percent,
source_files_total=source_files_total,
source_lines_over_500=source_lines_over_500,
branch=branch,
build_number=build_number,
jenkins_job=jenkins_job,
checks=checks,
)
@ -465,8 +345,7 @@ def main(argv: list[str] | None = None) -> int:
summary = (
f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_files_total={source_files_total} "
f"source_lines_over_500={source_lines_over_500}"
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_lines_over_500={source_lines_over_500}"
)
if remote_error:
summary += f" remote_read_error={remote_error}"

View File

@ -3,11 +3,8 @@
from __future__ import annotations
import http.server
from pathlib import Path
import socketserver
import tempfile
import threading
from unittest import mock
import unittest
import publish_quality_metrics as publisher
@ -61,19 +58,7 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.server.server_close()
self.thread.join(timeout=5)
def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
tmp_dir = tempfile.TemporaryDirectory()
self.addCleanup(tmp_dir.cleanup)
rc_path = Path(tmp_dir.name) / "quality-gate.rc"
rc_path.write_text(f"{status}\n", encoding="utf-8")
return {
"ANANKE_QUALITY_EXIT_CODE_PATH": str(rc_path),
"ANANKE_QUALITY_COVERAGE_PERCENT_FILE": str(Path(tmp_dir.name) / "coverage.txt"),
"ANANKE_QUALITY_OUTPUT_FILE": str(Path(tmp_dir.name) / "quality-gate.out"),
"ANANKE_QUALITY_DOCS_STATUS_PATH": str(Path(tmp_dir.name) / "docs-naming.status"),
}
def test_publish_adds_current_run_to_remote_counters(self) -> None:
def test_publish_uses_remote_high_water_mark(self) -> None:
_GatewayHandler.metrics_text = "\n".join(
[
'# TYPE platform_quality_gate_runs_total counter',
@ -82,93 +67,51 @@ class PublishQualityMetricsTest(unittest.TestCase):
]
)
with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"5",
"--local-failed",
"2",
]
)
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"5",
"--local-failed",
"2",
]
)
self.assertEqual(exit_code, 0)
self.assertEqual(len(_GatewayHandler.posts), 1)
path, body = _GatewayHandler.posts[0]
self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 8', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 2', body)
self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
def test_publish_does_not_double_count_same_build(self) -> None:
_GatewayHandler.metrics_text = "\n".join(
[
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="ok"} 7',
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="failed"} 1',
'platform_quality_gate_build_info{job="platform-quality-ci",suite="ananke",branch="main",build_number="78",jenkins_job="ananke"} 1',
]
)
with mock.patch.dict(
"os.environ",
{
**self._env_for_gate_status(0),
"BRANCH_NAME": "main",
"BUILD_NUMBER": "78",
"JOB_NAME": "ananke",
},
):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"1",
"--local-failed",
"0",
]
)
self.assertEqual(exit_code, 0)
_, body = _GatewayHandler.posts[0]
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
_GatewayHandler.fail_metrics_read = True
with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--local-ok",
"11",
"--local-failed",
"3",
]
)
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--local-ok",
"11",
"--local-failed",
"3",
]
)
self.assertEqual(exit_code, 0)
self.assertEqual(len(_GatewayHandler.posts), 1)
@ -176,7 +119,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)

View File

@ -158,9 +158,15 @@ mkdir -p "${BUILD_DIR}"
rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"
echo "[quality] dependency download"
echo "[quality] unit tests + workspace coverage profile"
export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
run_with_retry 4 go mod download
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
if [[ -z "${coverage_percent}" ]]; then
coverage_percent="0"
fi
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
echo "[quality] hygiene: doc contracts"
cd testing
@ -183,14 +189,6 @@ echo "[quality] lint"
echo "[quality] installer template contracts"
./scripts/verify_install_templates.sh
echo "[quality] unit tests + workspace coverage profile"
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
if [[ -z "${coverage_percent}" ]]; then
coverage_percent="0"
fi
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
echo "[quality] per-file coverage gate (95%)"
cd testing
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v

View File

@ -17,12 +17,6 @@ import (
const maxGoFileLOC = 500
var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
var genericFileNameTokens = map[string]struct{}{
"chunk": {},
"part": {},
"piece": {},
"split": {},
}
func repoRoot(tb testing.TB) string {
tb.Helper()
@ -67,16 +61,13 @@ func collectGoFiles(tb testing.TB, roots ...string) []string {
func TestHygieneContracts(t *testing.T) {
root := repoRoot(t)
files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
namingFiles := append([]string{}, files...)
namingFiles = append(namingFiles, collectGoFiles(t, filepath.Join(root, "testing"))...)
sort.Strings(files)
sort.Strings(namingFiles)
t.Run("doc_contract", func(t *testing.T) {
checkDocContracts(t, files)
})
t.Run("naming_contract", func(t *testing.T) {
checkNamingContracts(t, namingFiles)
checkNamingContracts(t, files)
})
t.Run("loc_limit", func(t *testing.T) {
checkFileLOCLimits(t, files)
@ -130,19 +121,9 @@ func checkNamingContracts(t *testing.T, files []string) {
if !goFileNamePattern.MatchString(base) {
t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
}
for _, token := range filenameTokens(base) {
if _, ok := genericFileNameTokens[token]; ok {
t.Errorf("%s: filename %q uses generic split-file token %q", file, base, token)
}
}
}
}
func filenameTokens(name string) []string {
trimmed := strings.TrimSuffix(strings.TrimSuffix(name, ".go"), "_test")
return strings.Split(trimmed, "_")
}
// checkFileLOCLimits runs one orchestration or CLI step.
// Signature: checkFileLOCLimits(t *testing.T, files []string).
// Why: A strict LOC cap forces focused files and keeps refactors manageable.

View File

@ -13,8 +13,6 @@ cmd/ananke/power_safety_test.go
cmd/ananke/test_helpers_test.go
internal/cluster/orchestrator_inventory_test.go
internal/cluster/orchestrator_report_test.go
internal/cluster/orchestrator_autorepair_test.go
internal/cluster/orchestrator_autorepair_cleanup_test.go
internal/cluster/orchestrator_test.go
internal/cluster/orchestrator_unit_additional_test.go
internal/cluster/orchestrator_vault_test.go
@ -23,7 +21,6 @@ internal/config/load_additional_test.go
internal/config/validate_matrix_test.go
internal/service/daemon_additional_test.go
internal/service/daemon_coverage_closeout_test.go
internal/service/daemon_poststart_autorepair_test.go
internal/service/daemon_quality_branches_test.go
internal/service/daemon_test.go
internal/sshutil/repair_test.go

View File

@ -363,3 +363,4 @@ func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
}
})
}

View File

@ -79,29 +79,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})
t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
return `{"items":[
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
})
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -168,42 +145,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})
t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
cfg.Startup.StuckPodGraceSeconds = 1
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
failures, err := orch.TestHookStartupFailurePods(context.Background())
if err != nil {
t.Fatalf("startup failure pod query: %v", err)
}
if len(failures) != 0 {
t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
}
})
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// newHookOrchestratorWithRunnerMode runs one orchestration or CLI step.
// Signature: newHookOrchestratorWithRunnerMode(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: these scenarios need dry-run and non-dry-run variants while keeping
// newHookOrchestratorAdvanced runs one orchestration or CLI step.
// Signature: newHookOrchestratorAdvanced(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: this part10 matrix needs dry-run and non-dry-run variants while keeping
// command dispatch deterministic from the top-level testing module.
func newHookOrchestratorWithRunnerMode(
func newHookOrchestratorAdvanced(
t *testing.T,
cfg config.Config,
dryRun bool,
@ -49,11 +49,11 @@ func newHookOrchestratorWithRunnerMode(
return orch, recorder
}
// TestHookVaultLifecycleBranchMatrix runs one orchestration or CLI step.
// Signature: TestHookVaultLifecycleBranchMatrix(t *testing.T).
// TestHookGapMatrixPart10LowFileClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart10LowFileClosure(t *testing.T).
// Why: closes remaining branch gaps on low-coverage orchestrator files using
// targeted hook-level scenarios instead of brittle full-drill reruns.
func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
func TestHookGapMatrixPart10LowFileClosure(t *testing.T) {
t.Run("critical-vault-low-branches", func(t *testing.T) {
t.Run("vault-sealed-parse-error", func(t *testing.T) {
cfg := lifecycleConfig(t)
@ -64,7 +64,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected vault status parse error branch, got %v", err)
}
@ -81,7 +81,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
t.Fatalf("expected empty decoded unseal key branch, got %v", err)
}
@ -90,7 +90,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = t.TempDir()
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
}
@ -105,7 +105,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchNoValue, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runNoValue, runNoValue)
orchNoValue, _ := newHookOrchestratorAdvanced(t, cfg, false, runNoValue, runNoValue)
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
if err != nil || ready {
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
@ -124,7 +124,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchEnsureErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runEnsureErr, runEnsureErr)
orchEnsureErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runEnsureErr, runEnsureErr)
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
}
@ -139,7 +139,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchPhase, _ := newHookOrchestratorWithRunnerMode(t, cfgPhase, false, runPhase, runPhase)
orchPhase, _ := newHookOrchestratorAdvanced(t, cfgPhase, false, runPhase, runPhase)
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
t.Fatalf("expected pod phase guard branch, got %v", err)
}
@ -170,7 +170,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return runFollowup(ctx, timeout, name, args...)
}
orchFollowup, _ := newHookOrchestratorWithRunnerMode(t, cfgFollowup, false, runFollowup, runSensitive)
orchFollowup, _ := newHookOrchestratorAdvanced(t, cfgFollowup, false, runFollowup, runSensitive)
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
t.Fatalf("expected follow-up sealed status error branch, got %v", err)
}
@ -204,7 +204,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
err := orch.TestHookDrainWorkers(context.Background(), workers)
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
t.Fatalf("expected drain aggregation branch, got %v", err)
@ -217,7 +217,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
cfg.SSHManagedNodes = []string{"titan-db"}
rec := &commandRecorder{}
base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
if !rec.contains("atlas@titan-db echo ok") {
t.Fatalf("expected managed ssh execution branch")
@ -233,7 +233,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
t.Fatalf("expected empty snapshot-list branch, got %v", err)
}
@ -250,7 +250,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchWorkers, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runWorkers, runWorkers)
orchWorkers, _ := newHookOrchestratorAdvanced(t, cfg, false, runWorkers, runWorkers)
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
if err != nil || len(workers) == 0 {
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
@ -273,7 +273,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchWrite, _ := newHookOrchestratorWithRunnerMode(t, cfgWrite, false, runWrite, runWrite)
orchWrite, _ := newHookOrchestratorAdvanced(t, cfgWrite, false, runWrite, runWrite)
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
}
@ -294,7 +294,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchReady, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runReady, runReady)
orchReady, _ := newHookOrchestratorAdvanced(t, cfg, false, runReady, runReady)
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
if err != nil || ready || !strings.Contains(detail, "ready=false") {
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
@ -319,7 +319,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
@ -336,7 +336,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
rec := &commandRecorder{}
base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
}
@ -347,7 +347,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
cfgDry := lifecycleConfig(t)
orchDry, _ := newHookOrchestratorWithRunnerMode(t, cfgDry, true, nil, nil)
orchDry, _ := newHookOrchestratorAdvanced(t, cfgDry, true, nil, nil)
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
}
@ -365,7 +365,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchFail, _ := newHookOrchestratorWithRunnerMode(t, cfgFail, false, run, run)
orchFail, _ := newHookOrchestratorAdvanced(t, cfgFail, false, run, run)
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
}
@ -373,7 +373,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
cfg := lifecycleConfig(t)
orchEmpty, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
orchEmpty, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
if err != nil || len(namespaces) != 0 {
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
@ -386,7 +386,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runErr, runErr)
orchErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runErr, runErr)
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
t.Fatalf("expected ingress query error branch, got %v", err)
}
@ -412,7 +412,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
URL: "http://" + listener.Addr().String() + "/health",
AcceptedStatuses: []int{200},
}}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
ready, detail := orch.TestHookServiceChecklistReady(context.Background())
if ready || !strings.Contains(detail, "http://") {
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
@ -435,7 +435,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
@ -456,7 +456,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
@ -478,7 +478,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
t.Fatalf("expected resume flux warning-only branch, got %v", err)
}
@ -505,7 +505,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
@ -532,14 +532,14 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
t.Fatalf("expected workload convergence default-branch success, got %v", err)
}
cfgIgnore := lifecycleConfig(t)
cfgIgnore.Startup.AutoRecycleStuckPods = false
orchIgnoreDry, _ := newHookOrchestratorWithRunnerMode(t, cfgIgnore, true, run, run)
orchIgnoreDry, _ := newHookOrchestratorAdvanced(t, cfgIgnore, true, run, run)
now := time.Now().UTC().Add(-time.Hour)
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
@ -551,7 +551,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchHealErr, _ := newHookOrchestratorWithRunnerMode(t, lifecycleConfig(t), false, runHealErr, runHealErr)
orchHealErr, _ := newHookOrchestratorAdvanced(t, lifecycleConfig(t), false, runHealErr, runHealErr)
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
t.Fatalf("expected critical workload heal query-error branch, got %v", err)
}

View File

@ -20,7 +20,7 @@ import (
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
// Why: lifecycle cleanup scenarios need direct control over runner dry-run and kubeconfig branches.
// Why: part11 needs direct control over runner dry-run and kubeconfig branches.
func newLifecycleMatrixOrchestrator(
t *testing.T,
cfg config.Config,
@ -49,11 +49,11 @@ func newLifecycleMatrixOrchestrator(
return orch
}
// TestHookLifecycleCleanupRemainingClosure runs one orchestration or CLI step.
// Signature: TestHookLifecycleCleanupRemainingClosure(t *testing.T).
// TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
// Why: closes final branch gaps for lifecycle + remaining near-threshold
// orchestrator files so per-file coverage reaches the enforced 95% target.
func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
t.Run("critical-vault-final-closures", func(t *testing.T) {
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
apiVersionCalls++
if apiVersionCalls <= 2 {
if apiVersionCalls == 1 {
return "", errors.New("api down")
}
return "v1.31.0", nil

View File

@ -17,11 +17,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookTimesyncAndStabilityMatrix runs one orchestration or CLI step.
// Signature: TestHookTimesyncAndStabilityMatrix(t *testing.T).
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module.
func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
cases := []struct {
line string
@ -162,11 +162,11 @@ func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
})
}
// TestHookFluxScalingReportMatrix runs one orchestration or CLI step.
// Signature: TestHookFluxScalingReportMatrix(t *testing.T).
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers.
func TestHookFluxScalingReportMatrix(t *testing.T) {
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
t.Run("flux-helper-matrix", func(t *testing.T) {
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
t.Fatalf("expected immutable matcher true for uppercase+job variant")
@ -241,11 +241,11 @@ func TestHookFluxScalingReportMatrix(t *testing.T) {
})
}
// TestHookVaultAndCoordinationMatrix runs one orchestration or CLI step.
// Signature: TestHookVaultAndCoordinationMatrix(t *testing.T).
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests.
func TestHookVaultAndCoordinationMatrix(t *testing.T) {
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = ""
@ -296,11 +296,11 @@ func TestHookVaultAndCoordinationMatrix(t *testing.T) {
})
}
// TestHookWorkloadIgnoreMatrix runs one orchestration or CLI step.
// Signature: TestHookWorkloadIgnoreMatrix(t *testing.T).
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification.
func TestHookWorkloadIgnoreMatrix(t *testing.T) {
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
t.Run("ignored-node-helper-matrix", func(t *testing.T) {
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
t.Fatalf("expected selector-host ignored match")

View File

@ -11,11 +11,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestHookConvergenceAndStabilityMatrix runs one orchestration or CLI step.
// Signature: TestHookConvergenceAndStabilityMatrix(t *testing.T).
// TestHookGapMatrixPart3ConvergenceAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
// Why: raises coverage for startup convergence orchestration and stability gates
// that determine whether startup is considered truly complete.
func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
cfgIngress := lifecycleConfig(t)
cfgIngress.Startup.RequireIngressChecklist = true
@ -108,11 +108,11 @@ func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
})
}
// TestHookLifecycleRestoreShutdownMatrix runs one orchestration or CLI step.
// Signature: TestHookLifecycleRestoreShutdownMatrix(t *testing.T).
// TestHookGapMatrixPart3LifecycleRestoreShutdown runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in
// failure-focused drill tests.
func TestHookLifecycleRestoreShutdownMatrix(t *testing.T) {
func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
cfgDry := lifecycleConfig(t)
dry := newDryRunHookOrchestrator(t, cfgDry, nil)

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookCoordinationAndReachabilityMatrix runs one orchestration or CLI step.
// Signature: TestHookCoordinationAndReachabilityMatrix(t *testing.T).
// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts.
func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Coordination.PeerHosts = []string{"titan-24"}
@ -136,11 +136,11 @@ func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
})
}
// TestHookIngressServiceAndPostStartMatrix runs one orchestration or CLI step.
// Signature: TestHookIngressServiceAndPostStartMatrix(t *testing.T).
// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests.
func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
@ -194,11 +194,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "forbidden-marker",
URL: srv.URL,
Name: "forbidden-marker",
URL: srv.URL,
AcceptedStatuses: []int{200},
BodyNotContains: "marker",
TimeoutSeconds: 2,
BodyNotContains: "marker",
TimeoutSeconds: 2,
})
if ok || !strings.Contains(detail, "forbidden marker") {
t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
@ -233,11 +233,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
})
}
// TestHookReportScalingStorageDrainMatrix runs one orchestration or CLI step.
// Signature: TestHookReportScalingStorageDrainMatrix(t *testing.T).
// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills.
func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
cfg := lifecycleConfig(t)
reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
@ -339,11 +339,11 @@ func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
})
}
// TestHookTimesyncLifecycleAndAccessMatrix runs one orchestration or CLI step.
// Signature: TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T).
// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices.
func TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T) {
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.TimeSyncMode = "quorum"

View File

@ -20,11 +20,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookEndpointHealingCoverageClosure runs one orchestration or CLI step.
// Signature: TestHookEndpointHealingCoverageClosure(t *testing.T).
// TestHookGapMatrixPart5CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart5CoverageClosure(t *testing.T).
// Why: closes branch gaps that still remained after drill-style tests by driving
// low-coverage orchestrator internals through the exported top-level hook surface.
func TestHookEndpointHealingCoverageClosure(t *testing.T) {
func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
t.Run("empty-namespace-service-noop", func(t *testing.T) {
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
@ -491,10 +491,10 @@ func httpStatusHandler(code int, body string) func(http.ResponseWriter, *http.Re
}
}
// TestHookIngressHostMappingRegression runs one orchestration or CLI step.
// Signature: TestHookIngressHostMappingRegression(t *testing.T).
// TestHookGapMatrixPart5IngressHostMappingRegression runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T).
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
func TestHookIngressHostMappingRegression(t *testing.T) {
func TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookVaultPostStartBranchMatrix runs one orchestration or CLI step.
// Signature: TestHookVaultPostStartBranchMatrix(t *testing.T).
// Why: targets the remaining low branch paths after endpoint-healing coverage so per-file coverage
// TestHookGapMatrixPart6CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart6CoverageClosure(t *testing.T).
// Why: targets the remaining low branch paths after part5 so per-file coverage
// can move toward the strict 95% quality gate.
func TestHookVaultPostStartBranchMatrix(t *testing.T) {
func TestHookGapMatrixPart6CoverageClosure(t *testing.T) {
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t)

View File

@ -14,11 +14,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookWorkloadStorageAccessMatrix runs one orchestration or CLI step.
// Signature: TestHookWorkloadStorageAccessMatrix(t *testing.T).
// TestHookGapMatrixPart7CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in convergence, storage, access,
// flux, lifecycle, and sensitive command wrappers.
func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -165,32 +165,6 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
}
})
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
cfg.Startup.NodeSSHAuthWaitSeconds = 1
cfg.Startup.NodeSSHAuthPollSeconds = 1
cfg.Startup.NodeInventoryReachWaitSeconds = 1
cfg.Startup.NodeInventoryReachPollSeconds = 1
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
}
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
}
})
})
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookAccessVaultLifecycleMatrix runs one orchestration or CLI step.
// Signature: TestHookAccessVaultLifecycleMatrix(t *testing.T).
// TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in access, vault, lifecycle,
// ingress/service stability, and timesync/inventory orchestration paths.
func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
@ -331,11 +331,11 @@ func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
})
}
// TestHookLifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookLifecycleStartupAutoRestoreBranch(t *testing.T).
// TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise
// hard to exercise in deterministic top-level tests.
func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
@ -384,7 +384,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lifecycle-auto-restore"})
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
if err != nil {
t.Fatalf("expected startup auto-restore path success, got %v", err)
}
@ -394,7 +394,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
cfgBadMode := lifecycleConfig(t)
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lifecycle", Mode: "unknown-mode"})
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
}

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookAccessCoordinationEndpointsMatrix runs one orchestration or CLI step.
// Signature: TestHookAccessCoordinationEndpointsMatrix(t *testing.T).
// TestHookGapMatrixPart9AccessCoordinationEndpoints runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T).
// Why: closes uncovered statement ranges in access/fluxsource, coordination,
// and critical-endpoint orchestration helpers.
func TestHookAccessCoordinationEndpointsMatrix(t *testing.T) {
func TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T) {
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Shutdown.SSHParallelism = 0

View File

@ -53,48 +53,6 @@ func TestHookIngressServiceMatrix(t *testing.T) {
}
})
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
}
})
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
}
})
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)

View File

@ -4,6 +4,7 @@ import (
"context"
"errors"
"net"
"os"
"strings"
"testing"
"time"
@ -124,25 +125,20 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{}, errors.New("forced reread failure")
})
t.Cleanup(restoreRead)
cfg.Startup.ShutdownCooldownSeconds = 1
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed cooldown intent: %v", err)
}
go func(intentPath string) {
time.Sleep(150 * time.Millisecond)
_ = os.Remove(intentPath)
_ = os.Mkdir(intentPath, 0o755)
}(cfg.State.IntentPath)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
@ -152,30 +148,24 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
cfg.Startup.ShutdownCooldownSeconds = 1
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed cooldown intent: %v", err)
}
go func(intentPath string) {
time.Sleep(150 * time.Millisecond)
_ = state.WriteIntent(intentPath, state.Intent{
State: state.IntentShuttingDown,
Reason: "peer-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC(),
}, nil
})
t.Cleanup(restoreRead)
})
}(cfg.State.IntentPath)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {

View File

@ -1,432 +0,0 @@
package orchestrator
import (
"context"
"errors"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// TestHookSchedulingStormHelpers runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormHelpers(t *testing.T).
// Why: keeps scheduling-storm helper coverage in the split top-level testing module
// required by the repo hygiene contract.
func TestHookSchedulingStormHelpers(t *testing.T) {
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "ollama-rs", "Deployment", "ollama"); !ok || got != "ai/deployment/ollama" {
t.Fatalf("unexpected deployment owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("storage", "StatefulSet", "nextcloud", "", ""); !ok || got != "storage/statefulset/nextcloud" {
t.Fatalf("unexpected statefulset owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "missing", "", ""); ok || got != "" {
t.Fatalf("expected missing replicaset owner lookup to fail, got=%q ok=%v", got, ok)
}
if got := cluster.TestHookEventObservationCount(3, 9); got != 9 {
t.Fatalf("expected series count to win, got %d", got)
}
if got := cluster.TestHookEventObservationCount(0, 0); got != 1 {
t.Fatalf("expected zero-count normalization to 1, got %d", got)
}
now := time.Now().UTC().Round(time.Second)
if got := cluster.TestHookEventLastObservedAt(now, now.Add(-time.Minute), now.Add(-2*time.Minute), now.Add(-3*time.Minute)); !got.Equal(now) {
t.Fatalf("expected series timestamp priority, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)); !got.Equal(now) {
t.Fatalf("expected lastTimestamp fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, now, now.Add(-time.Minute)); !got.Equal(now) {
t.Fatalf("expected eventTime fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, time.Time{}, now); !got.Equal(now) {
t.Fatalf("expected creationTimestamp fallback, got %s", got)
}
}
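
The assertions above pin down the fallback behaviour of the event helpers without showing them: the count prefers the series count and never drops below 1, and the observation time falls back in the order series, lastTimestamp, eventTime, creationTimestamp. One implementation consistent with those assertions is sketched below; the function bodies are inferred from the test expectations, not copied from the cluster package.

package sketch // inferred from the TestHookEventObservationCount / TestHookEventLastObservedAt assertions

import "time"

// eventObservationCount prefers the event-series count, then the legacy count,
// and normalizes zero to 1 so a single observed event still registers.
func eventObservationCount(count, seriesCount int) int {
	if seriesCount > 0 {
		return seriesCount
	}
	if count > 0 {
		return count
	}
	return 1
}

// eventLastObservedAt picks the first usable timestamp in priority order:
// series time, then lastTimestamp, then eventTime, then creationTimestamp.
func eventLastObservedAt(series, last, eventTime, created time.Time) time.Time {
	for _, ts := range []time.Time{series, last, eventTime, created} {
		if !ts.IsZero() {
			return ts
		}
	}
	return time.Time{}
}
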
// TestHookSchedulingStormQuarantine runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormQuarantine(t *testing.T).
// Why: verifies that only non-core workloads generating real scheduling storms
// are auto-quarantined, which prevents event/Kine churn from spiking control-plane CPU.
func TestHookSchedulingStormQuarantine(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 30
cfg.Startup.SchedulingStormWindowSeconds = 180
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault"}
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
scaledOllama := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"vault","name":"vault-0","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"ignored-ns","name":"skip-pod","ownerReferences":[{"kind":"ReplicaSet","name":"skip-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignored-node-rs"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"running-pod","ownerReferences":[{"kind":"ReplicaSet","name":"running-rs"}]},"spec":{},"status":{"phase":"Running"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}},
{"metadata":{"namespace":"ignored-ns","name":"skip-rs","ownerReferences":[{"kind":"Deployment","name":"skip"}]}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-rs","ownerReferences":[{"kind":"Deployment","name":"ignore-me"}]}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-rs","ownerReferences":[{"kind":"Deployment","name":"ignored-node"}]}},
{"metadata":{"namespace":"monitoring","name":"running-rs","ownerReferences":[{"kind":"Deployment","name":"running"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"vault","name":"vault-0"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ignored-ns","name":"skip-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignore-me-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignored-node-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"running-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"stale-pod"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}},
{"kind":"StatefulSet","metadata":{"namespace":"vault","name":"vault"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignored-node"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"running"},"spec":{"replicas":1}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
scaledOllama = true
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm")
defer orch.TestHookFinalizeStartupReport(nil)
if err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background()); err != nil {
t.Fatalf("quarantine scheduling storm workloads: %v", err)
}
if !scaledOllama {
t.Fatalf("expected ollama deployment to be scaled to zero")
}
progress := readStartupProgress(t, orch)
if !strings.Contains(progress, "ollama") {
t.Fatalf("expected startup progress to mention ollama quarantine, payload=%s", progress)
}
if strings.Contains(progress, "vault") || strings.Contains(progress, "ignore-me") || strings.Contains(progress, "ignored-node") {
t.Fatalf("expected only the non-core eligible workload to be quarantined, payload=%s", progress)
}
}
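
The fixtures above imply the shape of the storm scan: recent Warning/FailedScheduling events are tallied per Pending pod, each pod is mapped to its owning Deployment or StatefulSet, core and explicitly ignored targets are filtered out, and whatever survives is scaled to zero. A condensed sketch of that selection step follows; the stormEvent/stormTarget types, the resolveOwner callback, and the single-pass filtering are invented here purely for illustration.

package sketch // condensed, assumed shape of the scheduling-storm quarantine selection

import "time"

type stormEvent struct {
	Namespace, Pod string
	Type, Reason   string
	Count          int
	ObservedAt     time.Time
}

type stormTarget struct {
	Namespace, Kind, Name string // e.g. "ai", "deployment", "ollama"
}

// selectQuarantineTargets keeps only workloads whose pods accumulated enough
// FailedScheduling warnings inside the window and which are neither core (required
// namespaces) nor explicitly ignored. resolveOwner is expected to succeed only for
// Pending pods that map to a scalable Deployment or StatefulSet.
func selectQuarantineTargets(
	events []stormEvent,
	threshold int,
	window time.Duration,
	requiredNamespaces, ignoredNamespaces, ignoredWorkloads map[string]bool,
	resolveOwner func(namespace, pod string) (stormTarget, bool),
) []stormTarget {
	now := time.Now().UTC()
	var targets []stormTarget
	seen := map[string]bool{}
	for _, ev := range events {
		if ev.Type != "Warning" || ev.Reason != "FailedScheduling" {
			continue
		}
		if ev.Count < threshold || now.Sub(ev.ObservedAt) > window {
			continue
		}
		if ignoredNamespaces[ev.Namespace] || requiredNamespaces[ev.Namespace] {
			continue
		}
		owner, ok := resolveOwner(ev.Namespace, ev.Pod)
		if !ok {
			continue
		}
		key := owner.Namespace + "/" + owner.Kind + "/" + owner.Name
		if ignoredWorkloads[key] || seen[key] {
			continue
		}
		seen[key] = true
		targets = append(targets, owner)
	}
	return targets
}
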
// TestHookSchedulingStormTriggerGuards runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerGuards(t *testing.T).
// Why: covers dry-run/disabled/rate-limit guards so the scheduler-storm auto-heal
// only activates when the cluster is actually suffering this exact failure mode.
func TestHookSchedulingStormTriggerGuards(t *testing.T) {
cfgDisabled := lifecycleConfig(t)
orchDisabled, _ := newHookOrchestrator(t, cfgDisabled, nil, nil)
lastAttempt := time.Time{}
orchDisabled.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected disabled scheduling-storm trigger to be skipped")
}
cfgDry := lifecycleConfig(t)
cfgDry.Startup.AutoQuarantineSchedulingStorms = true
orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
orchDry.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected dry-run scheduling-storm trigger to be skipped")
}
cfgRate := lifecycleConfig(t)
cfgRate.Startup.AutoQuarantineSchedulingStorms = true
cfgRate.Startup.SchedulingStormEventThreshold = 5
cfgRate.Startup.SchedulingStormWindowSeconds = 60
recorder := &commandRecorder{}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
recorder.record(name, args)
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[]}`, nil
default:
return lifecycleDispatcher(recorder)(ctx, timeout, name, args...)
}
}
orchRate, _ := newHookOrchestrator(t, cfgRate, run, run)
lastAttempt = time.Now()
orchRate.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if recorder.contains("get pods -A -o json") {
t.Fatalf("expected rate-limited scheduling-storm trigger to skip kubectl scans")
}
}
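
The guard test above checks three skip conditions without showing the trigger itself. One consistent reading: the auto-heal does nothing when the feature is disabled, when the orchestrator is in dry-run, or when a previous attempt ran too recently, and only a real run records a new attempt time. The stormTrigger type, the method shape, and the retry interval constant below are assumptions; the real rate-limit value is not visible in this diff.

package sketch // assumed shape of the scheduling-storm trigger guards

import (
	"context"
	"time"
)

const stormRetryInterval = 5 * time.Minute // illustrative rate limit only

type stormTrigger struct {
	enabled bool
	dryRun  bool
	scan    func(ctx context.Context) error
}

// maybeQuarantine mirrors the guard order exercised by the test: disabled and dry-run
// skip silently, a recent attempt is rate-limited, and only a real run updates lastAttempt.
func (s stormTrigger) maybeQuarantine(ctx context.Context, lastAttempt *time.Time) {
	if !s.enabled || s.dryRun {
		return
	}
	if !lastAttempt.IsZero() && time.Since(*lastAttempt) < stormRetryInterval {
		return
	}
	*lastAttempt = time.Now()
	_ = s.scan(ctx) // scan errors surface through startup progress, not as a return value
}
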
// TestHookSchedulingStormTriggerAndNoOpBranches runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T).
// Why: raises scheduling-storm branch coverage on the success/no-op paths so the
// auto-heal only acts on genuine event storms and stays quiet otherwise.
func TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 0
cfg.Startup.SchedulingStormWindowSeconds = 0
scanRan := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
scanRan = true
return `{"items":[
{"metadata":{"namespace":"","name":"missing"}},
{"metadata":{"namespace":"monitoring","name":"no-owner"},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"done","ownerReferences":[{"kind":"ReplicaSet","name":"done-rs"}]},"spec":{},"status":{"phase":"Running"}},
{"metadata":{"namespace":"monitoring","name":"zero-replicas","ownerReferences":[{"kind":"ReplicaSet","name":"zero-rs"}]},"spec":{},"status":{"phase":"Pending"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"","name":"bad-rs"}},
{"metadata":{"namespace":"monitoring","name":"done-rs","ownerReferences":[{"kind":"","name":"ignored"}]}},
{"metadata":{"namespace":"monitoring","name":"zero-rs","ownerReferences":[{"kind":"Deployment","name":"zero"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"normal"},"type":"Normal","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"wrong-reason"},"type":"Warning","reason":"SomeOtherReason","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Service","namespace":"monitoring","name":"wrong-kind"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"old"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"low-count"},"type":"Warning","reason":"FailedScheduling","count":1},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"missing-pod"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"done"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"no-owner"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"zero-replicas"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"","metadata":{"namespace":"monitoring","name":"blank-kind"}},
{"kind":"Job","metadata":{"namespace":"monitoring","name":"unsupported"}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"zero"},"spec":{"replicas":0}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm-noop")
defer orch.TestHookFinalizeStartupReport(nil)
lastAttempt := time.Time{}
orch.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if lastAttempt.IsZero() {
t.Fatalf("expected successful scheduling-storm trigger to update lastAttempt")
}
if !scanRan {
t.Fatalf("expected scheduling-storm scan to execute")
}
progress := readStartupProgress(t, orch)
if strings.Contains(progress, "quarantined scheduling storm workload") {
t.Fatalf("expected no-op scheduling-storm scan to avoid auto-heal output, payload=%s", progress)
}
}
// TestHookSchedulingStormErrorMatrix runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormErrorMatrix(t *testing.T).
// Why: covers malformed/error response branches in the scheduling-storm scan so
// Ananke can surface precise diagnostics when the API itself is part of the problem.
func TestHookSchedulingStormErrorMatrix(t *testing.T) {
cases := []struct {
name string
run func(context.Context, time.Duration, string, ...string) (string, error)
wantErr string
}{
{
name: "pods-query-error",
run: func(_ context.Context, _ time.Duration, name string, _ ...string) (string, error) {
if name == "kubectl" {
return "", errors.New("pods boom")
}
return "", nil
},
wantErr: "query pods for scheduling storm scan",
},
{
name: "pods-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get pods -A -o json") {
return "{", nil
}
return `{"items":[]}`, nil
},
wantErr: "decode pods for scheduling storm scan",
},
{
name: "replicasets-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "", errors.New("replicasets boom")
default:
return "", nil
}
},
wantErr: "query replicasets for scheduling storm scan",
},
{
name: "replicasets-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode replicasets for scheduling storm scan",
},
{
name: "events-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "", errors.New("events boom")
default:
return "", nil
}
},
wantErr: "query events for scheduling storm scan",
},
{
name: "events-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode events for scheduling storm scan",
},
{
name: "workloads-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "", errors.New("workloads boom")
default:
return "", nil
}
},
wantErr: "query workloads for scheduling storm scan",
},
{
name: "workloads-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "{", nil
default:
return "", nil
}
},
wantErr: "decode workloads for scheduling storm scan",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
orch, _ := newHookOrchestrator(t, cfg, tc.run, tc.run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
t.Fatalf("expected error containing %q, got %v", tc.wantErr, err)
}
})
}
}
// TestHookSchedulingStormScaleError runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormScaleError(t *testing.T).
// Why: covers the final error path where Ananke detects a real storm but cannot
// scale the offending workload down.
func TestHookSchedulingStormScaleError(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 5
cfg.Startup.SchedulingStormWindowSeconds = 60
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45}]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}}]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
return "", errors.New("scale denied")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), "scale scheduling storm workload ai/deployment/ollama to 0") {
t.Fatalf("expected scale error, got %v", err)
}
}

View File

@ -1,222 +0,0 @@
package orchestrator
import (
"context"
"errors"
"os"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// readStartupProgress runs one orchestration or CLI step.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
t.Helper()
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
if err != nil {
t.Fatalf("read startup progress: %v", err)
}
return string(payload)
}
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
t.Run("startup-scope-helpers", func(t *testing.T) {
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
t.Fatalf("expected passthrough node list, got %v", got)
}
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
t.Fatalf("unexpected filtered node list: %v", got)
}
if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
t.Fatalf("expected trimmed node membership match")
}
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
t.Fatalf("expected blank node probe to be ignored")
}
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
}
cfgScoped := lifecycleConfig(t)
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
t.Fatalf("expected control plane to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected inventory-scoped node to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
t.Fatalf("expected ssh-scoped node to remain strict")
}
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
t.Fatalf("expected non-core worker to stop being strict")
}
flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
if _, ok := flux["flux-system/core"]; !ok {
t.Fatalf("expected core flux kustomization in required set: %v", flux)
}
if _, ok := flux["flux-system/gitea"]; !ok {
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
}
namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
if _, ok := namespaces["vault"]; !ok {
t.Fatalf("expected vault namespace in required set: %v", namespaces)
}
if _, ok := namespaces["monitoring"]; !ok {
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
}
})
t.Run("startup-vault-helpers", func(t *testing.T) {
t.Run("early-vault-unseal-paths", func(t *testing.T) {
cfgAPI := lifecycleConfig(t)
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
return "", errors.New("api down")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
orchAPI.TestHookBeginStartupReport("startup-vault")
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
}
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "", errors.New("phase probe failed")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Pending", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
t.Fatalf("expected successful early vault check, payload=%s", payload)
}
})
t.Run("startup-vault-gate-paths", func(t *testing.T) {
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "", errors.New("phase probe failed")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
t.Fatalf("expected startup vault gate error, got %v", err)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "Pending", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
}
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected successful startup vault gate, got %v", err)
}
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
}
})
})
}
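
The vault helper cases above encode an ordering that is easy to miss in the fixtures: the early unseal is skipped entirely when the API server does not answer, deferred (but not fatal) while vault-0 is not Running or its phase cannot be probed, and only then consults vault status. A compact sketch of that decision order follows; the vaultProbe struct, its callbacks, and the report strings are invented for illustration and are not the orchestrator's real API.

package sketch // assumed decision order for the early vault unseal check

import "context"

type vaultProbe struct {
	apiReachable func(ctx context.Context) bool
	podPhase     func(ctx context.Context) (string, error)
	sealed       func(ctx context.Context) (bool, error)
	unseal       func(ctx context.Context) error
	report       func(check, detail string)
}

// maybeEarlyUnseal mirrors the branches the tests drive: no API means no report at all,
// a phase error or non-Running pod defers the check, and otherwise the vault is unsealed
// if needed and the check is reported as passed.
func (p vaultProbe) maybeEarlyUnseal(ctx context.Context) {
	if !p.apiReachable(ctx) {
		return // skipped silently; the test asserts no vault-unseal-early entry appears
	}
	phase, err := p.podPhase(ctx)
	if err != nil || phase != "Running" {
		p.report("vault-unseal-early", "deferred early vault unseal")
		return
	}
	sealed, err := p.sealed(ctx)
	if err != nil {
		p.report("vault-unseal-early", "deferred early vault unseal")
		return
	}
	if sealed {
		_ = p.unseal(ctx)
	}
	p.report("vault-unseal-early", "passed")
}
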

View File

@ -24,36 +24,12 @@ func TestStateTestHookOverrideSetters(t *testing.T) {
}
restoreWriteNil()
restoreReadNil := state.TestHookSetReadIntentOverride(nil)
readAfterNil, err := state.ReadIntent(intentPath)
if err != nil || readAfterNil.State != state.IntentNormal {
t.Fatalf("expected default read intent path after nil override, got %v / %v", readAfterNil, err)
}
restoreReadNil()
readOverrideCalled := false
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
readOverrideCalled = true
return state.Intent{}, errors.New("forced read override")
})
_, err = state.ReadIntent(intentPath)
if err == nil || !strings.Contains(err.Error(), "forced read override") {
t.Fatalf("expected forced read override error, got %v", err)
}
if !readOverrideCalled {
t.Fatalf("expected read override to be invoked")
}
restoreRead()
if _, err := state.TestHookReadIntentDefault(intentPath); err != nil {
t.Fatalf("expected explicit default read helper to succeed, got %v", err)
}
writeOverrideCalled := false
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
writeOverrideCalled = true
return errors.New("forced write override")
})
err = state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
if err == nil || !strings.Contains(err.Error(), "forced write override") {
t.Fatalf("expected forced write override error, got %v", err)
}