Compare commits
No commits in common. "main" and "codex/ananke-gate-platform-metrics" have entirely different histories.
.gitignore (2 changed lines, vendored)
@@ -1,6 +1,4 @@
/bin/
/build/
/dist/
internal/state/.corrupt-*
*.log
*.tmp
Jenkinsfile (201 changed lines, vendored)

@@ -1,59 +1,25 @@
pipeline {
  agent {
    kubernetes {
      label 'ananke-quality'
      defaultContainer 'go-tester'
      yaml """
apiVersion: v1
kind: Pod
spec:
  nodeSelector:
    hardware: rpi5
    kubernetes.io/arch: arm64
    node-role.kubernetes.io/worker: "true"
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: kubernetes.io/hostname
                operator: NotIn
                values:
                  - titan-06
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          preference:
            matchExpressions:
              - key: kubernetes.io/hostname
                operator: NotIn
                values:
                  - titan-13
                  - titan-15
                  - titan-17
                  - titan-19
  topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
      labelSelector:
        matchLabels:
          jenkins/jenkins-jenkins-agent: "true"
  containers:
    - name: go-tester
      image: registry.bstein.dev/bstein/golang:1.25-bookworm
      image: golang:1.25-bookworm
      command: ["cat"]
      tty: true
      volumeMounts:
        - name: workspace-volume
          mountPath: /home/jenkins/agent
    - name: publisher
      image: registry.bstein.dev/bstein/python:3.12-slim
      command: ["cat"]
      tty: true
      volumeMounts:
        - name: workspace-volume
          mountPath: /home/jenkins/agent
    - name: quality-tools
      image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
      image: python:3.12-slim
      command: ["cat"]
      tty: true
      volumeMounts:
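Editor's note: the nodeSelector above pins the agent pod to arm64 Raspberry Pi 5 workers, and the affinity rules steer it away from specific titan hosts. A quick sanity check of which nodes currently satisfy those labels (a sketch using only the label keys from the spec above):

# Nodes matching every label in the pod's nodeSelector.
kubectl get nodes -l hardware=rpi5,kubernetes.io/arch=arm64,node-role.kubernetes.io/worker=true

# Confirm the hard anti-affinity target is still labeled as expected.
kubectl get node titan-06 --show-labels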
@@ -69,13 +35,7 @@ spec:
  environment {
    SUITE_NAME = 'ananke'
    PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
    SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
    SONARQUBE_PROJECT_KEY = 'ananke'
    SONARQUBE_TOKEN = credentials('sonarqube-token')
    QUALITY_GATE_SONARQUBE_ENFORCE = '1'
    QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
    QUALITY_GATE_IRONBANK_ENFORCE = '1'
    QUALITY_GATE_IRONBANK_REQUIRED = '0'
    QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
  }
@@ -97,27 +57,6 @@ spec:

    stage('Collect SonarQube evidence') {
      steps {
        container('quality-tools') {
          sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
  "-Dsonar.host.url=${SONARQUBE_HOST_URL}"
  "-Dsonar.login=${SONARQUBE_TOKEN}"
  "-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
  "-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
  "-Dsonar.sources=."
  "-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
  "-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage.out ] && args+=("-Dsonar.go.coverage.reportPaths=build/coverage.out")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
        }
        container('publisher') {
          sh '''
set -eu
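Editor's note: the publisher step is truncated in this view, but the report path it has to produce (build/sonarqube-quality-gate.json, from QUALITY_GATE_SONARQUBE_REPORT above) can be obtained from SonarQube's web API. A minimal sketch, assuming the token and project key from the environment block; this is not the pipeline's actual script:

# Hypothetical manual equivalent of the evidence-collection step:
# fetch the server-side quality-gate verdict and store it for the later gate check.
curl -sf -u "${SONARQUBE_TOKEN}:" \
  "${SONARQUBE_HOST_URL}/api/qualitygates/project_status?projectKey=${SONARQUBE_PROJECT_KEY}" \
  > build/sonarqube-quality-gate.json
# The verification stage later accepts a top-level "status", "projectStatus.status",
# or "qualityGate.status" field from this file.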
@@ -156,34 +95,6 @@ PY

    stage('Collect Supply Chain evidence') {
      steps {
        container('quality-tools') {
          sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
  cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
  exit 0
fi
critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
status=ok
compliant=true
if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
  status=failed
  compliant=false
fi
jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
  '{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
'''
        }
        container('publisher') {
          sh '''
set -eu
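Editor's note: the jq invocation above emits a flat JSON verdict; spot-checking it locally looks roughly like this (field names come straight from the jq filter above):

# Inspect the supply-chain evidence the gate will read later.
jq '.compliant, .critical_vulnerabilities, .secrets, .high_or_critical_misconfigurations' build/ironbank-compliance.json
# HIGH vulnerabilities are counted but only observed (high_vulnerability_policy: "observe");
# criticals, secrets, and failed HIGH/CRITICAL misconfigurations flip "compliant" to false.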
@@ -241,25 +152,13 @@ PY
failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
[ -n "${ok_runs}" ] || ok_runs=0
[ -n "${failed_runs}" ] || failed_runs=0
coverage_percent="$(python3 - <<'PY'
import re
from pathlib import Path

log_path = Path("build/quality-gate.out")
text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
print(values[-1] if values else 0.0)
PY
)"
printf '%s\n' "${coverage_percent}" > build/coverage-percent.txt
python3 scripts/publish_quality_metrics.py \
  --pushgateway-url "${PUSHGATEWAY_URL}" \
  --job-name platform-quality-ci \
  --suite "${SUITE_NAME}" \
  --trigger jenkins \
  --local-ok "${ok_runs}" \
  --local-failed "${failed_runs}" \
  --coverage-percent-file build/coverage-percent.txt
  --local-failed "${failed_runs}"
'''
        }
      }
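Editor's note: scripts/publish_quality_metrics.py is not shown in this diff; as a rough orientation, a raw push of the same counters to the Pushgateway would look like the sketch below. The metric names and label grouping here are illustrative only, not taken from the script.

# Hypothetical manual push; the real script defines its own metric names and labels.
cat <<METRICS | curl -sf --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/platform-quality-ci/suite/${SUITE_NAME}"
quality_suite_ok_runs ${ok_runs}
quality_suite_failed_runs ${failed_runs}
quality_suite_coverage_percent ${coverage_percent}
METRICS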
@@ -270,95 +169,7 @@ PY
        container('publisher') {
          sh '''
set -eu
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
  echo "quality gate failed with rc=${gate_rc}" >&2
  fail=1
fi

enabled() {
  case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
    1|true|yes|on) return 0 ;;
    *) return 1 ;;
  esac
}

if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
  sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path

path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
    print("missing")
    raise SystemExit(0)
try:
    payload = json.loads(path.read_text(encoding="utf-8"))
except Exception:  # noqa: BLE001
    print("error")
    raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
  case "${sonar_status}" in
    ok|pass|passed|success) ;;
    *)
      echo "sonarqube gate failed: ${sonar_status}" >&2
      fail=1
      ;;
  esac
fi

ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
  ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
  supply_status="$(python3 - <<'PY'
import json
from pathlib import Path

path = Path("build/ironbank-compliance.json")
if not path.exists():
    print("missing")
    raise SystemExit(0)
try:
    payload = json.loads(path.read_text(encoding="utf-8"))
except Exception:  # noqa: BLE001
    print("error")
    raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
    print("ok")
elif compliant is False:
    print("failed")
else:
    status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
    print(status or "missing")
PY
)"
  case "${supply_status}" in
    ok|pass|passed|success|compliant) ;;
    not_applicable|na|n/a)
      if enabled "${ironbank_required}"; then
        echo "supply chain gate required but status=${supply_status}" >&2
        fail=1
      fi
      ;;
    *)
      if enabled "${ironbank_required}"; then
        echo "supply chain gate failed: ${supply_status}" >&2
        fail=1
      else
        echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
      fi
      ;;
  esac
fi

exit "${fail}"
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
'''
        }
      }
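Editor's note: because enabled() treats 1/true/yes/on (case-insensitively) as on, the gates can be relaxed per run without editing the pipeline. A sketch of a local rehearsal, assuming the evidence files already exist under build/:

# Enforce the SonarQube gate but keep the supply-chain gate advisory for this run.
export QUALITY_GATE_SONARQUBE_ENFORCE=1
export QUALITY_GATE_IRONBANK_ENFORCE=1
export QUALITY_GATE_IRONBANK_REQUIRED=0   # becomes required automatically when PUBLISH_IMAGES=true
export PUBLISH_IMAGES=false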
@@ -367,7 +178,7 @@ PY

  post {
    always {
      archiveArtifacts artifacts: 'build/*.json,build/*.out,build/*.rc,build/*.txt,build/*.xml', allowEmptyArchive: true, fingerprint: true
      archiveArtifacts artifacts: 'build/quality-gate.out,build/quality-gate.rc', allowEmptyArchive: true, fingerprint: true
    }
  }
}
README.md (14 changed lines)

@@ -97,15 +97,10 @@ Primary config path:
Keep these fields accurate:
- `expected_flux_source_url`
- `expected_flux_branch`
- `startup.service_checklist_explicit_only`
- `startup.service_checklist`
- `startup.critical_service_endpoints`
- `startup.require_ingress_checklist`
- `startup.require_node_inventory_reachability`
- `startup.node_inventory_reachability_required_nodes`
- `startup.node_ssh_auth_required_nodes`
- `startup.flux_health_required_kustomizations`
- `startup.workload_convergence_required_namespaces`
- `startup.ignore_unavailable_nodes`
- `coordination.role`
- `coordination.peer_hosts`
@@ -139,10 +134,9 @@ Installer behavior:

When adding nodes or services:
1. Update inventory and node mapping in config.
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
4. Add/adjust ingress expectations for exposed services.
5. Use temporary ignores only when truly intentional, then remove them.
6. Run `scripts/quality_gate.sh` before host deployment.
2. Add/adjust service checklist entries for anything user-facing or critical.
3. Add/adjust ingress expectations for exposed services.
4. Use temporary ignores only when truly intentional, then remove them.
5. Run `scripts/quality_gate.sh` before host deployment.

Recovery quality should improve over time: every drill should reduce manual work in the next drill.

@@ -51,7 +51,6 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes: []
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@@ -91,7 +90,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: false
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -136,26 +134,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes: []
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations: []
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces: []
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: false
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15
@@ -180,7 +170,6 @@ ups:
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:
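Editor's note: when the *_required_* scopes in these configs are narrowed or widened, it is worth checking they still line up with the explicit service checklist (per the README guidance above). A small sketch, assuming yq is available and with the config path as a placeholder:

# Compare the startup scopes that must stay aligned with the core service set.
yq '.startup.flux_health_required_kustomizations, .startup.workload_convergence_required_namespaces, .startup.node_ssh_auth_required_nodes' config.yaml  # path is a placeholder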
@@ -117,52 +117,8 @@ startup:
|
||||
require_node_inventory_reachability: true
|
||||
node_inventory_reachability_wait_seconds: 300
|
||||
node_inventory_reachability_poll_seconds: 5
|
||||
node_inventory_reachability_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
required_node_labels:
|
||||
titan-04:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-05:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-06:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-07:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-08:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-11:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-12:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-13:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-14:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-15:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-17:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-18:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-19:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-09:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
@@ -200,7 +156,6 @@ startup:
|
||||
admin_secret_name: keycloak-admin
|
||||
admin_secret_username_key: username
|
||||
admin_secret_password_key: password
|
||||
service_checklist_explicit_only: true
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
@@ -245,49 +200,18 @@ startup:
|
||||
require_node_ssh_auth: true
|
||||
node_ssh_auth_wait_seconds: 240
|
||||
node_ssh_auth_poll_seconds: 5
|
||||
node_ssh_auth_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
flux_health_required_kustomizations:
|
||||
- flux-system/core
|
||||
- flux-system/helm
|
||||
- flux-system/traefik
|
||||
- flux-system/cert-manager
|
||||
- flux-system/longhorn
|
||||
- flux-system/vault-csi
|
||||
- flux-system/vault-injector
|
||||
- flux-system/postgres
|
||||
- flux-system/vault
|
||||
- flux-system/keycloak
|
||||
- flux-system/oauth2-proxy
|
||||
- flux-system/gitea
|
||||
- flux-system/monitoring
|
||||
- flux-system/harbor
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
workload_convergence_required_namespaces:
|
||||
- vault
|
||||
- postgres
|
||||
- sso
|
||||
- gitea
|
||||
- monitoring
|
||||
- harbor
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
auto_recycle_stuck_pods: true
|
||||
auto_quarantine_scheduling_storms: true
|
||||
scheduling_storm_event_threshold: 30
|
||||
scheduling_storm_window_seconds: 180
|
||||
stuck_pod_grace_seconds: 180
|
||||
post_start_auto_heal_seconds: 60
|
||||
dead_node_cleanup_grace_seconds: 300
|
||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||
vault_unseal_breakglass_timeout_seconds: 15
|
||||
@@ -311,7 +235,6 @@ ups:
|
||||
target: statera@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.25
|
||||
on_battery_grace_seconds: 90
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@@ -117,52 +117,8 @@ startup:
|
||||
require_node_inventory_reachability: true
|
||||
node_inventory_reachability_wait_seconds: 300
|
||||
node_inventory_reachability_poll_seconds: 5
|
||||
node_inventory_reachability_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
required_node_labels:
|
||||
titan-04:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-05:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-06:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-07:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-08:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-11:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-12:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-13:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-14:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-15:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-17:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-18:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-19:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-09:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
@@ -200,7 +156,6 @@ startup:
|
||||
admin_secret_name: keycloak-admin
|
||||
admin_secret_username_key: username
|
||||
admin_secret_password_key: password
|
||||
service_checklist_explicit_only: true
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
@@ -245,49 +200,18 @@ startup:
|
||||
require_node_ssh_auth: true
|
||||
node_ssh_auth_wait_seconds: 240
|
||||
node_ssh_auth_poll_seconds: 5
|
||||
node_ssh_auth_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
flux_health_required_kustomizations:
|
||||
- flux-system/core
|
||||
- flux-system/helm
|
||||
- flux-system/traefik
|
||||
- flux-system/cert-manager
|
||||
- flux-system/longhorn
|
||||
- flux-system/vault-csi
|
||||
- flux-system/vault-injector
|
||||
- flux-system/postgres
|
||||
- flux-system/vault
|
||||
- flux-system/keycloak
|
||||
- flux-system/oauth2-proxy
|
||||
- flux-system/gitea
|
||||
- flux-system/monitoring
|
||||
- flux-system/harbor
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
workload_convergence_required_namespaces:
|
||||
- vault
|
||||
- postgres
|
||||
- sso
|
||||
- gitea
|
||||
- monitoring
|
||||
- harbor
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
auto_recycle_stuck_pods: true
|
||||
auto_quarantine_scheduling_storms: true
|
||||
scheduling_storm_event_threshold: 30
|
||||
scheduling_storm_window_seconds: 180
|
||||
stuck_pod_grace_seconds: 180
|
||||
post_start_auto_heal_seconds: 60
|
||||
dead_node_cleanup_grace_seconds: 300
|
||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||
vault_unseal_breakglass_timeout_seconds: 15
|
||||
@@ -311,7 +235,6 @@ ups:
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.25
|
||||
on_battery_grace_seconds: 90
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
	ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	seen := map[string]struct{}{}
	targets := make([]string, 0, len(nodes))
	for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
	for _, node := range nodes {
		node = strings.TrimSpace(node)
		if node == "" {
			continue
@@ -1,288 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type nodeReadyList struct {
|
||||
Items []struct {
|
||||
Metadata struct {
|
||||
Name string `json:"name"`
|
||||
} `json:"metadata"`
|
||||
Status struct {
|
||||
Conditions []struct {
|
||||
Type string `json:"type"`
|
||||
Status string `json:"status"`
|
||||
} `json:"conditions"`
|
||||
} `json:"status"`
|
||||
} `json:"items"`
|
||||
}
|
||||
|
||||
type podDeleteList struct {
|
||||
Items []struct {
|
||||
Metadata struct {
|
||||
Namespace string `json:"namespace"`
|
||||
Name string `json:"name"`
|
||||
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
|
||||
} `json:"metadata"`
|
||||
Spec struct {
|
||||
NodeName string `json:"nodeName"`
|
||||
} `json:"spec"`
|
||||
} `json:"items"`
|
||||
}
|
||||
|
||||
// RunPostStartAutoHeal runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
|
||||
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
|
||||
// post-start drift without rerunning the full startup flow.
|
||||
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
|
||||
return o.postStartAutoHeal(ctx)
|
||||
}
|
||||
|
||||
// postStartAutoHeal runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
|
||||
// Why: centralizes bounded post-start repair actions so recurring outage
|
||||
// patterns only trigger the specific remediation they need.
|
||||
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
}
|
||||
|
||||
errs := []string{}
|
||||
requestReconcile := false
|
||||
|
||||
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
|
||||
}
|
||||
|
||||
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
|
||||
if err != nil {
|
||||
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
|
||||
} else if vaultRecovered {
|
||||
requestReconcile = true
|
||||
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
|
||||
if err != nil {
|
||||
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
|
||||
} else if cleaned > 0 {
|
||||
requestReconcile = true
|
||||
}
|
||||
|
||||
if requestReconcile {
|
||||
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
|
||||
return o.requestFluxReconcile(ctx)
|
||||
})
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errors.New(strings.Join(errs, "; "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// autoRecoverSealedVault runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
|
||||
// Why: lets the daemon repair a later Vault reseal without waiting for a new
|
||||
// bootstrap run.
|
||||
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
|
||||
if o.runner.DryRun {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
||||
if err != nil {
|
||||
if isNotFoundErr(err) {
|
||||
return false, nil
|
||||
}
|
||||
return false, fmt.Errorf("vault pod phase check failed: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(phase) != "Running" {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
sealed, err := o.vaultSealed(ctx)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if !sealed {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
|
||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
|
||||
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
|
||||
// downstream secret consumers stop carrying stale failures from the sealed window.
|
||||
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
}
|
||||
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
|
||||
if _, err := o.kubectl(
|
||||
ctx,
|
||||
25*time.Second,
|
||||
"-n", "vault",
|
||||
"create", "job",
|
||||
"--from=cronjob/vault-k8s-auth-config",
|
||||
jobName,
|
||||
); err != nil {
|
||||
return fmt.Errorf("create job %s: %w", jobName, err)
|
||||
}
|
||||
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
|
||||
return nil
|
||||
}
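Editor's note: rerunVaultK8sAuthConfigJob just shells out to kubectl, so the manual equivalent of this repair is the one-liner the function wraps (only the job-name suffix below is illustrative):

# Re-run the Vault Kubernetes auth configuration from its CronJob template.
kubectl -n vault create job --from=cronjob/vault-k8s-auth-config "vault-k8s-auth-config-manual-$(date +%s)"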
|
||||
|
||||
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
|
||||
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
|
||||
// clear only that narrow failure class instead of leaving garbage behind forever.
|
||||
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
|
||||
if o.runner.DryRun {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
unavailable, err := o.unavailableNodeSet(ctx)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if len(unavailable) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("query pods: %w", err)
|
||||
}
|
||||
var pods podDeleteList
|
||||
if err := json.Unmarshal([]byte(out), &pods); err != nil {
|
||||
return 0, fmt.Errorf("decode pods: %w", err)
|
||||
}
|
||||
|
||||
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
|
||||
now := time.Now()
|
||||
count := 0
|
||||
for _, item := range pods.Items {
|
||||
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
|
||||
continue
|
||||
}
|
||||
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
|
||||
continue
|
||||
}
|
||||
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
|
||||
continue
|
||||
}
|
||||
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
|
||||
if _, err := o.kubectl(
|
||||
ctx,
|
||||
20*time.Second,
|
||||
"-n", item.Metadata.Namespace,
|
||||
"delete", "pod", item.Metadata.Name,
|
||||
"--grace-period=0",
|
||||
"--force",
|
||||
"--wait=false",
|
||||
); err != nil && !isNotFoundErr(err) {
|
||||
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
|
||||
}
|
||||
count++
|
||||
}
|
||||
if count > 0 {
|
||||
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
|
||||
}
|
||||
return count, nil
|
||||
}
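Editor's note: the same cleanup can be reproduced interactively; a sketch of the kubectl/jq equivalent of what this function does, with the grace-period check omitted and the node/pod names taken from the tests below purely as examples:

# Pods stuck terminating on a specific unavailable node.
kubectl get pods -A -o json \
  | jq -r '.items[] | select(.metadata.deletionTimestamp != null and .spec.nodeName == "titan-22") | "\(.metadata.namespace)/\(.metadata.name)"'

# Force-remove one of them, exactly as the function does.
kubectl -n maintenance delete pod old-stale --grace-period=0 --force --wait=false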
|
||||
|
||||
// unavailableNodeSet runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
|
||||
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
|
||||
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
|
||||
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query nodes: %w", err)
|
||||
}
|
||||
var nodes nodeReadyList
|
||||
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
|
||||
return nil, fmt.Errorf("decode nodes: %w", err)
|
||||
}
|
||||
|
||||
unavailable := map[string]struct{}{}
|
||||
for _, item := range nodes.Items {
|
||||
ready := ""
|
||||
for _, cond := range item.Status.Conditions {
|
||||
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
|
||||
ready = strings.TrimSpace(cond.Status)
|
||||
break
|
||||
}
|
||||
}
|
||||
if ready != "True" {
|
||||
unavailable[item.Metadata.Name] = struct{}{}
|
||||
}
|
||||
}
|
||||
return unavailable, nil
|
||||
}
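Editor's note: unavailableNodeSet treats any node whose Ready condition is missing or not "True" as unavailable, which matches this kubectl/jq check:

# Nodes whose Ready condition is missing or not "True".
kubectl get nodes -o json \
  | jq -r '.items[] | select(([.status.conditions[]? | select(.type == "Ready" and .status == "True")] | length) == 0) | .metadata.name'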
|
||||
|
||||
// requestFluxReconcile runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
|
||||
// Why: post-start repairs need a lightweight way to refresh GitOps health
|
||||
// without reusing the broader startup flux-resume flow.
|
||||
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
}
|
||||
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
if _, err := o.kubectl(
|
||||
ctx,
|
||||
25*time.Second,
|
||||
"-n", "flux-system",
|
||||
"annotate", "gitrepository", "flux-system",
|
||||
"reconcile.fluxcd.io/requestedAt="+now,
|
||||
"--overwrite",
|
||||
); err != nil {
|
||||
return fmt.Errorf("annotate flux source reconcile: %w", err)
|
||||
}
|
||||
if _, err := o.kubectl(
|
||||
ctx,
|
||||
25*time.Second,
|
||||
"-n", "flux-system",
|
||||
"annotate",
|
||||
"kustomizations.kustomize.toolkit.fluxcd.io",
|
||||
"--all",
|
||||
"reconcile.fluxcd.io/requestedAt="+now,
|
||||
"--overwrite",
|
||||
); err != nil {
|
||||
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
|
||||
}
|
||||
if _, err := o.kubectl(
|
||||
ctx,
|
||||
25*time.Second,
|
||||
"annotate",
|
||||
"--all-namespaces",
|
||||
"helmreleases.helm.toolkit.fluxcd.io",
|
||||
"--all",
|
||||
"reconcile.fluxcd.io/requestedAt="+now,
|
||||
"--overwrite",
|
||||
); err != nil {
|
||||
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
|
||||
}
|
||||
if o.runOverride == nil && o.runner.CommandExists("flux") {
|
||||
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
|
||||
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
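Editor's note: requestFluxReconcile only touches annotations plus an optional flux CLI call, so the manual fallback during an outage is the same short sequence the function issues:

NOW="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
kubectl -n flux-system annotate gitrepository flux-system "reconcile.fluxcd.io/requestedAt=${NOW}" --overwrite
kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all "reconcile.fluxcd.io/requestedAt=${NOW}" --overwrite
kubectl annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all "reconcile.fluxcd.io/requestedAt=${NOW}" --overwrite || true
flux reconcile source git flux-system -n flux-system --timeout=60s || true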
|
||||
@@ -1,296 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
|
||||
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
|
||||
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
|
||||
// truly stranded pods and tolerates already-gone objects.
|
||||
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
|
||||
t.Run("dry run skips", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||
orch.runner.DryRun = true
|
||||
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
||||
if err != nil || count != 0 {
|
||||
t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
|
||||
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
|
||||
}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get nodes -o json"),
|
||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "get pods -A -o json"),
|
||||
out: `{"items":[` +
|
||||
`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
|
||||
`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
|
||||
`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
|
||||
`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
|
||||
err: errors.New("pod old-stale not found"),
|
||||
},
|
||||
})
|
||||
|
||||
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Fatalf("expected one cleaned pod, got %d", count)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("query and decode errors surface", func(t *testing.T) {
|
||||
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get nodes -o json"),
|
||||
err: errors.New("nodes failed"),
|
||||
},
|
||||
})
|
||||
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
|
||||
t.Fatalf("expected node query error, got %v", err)
|
||||
}
|
||||
|
||||
decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get nodes -o json"),
|
||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "get pods -A -o json"),
|
||||
out: `{bad json`,
|
||||
},
|
||||
})
|
||||
if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
|
||||
t.Fatalf("expected pod decode error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("delete hard error surfaces", func(t *testing.T) {
|
||||
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
|
||||
}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get nodes -o json"),
|
||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "get pods -A -o json"),
|
||||
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
|
||||
err: errors.New("delete failed"),
|
||||
},
|
||||
})
|
||||
|
||||
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
||||
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
|
||||
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
|
||||
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
|
||||
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
|
||||
// Ready condition payloads need direct coverage too.
|
||||
func TestUnavailableNodeSetBranches(t *testing.T) {
|
||||
t.Run("decode error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
|
||||
})
|
||||
if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
|
||||
t.Fatalf("expected decode error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "get nodes -o json"),
|
||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
|
||||
},
|
||||
})
|
||||
nodes, err := orch.unavailableNodeSet(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("unavailableNodeSet failed: %v", err)
|
||||
}
|
||||
if _, ok := nodes["titan-22"]; !ok {
|
||||
t.Fatalf("expected titan-22 to be treated as unavailable")
|
||||
}
|
||||
if _, ok := nodes["titan-07"]; ok {
|
||||
t.Fatalf("did not expect titan-07 to be treated as unavailable")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
|
||||
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
|
||||
// Why: the post-start repair loop needs predictable Flux refresh behavior even
|
||||
// when one annotation call is flaky.
|
||||
func TestRequestFluxReconcileBranches(t *testing.T) {
|
||||
t.Run("dry run skips", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||
orch.runner.DryRun = true
|
||||
if err := orch.requestFluxReconcile(context.Background()); err != nil {
|
||||
t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("git source annotate error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
|
||||
err: errors.New("annotate failed"),
|
||||
},
|
||||
})
|
||||
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
|
||||
t.Fatalf("expected gitrepository annotate error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
|
||||
out: "",
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
|
||||
err: errors.New("annotate failed"),
|
||||
},
|
||||
})
|
||||
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
|
||||
t.Fatalf("expected kustomization annotate error, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
callLog := filepath.Join(tmpDir, "calls.log")
|
||||
kubectlPath := filepath.Join(tmpDir, "kubectl")
|
||||
fluxPath := filepath.Join(tmpDir, "flux")
|
||||
|
||||
kubectlScript := "#!/bin/sh\n" +
|
||||
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
||||
"case \"$*\" in\n" +
|
||||
" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
|
||||
"esac\n" +
|
||||
"exit 0\n"
|
||||
fluxScript := "#!/bin/sh\n" +
|
||||
"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
||||
"exit 0\n"
|
||||
|
||||
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
|
||||
t.Fatalf("write fake kubectl: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
|
||||
t.Fatalf("write fake flux: %v", err)
|
||||
}
|
||||
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
|
||||
|
||||
cfg := config.Config{
|
||||
State: config.State{
|
||||
Dir: t.TempDir(),
|
||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
||||
},
|
||||
}
|
||||
orch := &Orchestrator{
|
||||
cfg: cfg,
|
||||
runner: &execx.Runner{},
|
||||
store: state.New(cfg.State.RunHistoryPath),
|
||||
log: log.New(io.Discard, "", 0),
|
||||
}
|
||||
|
||||
if err := orch.requestFluxReconcile(context.Background()); err != nil {
|
||||
t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
|
||||
}
|
||||
calls, err := os.ReadFile(callLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read fake command log: %v", err)
|
||||
}
|
||||
logText := string(calls)
|
||||
if !strings.Contains(logText, "annotate gitrepository flux-system") {
|
||||
t.Fatalf("expected gitrepository annotate call, got %q", logText)
|
||||
}
|
||||
if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
|
||||
t.Fatalf("expected kustomization annotate call, got %q", logText)
|
||||
}
|
||||
if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
|
||||
t.Fatalf("expected flux reconcile command, got %q", logText)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("flux command failure is tolerated", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
callLog := filepath.Join(tmpDir, "calls.log")
|
||||
kubectlPath := filepath.Join(tmpDir, "kubectl")
|
||||
fluxPath := filepath.Join(tmpDir, "flux")
|
||||
|
||||
kubectlScript := "#!/bin/sh\n" +
|
||||
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
||||
"exit 0\n"
|
||||
fluxScript := "#!/bin/sh\n" +
|
||||
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
||||
"exit 1\n"
|
||||
|
||||
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
|
||||
t.Fatalf("write fake kubectl: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
|
||||
t.Fatalf("write fake flux: %v", err)
|
||||
}
|
||||
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
|
||||
|
||||
cfg := config.Config{
|
||||
State: config.State{
|
||||
Dir: t.TempDir(),
|
||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
||||
},
|
||||
}
|
||||
orch := &Orchestrator{
|
||||
cfg: cfg,
|
||||
runner: &execx.Runner{},
|
||||
store: state.New(cfg.State.RunHistoryPath),
|
||||
log: log.New(io.Discard, "", 0),
|
||||
}
|
||||
|
||||
if err := orch.requestFluxReconcile(context.Background()); err != nil {
|
||||
t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
|
||||
}
|
||||
calls, err := os.ReadFile(callLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read fake command log: %v", err)
|
||||
}
|
||||
if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
|
||||
t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -1,382 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"io"
|
||||
"log"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
|
||||
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
|
||||
// Why: covers the new daemon-triggered repair path for late Vault reseals and
|
||||
// stale terminating pods anchored to unavailable nodes.
|
||||
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
|
||||
cfg := config.Config{
|
||||
Startup: config.Startup{
|
||||
DeadNodeCleanupGraceSeconds: 300,
|
||||
RequiredNodeLabels: map[string]map[string]string{
|
||||
"titan-07": {"node-role.kubernetes.io/worker": "true"},
|
||||
},
|
||||
},
|
||||
State: config.State{
|
||||
Dir: t.TempDir(),
|
||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
||||
},
|
||||
}
|
||||
orch := &Orchestrator{
|
||||
cfg: cfg,
|
||||
runner: &execx.Runner{},
|
||||
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
|
||||
log: log.New(io.Discard, "", 0),
|
||||
}
|
||||
|
||||
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||
unsealCalls := 0
|
||||
jobCreated := false
|
||||
reconciled := false
|
||||
deleted := map[string]bool{}
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
if name != "kubectl" {
|
||||
return "", nil
|
||||
}
|
||||
joined := strings.Join(args, " ")
|
||||
switch {
|
||||
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
|
||||
return "", nil
|
||||
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||
return "Running", nil
|
||||
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
|
||||
if unsealCalls == 0 {
|
||||
return `{"initialized":true,"sealed":true}`, nil
|
||||
}
|
||||
return `{"initialized":true,"sealed":false}`, nil
|
||||
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
|
||||
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
|
||||
case strings.Contains(joined, "vault operator unseal"):
|
||||
unsealCalls++
|
||||
return "", nil
|
||||
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
|
||||
jobCreated = true
|
||||
return "", nil
|
||||
case strings.Contains(joined, "get nodes -o json"):
|
||||
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
||||
case strings.Contains(joined, "get pods -A -o json"):
|
||||
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
|
||||
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
|
||||
deleted["maintenance/stale-pod"] = true
|
||||
return "", nil
|
||||
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
|
||||
reconciled = true
|
||||
return "", nil
|
||||
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
|
||||
return "", nil
|
||||
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
|
||||
return "", nil
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
|
||||
if err := orch.postStartAutoHeal(context.Background()); err != nil {
|
||||
t.Fatalf("postStartAutoHeal failed: %v", err)
|
||||
}
|
||||
if unsealCalls != 1 {
|
||||
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
|
||||
}
|
||||
if !jobCreated {
|
||||
t.Fatalf("expected vault k8s auth config job to be created")
|
||||
}
|
||||
if !deleted["maintenance/stale-pod"] {
|
||||
t.Fatalf("expected stale unavailable-node pod to be deleted")
|
||||
}
|
||||
if !reconciled {
|
||||
t.Fatalf("expected flux reconcile request after repairs")
|
||||
}
|
||||
if deleted["logging/healthy-node-pod"] {
|
||||
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
|
||||
}
|
||||
}
|
||||
|
||||
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
|
||||
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
|
||||
// Why: proves the new post-start repair loop stays quiet when the specific
|
||||
// failure patterns are absent.
|
||||
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
|
||||
cfg := config.Config{
|
||||
Startup: config.Startup{
|
||||
DeadNodeCleanupGraceSeconds: 300,
|
||||
},
|
||||
State: config.State{
|
||||
Dir: t.TempDir(),
|
||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
||||
},
|
||||
}
|
||||
orch := &Orchestrator{
|
||||
cfg: cfg,
|
||||
runner: &execx.Runner{},
|
||||
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
|
||||
log: log.New(io.Discard, "", 0),
|
||||
}
|
||||
|
||||
unsealCalls := 0
|
||||
jobCreated := false
|
||||
reconciled := false
|
||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
||||
if name != "kubectl" {
|
||||
return "", nil
|
||||
}
|
||||
joined := strings.Join(args, " ")
|
||||
switch {
|
||||
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||
return "Running", nil
|
||||
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
|
||||
return `{"initialized":true,"sealed":false}`, nil
|
||||
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
|
||||
jobCreated = true
|
||||
return "", nil
|
||||
case strings.Contains(joined, "vault operator unseal"):
|
||||
unsealCalls++
|
||||
return "", nil
|
||||
case strings.Contains(joined, "get nodes -o json"):
|
||||
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
||||
case strings.Contains(joined, "get pods -A -o json"):
|
||||
return `{"items":[]}`, nil
|
||||
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
|
||||
reconciled = true
|
||||
return "", nil
|
||||
default:
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
orch.SetCommandOverrides(dispatch, dispatch)
|
||||
|
||||
if err := orch.postStartAutoHeal(context.Background()); err != nil {
|
||||
t.Fatalf("postStartAutoHeal failed: %v", err)
|
||||
}
|
||||
if unsealCalls != 0 {
|
||||
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
|
||||
}
|
||||
if jobCreated {
|
||||
t.Fatalf("did not expect vault auth config job creation")
|
||||
}
|
||||
if reconciled {
|
||||
t.Fatalf("did not expect flux reconcile request for healthy cluster")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
|
||||
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
|
||||
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
|
||||
// auto-heal never mutates cluster state during rehearsal runs.
|
||||
func TestRunPostStartAutoHealDryRun(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||
orch.runner.DryRun = true
|
||||
|
||||
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
|
||||
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
|
||||
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
|
||||
// Why: proves the daemon reports each failed sub-repair together instead of
|
||||
// hiding later failures behind the first problem.
|
||||
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
|
||||
cfg := config.Config{
|
||||
Startup: config.Startup{
|
||||
DeadNodeCleanupGraceSeconds: 300,
|
||||
RequiredNodeLabels: map[string]map[string]string{
|
||||
"titan-07": {"node-role.kubernetes.io/worker": "true"},
|
||||
},
|
||||
},
|
||||
}
|
||||
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
|
||||
err: errors.New("label failed"),
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
err: errors.New("vault phase failed"),
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "get nodes -o json"),
|
||||
err: errors.New("node query failed"),
|
||||
},
|
||||
})
|
||||
|
||||
err := orch.postStartAutoHeal(context.Background())
|
||||
if err == nil {
|
||||
t.Fatalf("expected aggregated error")
|
||||
}
|
||||
msg := err.Error()
|
||||
for _, want := range []string{
|
||||
"required node labels:",
|
||||
"vault auto-recovery:",
|
||||
"dead-node terminating pod cleanup:",
|
||||
} {
|
||||
if !strings.Contains(msg, want) {
|
||||
t.Fatalf("expected %q in %q", want, msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
|
||||
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
|
||||
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
|
||||
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
|
||||
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
|
||||
t.Run("dry run skips", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
||||
orch.runner.DryRun = true
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if err != nil || recovered {
|
||||
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("pod missing is quiet", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
err: errors.New("vault-0 not found"),
|
||||
},
|
||||
})
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if err != nil || recovered {
|
||||
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("phase check error surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
err: errors.New("phase check failed"),
|
||||
},
|
||||
})
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
|
||||
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("non-running pod defers", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
out: "Pending",
|
||||
},
|
||||
})
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if err != nil || recovered {
|
||||
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("status parse failure surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
out: "Running",
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
||||
out: "garbage",
|
||||
},
|
||||
})
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
|
||||
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("already unsealed stays quiet", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
out: "Running",
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
||||
out: `{"sealed":false}`,
|
||||
},
|
||||
})
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if err != nil || recovered {
|
||||
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("unseal failure surfaces", func(t *testing.T) {
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
||||
out: "Running",
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
||||
out: `{"sealed":true}`,
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
|
||||
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
|
||||
},
|
||||
{
|
||||
match: matchContains("kubectl", "vault operator unseal"),
|
||||
err: errors.New("exec boom"),
|
||||
},
|
||||
})
|
||||
|
||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
||||
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
|
||||
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
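Several of the branches above hinge on decoding the output of `vault status -format=json` and reading its sealed flag; a garbage payload has to surface as a parse error rather than trigger an unseal attempt. A minimal sketch of that decode step, using only the `sealed` field that the stubs above exercise and the "parse vault status" wording the test expects:

package main

import (
    "encoding/json"
    "fmt"
)

// vaultStatus captures only the field these branches care about.
type vaultStatus struct {
    Sealed bool `json:"sealed"`
}

func parseSealed(out string) (bool, error) {
    var st vaultStatus
    if err := json.Unmarshal([]byte(out), &st); err != nil {
        return false, fmt.Errorf("parse vault status: %w", err)
    }
    return st.Sealed, nil
}

func main() {
    sealed, err := parseSealed(`{"sealed":true}`)
    fmt.Println(sealed, err) // true <nil>
    _, err = parseSealed("garbage")
    fmt.Println(err != nil) // true
}
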
|
||||
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
    t.Run("dry run skips", func(t *testing.T) {
        orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
        orch.runner.DryRun = true
        if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
            t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
        }
    })

    t.Run("create error surfaces", func(t *testing.T) {
        orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
            {
                match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
                err: errors.New("create failed"),
            },
        })
        err := orch.rerunVaultK8sAuthConfigJob(context.Background())
        if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
            t.Fatalf("expected create-job error, got %v", err)
        }
    })
}
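The create-error subtest expects the failure message to carry a job name that starts with vault-k8s-auth-config-autoheal-, which implies the daemon derives a unique name before invoking `kubectl create job --from=cronjob/...`. How the suffix is generated is not shown in this diff; the sketch below assumes a timestamp suffix purely for illustration.

package main

import (
    "fmt"
    "time"
)

// autoHealJobName derives a unique job name from the cronjob it re-runs.
// The timestamp suffix is an assumption; only the prefix is visible in the test.
func autoHealJobName(now time.Time) string {
    return fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", now.Unix())
}

func main() {
    name := autoHealJobName(time.Now())
    args := []string{"-n", "vault", "create", "job", name, "--from=cronjob/vault-k8s-auth-config"}
    fmt.Println(args)
}
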
@@ -227,31 +227,6 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
    return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
}

// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
// Why: lets startup defer vault unseal until the pod is actually runnable, while
// keeping the direct unseal helper strict for explicit recovery paths and tests.
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
    if o.runner.DryRun {
        return false, "", nil
    }

    phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
    if err != nil {
        if isNotFoundErr(err) {
            return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
        }
        return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
    }

    trimmedPhase := strings.TrimSpace(phase)
    if trimmedPhase != "Running" {
        return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
    }

    return false, "", o.ensureVaultUnsealed(ctx)
}

// ensureVaultUnsealed runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

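ensureVaultUnsealedWhenRunnable leans on isNotFoundErr to tell an absent vault-0 pod apart from a real query failure. That helper is not part of this hunk, so the following is only a guess at its shape, matching on kubectl's NotFound wording, and is labeled as an assumption rather than the repository's definition.

package main

import (
    "errors"
    "fmt"
    "strings"
)

// isNotFoundErrSketch is a hypothetical stand-in for the repo's isNotFoundErr:
// kubectl reports missing objects with a "NotFound" / "not found" message, so a
// substring check on the wrapped error text is one plausible implementation.
func isNotFoundErrSketch(err error) bool {
    if err == nil {
        return false
    }
    msg := strings.ToLower(err.Error())
    return strings.Contains(msg, "not found") || strings.Contains(msg, "notfound")
}

func main() {
    err := errors.New(`Error from server (NotFound): pods "vault-0" not found`)
    fmt.Println(isNotFoundErrSketch(err)) // true
}
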
@@ -143,8 +143,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
        return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
    }
    ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
    required := o.startupRequiredFluxKustomizations()
    requiredSeen := map[string]struct{}{}
    notReady := []string{}
    for _, ks := range list.Items {
        ns := strings.TrimSpace(ks.Metadata.Namespace)
@@ -156,12 +154,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
        if ks.Spec.Suspend {
            continue
        }
        if len(required) > 0 {
            if _, ok := required[full]; !ok {
                continue
            }
            requiredSeen[full] = struct{}{}
        }
        if _, ok := ignored[full]; ok {
            continue
        }
@@ -181,25 +173,10 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
        }
        notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
    }
    if len(required) > 0 {
        missing := []string{}
        for full := range required {
            if _, ok := requiredSeen[full]; !ok {
                missing = append(missing, full+"(missing)")
            }
        }
        if len(missing) > 0 {
            sort.Strings(missing)
            notReady = append(notReady, missing...)
        }
    }
    if len(notReady) > 0 {
        sort.Strings(notReady)
        return false, "not ready: " + joinLimited(notReady, 6), nil
    }
    if len(required) > 0 {
        return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
    }
    return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
}

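The not-ready detail string caps its list with joinLimited(notReady, 6), which is referenced but not defined in this hunk. A plausible sketch of such a limiter, joining the first n entries and summarizing the rest, is shown below; the real helper may format its output differently.

package main

import (
    "fmt"
    "strings"
)

// joinLimitedSketch joins at most n entries and notes how many were omitted,
// keeping status lines readable when many kustomizations are unhealthy.
func joinLimitedSketch(items []string, n int) string {
    if len(items) <= n {
        return strings.Join(items, ", ")
    }
    return fmt.Sprintf("%s, and %d more", strings.Join(items[:n], ", "), len(items)-n)
}

func main() {
    ks := []string{"flux-system/apps(missing)", "flux-system/infra(not ready)", "media/stack(not ready)"}
    fmt.Println(joinLimitedSketch(ks, 2))
}
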
@@ -19,7 +19,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
    if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
        return nil
    }
    ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
    nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
    for node := range o.cfg.Startup.RequiredNodeLabels {
        node = strings.TrimSpace(node)
@@ -29,10 +28,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
    }
    sort.Strings(nodes)
    for _, node := range nodes {
        if _, skip := ignored[node]; skip {
            o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
            continue
        }
        labels := o.cfg.Startup.RequiredNodeLabels[node]
        if len(labels) == 0 {
            continue
@@ -60,11 +55,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
            continue
        }
        if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
            if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
                o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
                o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
                continue
            }
            return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
        }
        o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))

@@ -37,7 +37,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
        return invErr
    }
    o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
    o.maybeRunEarlyVaultUnseal(ctx)
    o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
    if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
        o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
@@ -180,9 +179,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
        }
    }
    o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
    if err := o.runStartupVaultUnsealGate(ctx); err != nil {
        return err
    }
    if err := o.ensureRequiredNodeLabels(ctx); err != nil {
        return err
    }
@@ -480,3 +476,18 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
    o.log.Printf("shutdown flow complete")
    return nil
}

// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
    switch strings.TrimSpace(raw) {
    case "", "config", "cluster-only":
        return "cluster-only", nil
    case "poweroff":
        return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
    default:
        return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
    }
}

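normalizeShutdownMode is small enough that its contract reads best as a table of inputs and results. The sketch below exercises the function exactly as defined above (the empty string and legacy "config" collapse to cluster-only, "poweroff" is rejected, anything else is unsupported); the function body is copied verbatim so the example runs standalone.

package main

import (
    "fmt"
    "strings"
)

// Copied verbatim from the function above so this example is self-contained.
func normalizeShutdownMode(raw string) (string, error) {
    switch strings.TrimSpace(raw) {
    case "", "config", "cluster-only":
        return "cluster-only", nil
    case "poweroff":
        return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
    default:
        return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
    }
}

func main() {
    for _, raw := range []string{"", "config", "cluster-only", "poweroff", "drain"} {
        mode, err := normalizeShutdownMode(raw)
        fmt.Printf("%-14q -> mode=%q err=%v\n", raw, mode, err)
    }
}
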
@@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
    ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
    targets := make([]string, 0, len(o.inventoryNodesForValidation()))
    seen := map[string]struct{}{}
    for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
    for _, node := range o.inventoryNodesForValidation() {
        node = strings.TrimSpace(node)
        if node == "" {
            continue

@ -1,261 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
|
||||
// Why: a non-core workload that cannot schedule can emit enough warning events to
|
||||
// thrash the control plane datastore; quarantine keeps startup moving while
|
||||
// preserving core services.
|
||||
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
|
||||
if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
|
||||
return
|
||||
}
|
||||
if lastAttempt != nil {
|
||||
*lastAttempt = now
|
||||
}
|
||||
o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
|
||||
return o.quarantineSchedulingStormWorkloads(ctx)
|
||||
})
|
||||
}
|
||||
|
||||
// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
|
||||
// Why: limits startup-only mitigation to workloads proven to be generating a
|
||||
// scheduling event storm, instead of scaling optional apps down blindly.
|
||||
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
|
||||
podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return fmt.Errorf("query pods for scheduling storm scan: %w", err)
|
||||
}
|
||||
var pods podList
|
||||
if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
|
||||
return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
|
||||
}
|
||||
|
||||
rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
|
||||
}
|
||||
var rsList replicaSetList
|
||||
if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
|
||||
return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
|
||||
}
|
||||
|
||||
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return fmt.Errorf("query events for scheduling storm scan: %w", err)
|
||||
}
|
||||
var events eventList
|
||||
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
|
||||
return fmt.Errorf("decode events for scheduling storm scan: %w", err)
|
||||
}
|
||||
|
||||
workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
|
||||
}
|
||||
var workloads workloadList
|
||||
if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
|
||||
return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
|
||||
}
|
||||
|
||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
||||
eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
|
||||
if eventThreshold <= 0 {
|
||||
eventThreshold = 30
|
||||
}
|
||||
window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
|
||||
if window <= 0 {
|
||||
window = 3 * time.Minute
|
||||
}
|
||||
|
||||
podsByKey := map[string]podResource{}
|
||||
for _, pod := range pods.Items {
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
name := strings.TrimSpace(pod.Metadata.Name)
|
||||
if ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
podsByKey[ns+"/"+name] = pod
|
||||
}
|
||||
|
||||
rsOwners := map[string]ownerReference{}
|
||||
for _, rs := range rsList.Items {
|
||||
ns := strings.TrimSpace(rs.Metadata.Namespace)
|
||||
name := strings.TrimSpace(rs.Metadata.Name)
|
||||
if ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
for _, owner := range rs.Metadata.OwnerReferences {
|
||||
kind := strings.TrimSpace(owner.Kind)
|
||||
ownerName := strings.TrimSpace(owner.Name)
|
||||
if kind == "" || ownerName == "" {
|
||||
continue
|
||||
}
|
||||
rsOwners[ns+"/"+name] = owner
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
workloadDesired := map[string]int32{}
|
||||
for _, item := range workloads.Items {
|
||||
kind := strings.ToLower(strings.TrimSpace(item.Kind))
|
||||
ns := strings.TrimSpace(item.Metadata.Namespace)
|
||||
name := strings.TrimSpace(item.Metadata.Name)
|
||||
if kind == "" || ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
desired, _, ok := desiredReady(item)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
workloadDesired[ns+"/"+kind+"/"+name] = desired
|
||||
}
|
||||
|
||||
quarantined := []string{}
|
||||
seen := map[string]struct{}{}
|
||||
now := time.Now()
|
||||
for _, event := range events.Items {
|
||||
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(event.Reason) != "FailedScheduling" {
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
|
||||
continue
|
||||
}
|
||||
lastSeen := eventLastObservedAt(event)
|
||||
if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
|
||||
continue
|
||||
}
|
||||
count := eventObservationCount(event)
|
||||
if count < eventThreshold {
|
||||
continue
|
||||
}
|
||||
podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
|
||||
pod, ok := podsByKey[podKey]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
|
||||
continue
|
||||
}
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
if _, ok := requiredNamespaces[ns]; ok {
|
||||
continue
|
||||
}
|
||||
if _, ok := ignoredNamespaces[ns]; ok {
|
||||
continue
|
||||
}
|
||||
if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
|
||||
continue
|
||||
}
|
||||
if podTargetsIgnoredNode(pod, ignoredNodes) {
|
||||
continue
|
||||
}
|
||||
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
|
||||
continue
|
||||
}
|
||||
workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
|
||||
if _, done := seen[workloadKey]; done {
|
||||
continue
|
||||
}
|
||||
desired := workloadDesired[workloadKey]
|
||||
if desired <= 0 {
|
||||
continue
|
||||
}
|
||||
if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
|
||||
return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
|
||||
}
|
||||
seen[workloadKey] = struct{}{}
|
||||
quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
|
||||
}
|
||||
|
||||
if len(quarantined) == 0 {
|
||||
return nil
|
||||
}
|
||||
sort.Strings(quarantined)
|
||||
detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
|
||||
o.log.Printf("%s", detail)
|
||||
o.noteStartupAutoHeal(detail)
|
||||
return nil
|
||||
}
|
||||
|
||||
// schedulingStormOwnerWorkload runs one orchestration or CLI step.
|
||||
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
|
||||
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
|
||||
// operate on the owning deployment or statefulset.
|
||||
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
for _, owner := range pod.Metadata.OwnerReferences {
|
||||
switch strings.TrimSpace(owner.Kind) {
|
||||
case "StatefulSet":
|
||||
if name := strings.TrimSpace(owner.Name); name != "" {
|
||||
return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
|
||||
}
|
||||
case "ReplicaSet":
|
||||
rsName := strings.TrimSpace(owner.Name)
|
||||
if rsName == "" {
|
||||
continue
|
||||
}
|
||||
rsOwner, ok := rsOwners[ns+"/"+rsName]
|
||||
if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
|
||||
continue
|
||||
}
|
||||
return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
|
||||
}
|
||||
}
|
||||
return startupWorkload{}, false
|
||||
}
|
||||
|
||||
// eventObservationCount runs one orchestration or CLI step.
|
||||
// Signature: eventObservationCount(event eventResource) int.
|
||||
// Why: event count can live either on the root event or in the series payload;
|
||||
// using the max keeps detection stable across Kubernetes versions.
|
||||
func eventObservationCount(event eventResource) int {
|
||||
count := event.Count
|
||||
if event.Series.Count > count {
|
||||
count = event.Series.Count
|
||||
}
|
||||
if count < 1 {
|
||||
return 1
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// eventLastObservedAt runs one orchestration or CLI step.
|
||||
// Signature: eventLastObservedAt(event eventResource) time.Time.
|
||||
// Why: event recency fields vary by cluster version; prefer the newest explicit
|
||||
// observation time and fall back to creation time when needed.
|
||||
func eventLastObservedAt(event eventResource) time.Time {
|
||||
switch {
|
||||
case !event.Series.LastObservedTime.IsZero():
|
||||
return event.Series.LastObservedTime
|
||||
case !event.LastTimestamp.IsZero():
|
||||
return event.LastTimestamp
|
||||
case !event.EventTime.IsZero():
|
||||
return event.EventTime
|
||||
default:
|
||||
return event.Metadata.CreationTimestamp
|
||||
}
|
||||
}
|
||||
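The quarantine scan above only acts when a FailedScheduling warning is both recent (inside the configured window, defaulting to 3 minutes) and loud (count at or above the threshold, defaulting to 30). The sketch below restates just that qualification check on plain values using the same defaults; the names are illustrative, not the repository's.

package main

import (
    "fmt"
    "time"
)

// stormQualifies mirrors the two guards used in the scan above: the event must
// have been observed recently and repeated at least threshold times.
func stormQualifies(count int, lastSeen, now time.Time, threshold int, window time.Duration) bool {
    if threshold <= 0 {
        threshold = 30 // default SchedulingStormEventThreshold
    }
    if window <= 0 {
        window = 3 * time.Minute // default SchedulingStormWindowSeconds
    }
    if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
        return false
    }
    return count >= threshold
}

func main() {
    now := time.Now()
    fmt.Println(stormQualifies(45, now.Add(-time.Minute), now, 0, 0))    // true
    fmt.Println(stormQualifies(45, now.Add(-10*time.Minute), now, 0, 0)) // false: too old
    fmt.Println(stormQualifies(5, now, now, 0, 0))                       // false: below threshold
}
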
@@ -1,21 +0,0 @@
package cluster

import (
    "fmt"
    "strings"
)

// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
    switch strings.TrimSpace(raw) {
    case "", "config", "cluster-only":
        return "cluster-only", nil
    case "poweroff":
        return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
    default:
        return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
    }
}
@ -1,81 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import "strings"
|
||||
|
||||
// startupRequiredNodes runs one orchestration or CLI step.
|
||||
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
|
||||
// Why: lets startup enforce a smaller core node set during outage recovery
|
||||
// without losing the stricter all-nodes behavior when no override is configured.
|
||||
func startupRequiredNodes(nodes []string, required []string) []string {
|
||||
requiredSet := makeStringSet(required)
|
||||
if len(requiredSet) == 0 {
|
||||
return nodes
|
||||
}
|
||||
filtered := make([]string, 0, len(nodes))
|
||||
for _, node := range nodes {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := requiredSet[node]; ok {
|
||||
filtered = append(filtered, node)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
// startupNodeStrictlyRequired runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
|
||||
// Why: absent or broken non-core nodes should not block recovery-only actions
|
||||
// like label reconciliation once the operator has narrowed startup to core nodes.
|
||||
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
return false
|
||||
}
|
||||
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
|
||||
return true
|
||||
}
|
||||
for _, controlPlane := range o.cfg.ControlPlanes {
|
||||
if strings.TrimSpace(controlPlane) == node {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
|
||||
return true
|
||||
}
|
||||
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
|
||||
}
|
||||
|
||||
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
|
||||
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
|
||||
// optional stacks free to converge after bootstrap succeeds.
|
||||
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
|
||||
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
|
||||
}
|
||||
|
||||
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
|
||||
// Why: keeps workload readiness scoped to core namespaces during recovery while
|
||||
// preserving broad convergence checks when no explicit core list is configured.
|
||||
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
|
||||
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
|
||||
}
|
||||
|
||||
// containsNode runs one orchestration or CLI step.
|
||||
// Signature: containsNode(entries []string, needle string) bool.
|
||||
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
|
||||
// recovery gates to a declared core set.
|
||||
func containsNode(entries []string, needle string) bool {
|
||||
needle = strings.TrimSpace(needle)
|
||||
if needle == "" {
|
||||
return false
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if strings.TrimSpace(entry) == needle {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
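startupRequiredNodes only narrows the node list when a required set is configured; with an empty override it returns the input unchanged. Below is a condensed, self-contained sketch of that same filtering, with makeStringSet re-implemented locally since it is defined elsewhere in the package.

package main

import (
    "fmt"
    "strings"
)

// localStringSet stands in for the package's makeStringSet helper.
func localStringSet(entries []string) map[string]struct{} {
    set := map[string]struct{}{}
    for _, e := range entries {
        if e = strings.TrimSpace(e); e != "" {
            set[e] = struct{}{}
        }
    }
    return set
}

// requiredNodes condenses the filtering shown above: no override means "all nodes".
func requiredNodes(nodes, required []string) []string {
    requiredSet := localStringSet(required)
    if len(requiredSet) == 0 {
        return nodes
    }
    filtered := make([]string, 0, len(nodes))
    for _, node := range nodes {
        node = strings.TrimSpace(node)
        if _, ok := requiredSet[node]; ok {
            filtered = append(filtered, node)
        }
    }
    return filtered
}

func main() {
    inventory := []string{"titan-0a", "titan-07", "titan-13"}
    fmt.Println(requiredNodes(inventory, nil))                  // all three nodes
    fmt.Println(requiredNodes(inventory, []string{"titan-0a"})) // core-only subset
}
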
@ -1,52 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
|
||||
// Why: gives startup a best-effort Vault recovery path when the API is already
|
||||
// live, without consuming the hard startup failure path before workloads recover.
|
||||
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
|
||||
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
|
||||
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
||||
if err != nil {
|
||||
o.log.Printf("warning: early vault unseal deferred: %v", err)
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
|
||||
return
|
||||
}
|
||||
if deferred {
|
||||
o.log.Printf("vault early unseal deferred: %s", detail)
|
||||
o.noteStartupAutoHeal(detail)
|
||||
return
|
||||
}
|
||||
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
|
||||
}
|
||||
|
||||
// runStartupVaultUnsealGate runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
|
||||
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
|
||||
// defer cleanly until critical workload recovery when the pod is not runnable yet.
|
||||
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
|
||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
|
||||
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
||||
if err != nil {
|
||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
||||
return err
|
||||
}
|
||||
if deferred {
|
||||
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
|
||||
o.noteStartupAutoHeal(detail)
|
||||
o.noteStartupCheck("vault-unseal", true, detail)
|
||||
return nil
|
||||
}
|
||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
||||
return nil
|
||||
}
|
||||
@ -177,46 +177,6 @@ type jobConditionRef struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
type eventList struct {
|
||||
Items []eventResource `json:"items"`
|
||||
}
|
||||
|
||||
type eventResource struct {
|
||||
Metadata struct {
|
||||
Namespace string `json:"namespace"`
|
||||
CreationTimestamp time.Time `json:"creationTimestamp"`
|
||||
} `json:"metadata"`
|
||||
InvolvedObject struct {
|
||||
Kind string `json:"kind"`
|
||||
Namespace string `json:"namespace"`
|
||||
Name string `json:"name"`
|
||||
} `json:"involvedObject"`
|
||||
Type string `json:"type"`
|
||||
Reason string `json:"reason"`
|
||||
Message string `json:"message"`
|
||||
Count int `json:"count"`
|
||||
EventTime time.Time `json:"eventTime"`
|
||||
LastTimestamp time.Time `json:"lastTimestamp"`
|
||||
Series eventSeries `json:"series"`
|
||||
}
|
||||
|
||||
type eventSeries struct {
|
||||
Count int `json:"count"`
|
||||
LastObservedTime time.Time `json:"lastObservedTime"`
|
||||
}
|
||||
|
||||
type replicaSetList struct {
|
||||
Items []replicaSetResource `json:"items"`
|
||||
}
|
||||
|
||||
type replicaSetResource struct {
|
||||
Metadata struct {
|
||||
Namespace string `json:"namespace"`
|
||||
Name string `json:"name"`
|
||||
OwnerReferences []ownerReference `json:"ownerReferences"`
|
||||
} `json:"metadata"`
|
||||
}
|
||||
|
||||
type workloadResource struct {
|
||||
Kind string `json:"kind"`
|
||||
Metadata struct {
|
||||
@ -261,7 +221,6 @@ type podResource struct {
|
||||
|
||||
type ownerReference struct {
|
||||
Kind string `json:"kind"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type podContainerStatus struct {
|
||||
|
||||
@ -26,12 +26,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
|
||||
lastLogged := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
lastReplicaHeal := time.Time{}
|
||||
lastSchedulingStormHeal := time.Time{}
|
||||
for {
|
||||
prevFailure := lastFailure
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
||||
o.maybeAutoQuarantineSchedulingStorms(ctx, &lastSchedulingStormHeal)
|
||||
ready, detail, err := o.workloadConvergenceReady(ctx)
|
||||
if err != nil {
|
||||
lastFailure = err.Error()
|
||||
@ -73,7 +71,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
||||
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
||||
return false, "", fmt.Errorf("decode controllers: %w", err)
|
||||
}
|
||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
||||
@ -87,11 +84,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
||||
if kind == "" || ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
if len(requiredNamespaces) > 0 {
|
||||
if _, ok := requiredNamespaces[ns]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if _, ok := ignoredNamespaces[ns]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -116,7 +116,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
||||
return nil, fmt.Errorf("decode pods: %w", err)
|
||||
}
|
||||
|
||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
stuckReasons := map[string]struct{}{
|
||||
@ -139,11 +138,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
||||
if ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
if len(requiredNamespaces) > 0 {
|
||||
if _, ok := requiredNamespaces[ns]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if _, ok := ignoredNamespaces[ns]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -1,88 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestHookMaybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
|
||||
// Why: exposes the scheduling-storm trigger guard to the split top-level test module.
|
||||
func (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
|
||||
o.maybeAutoQuarantineSchedulingStorms(ctx, lastAttempt)
|
||||
}
|
||||
|
||||
// TestHookQuarantineSchedulingStormWorkloads runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error.
|
||||
// Why: exposes the scheduling-storm auto-heal body to the split top-level test module.
|
||||
func (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error {
|
||||
return o.quarantineSchedulingStormWorkloads(ctx)
|
||||
}
|
||||
|
||||
// TestHookSchedulingStormOwnerWorkload runs one orchestration or CLI step.
|
||||
// Signature: TestHookSchedulingStormOwnerWorkload(namespace string, ownerKind string, ownerName string, rsOwnerKind string, rsOwnerName string) (string, bool).
|
||||
// Why: exposes owner-resolution behavior without leaking internal workload types.
|
||||
func TestHookSchedulingStormOwnerWorkload(
|
||||
namespace string,
|
||||
ownerKind string,
|
||||
ownerName string,
|
||||
rsOwnerKind string,
|
||||
rsOwnerName string,
|
||||
) (string, bool) {
|
||||
var pod podResource
|
||||
pod.Metadata.Namespace = strings.TrimSpace(namespace)
|
||||
pod.Metadata.OwnerReferences = []ownerReference{{
|
||||
Kind: strings.TrimSpace(ownerKind),
|
||||
Name: strings.TrimSpace(ownerName),
|
||||
}}
|
||||
rsOwners := map[string]ownerReference{}
|
||||
if rsName := strings.TrimSpace(ownerName); rsName != "" && strings.TrimSpace(ownerKind) == "ReplicaSet" {
|
||||
rsOwners[pod.Metadata.Namespace+"/"+rsName] = ownerReference{
|
||||
Kind: strings.TrimSpace(rsOwnerKind),
|
||||
Name: strings.TrimSpace(rsOwnerName),
|
||||
}
|
||||
}
|
||||
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
|
||||
if !ok {
|
||||
return "", false
|
||||
}
|
||||
return fmt.Sprintf("%s/%s/%s", workload.Namespace, workload.Kind, workload.Name), true
|
||||
}
|
||||
|
||||
// TestHookEventObservationCount runs one orchestration or CLI step.
|
||||
// Signature: TestHookEventObservationCount(count int, seriesCount int) int.
|
||||
// Why: exposes event-count normalization used by scheduling-storm detection.
|
||||
func TestHookEventObservationCount(count int, seriesCount int) int {
|
||||
return eventObservationCount(eventResource{
|
||||
Count: count,
|
||||
Series: eventSeries{
|
||||
Count: seriesCount,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookEventLastObservedAt runs one orchestration or CLI step.
|
||||
// Signature: TestHookEventLastObservedAt(seriesLastObserved time.Time, lastTimestamp time.Time, eventTime time.Time, creationTimestamp time.Time) time.Time.
|
||||
// Why: exposes event-time fallback behavior used by scheduling-storm detection.
|
||||
func TestHookEventLastObservedAt(
|
||||
seriesLastObserved time.Time,
|
||||
lastTimestamp time.Time,
|
||||
eventTime time.Time,
|
||||
creationTimestamp time.Time,
|
||||
) time.Time {
|
||||
return eventLastObservedAt(eventResource{
|
||||
LastTimestamp: lastTimestamp,
|
||||
EventTime: eventTime,
|
||||
Series: eventSeries{
|
||||
LastObservedTime: seriesLastObserved,
|
||||
},
|
||||
Metadata: struct {
|
||||
Namespace string `json:"namespace"`
|
||||
CreationTimestamp time.Time `json:"creationTimestamp"`
|
||||
}{
|
||||
CreationTimestamp: creationTimestamp,
|
||||
},
|
||||
})
|
||||
}
|
||||
@ -1,55 +0,0 @@
|
||||
package cluster
|
||||
|
||||
import "context"
|
||||
|
||||
// TestHookStartupRequiredNodes runs one orchestration or CLI step.
|
||||
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
|
||||
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
|
||||
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
|
||||
return startupRequiredNodes(nodes, required)
|
||||
}
|
||||
|
||||
// TestHookContainsNode runs one orchestration or CLI step.
|
||||
// Signature: TestHookContainsNode(entries []string, needle string) bool.
|
||||
// Why: exposes the small startup-scope membership helper to top-level tests.
|
||||
func TestHookContainsNode(entries []string, needle string) bool {
|
||||
return containsNode(entries, needle)
|
||||
}
|
||||
|
||||
// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
|
||||
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
|
||||
// non-core nodes stop blocking bootstrap.
|
||||
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
|
||||
return o.startupNodeStrictlyRequired(node)
|
||||
}
|
||||
|
||||
// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
|
||||
// Why: exposes flux startup scoping so top-level tests can confirm only core
|
||||
// kustomizations block emergency bootstrap.
|
||||
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
|
||||
return o.startupRequiredFluxKustomizations()
|
||||
}
|
||||
|
||||
// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
|
||||
// Why: exposes workload namespace startup scoping so top-level tests can
|
||||
// confirm only core workloads block emergency bootstrap.
|
||||
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
|
||||
return o.startupRequiredWorkloadNamespaces()
|
||||
}
|
||||
|
||||
// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
|
||||
// Why: exposes the early startup Vault deferral helper to top-level tests.
|
||||
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
|
||||
o.maybeRunEarlyVaultUnseal(ctx)
|
||||
}
|
||||
|
||||
// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
|
||||
// Why: exposes the startup Vault gate helper to top-level tests.
|
||||
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
|
||||
return o.runStartupVaultUnsealGate(ctx)
|
||||
}
|
||||
@ -33,9 +33,6 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||
c.Startup.NodeInventoryReachPollSeconds = 5
|
||||
}
|
||||
if c.Startup.NodeInventoryReachRequiredNodes == nil {
|
||||
c.Startup.NodeInventoryReachRequiredNodes = []string{}
|
||||
}
|
||||
if c.Startup.RequiredNodeLabels == nil {
|
||||
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
@ -124,11 +121,7 @@ func (c *Config) applyDefaults() {
|
||||
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
|
||||
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
|
||||
}
|
||||
if c.Startup.ServiceChecklistExplicitOnly {
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
|
||||
} else {
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||
}
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||
for i := range c.Startup.ServiceChecklist {
|
||||
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||
@ -159,18 +152,12 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||
c.Startup.NodeSSHAuthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.NodeSSHAuthRequiredNodes == nil {
|
||||
c.Startup.NodeSSHAuthRequiredNodes = []string{}
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
c.Startup.FluxHealthWaitSeconds = 900
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
c.Startup.FluxHealthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.FluxHealthRequiredKustomizations == nil {
|
||||
c.Startup.FluxHealthRequiredKustomizations = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||
}
|
||||
@ -180,9 +167,6 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||
}
|
||||
@ -195,12 +179,6 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
c.Startup.StuckPodGraceSeconds = 180
|
||||
}
|
||||
if c.Startup.PostStartAutoHealSeconds <= 0 {
|
||||
c.Startup.PostStartAutoHealSeconds = 60
|
||||
}
|
||||
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
||||
c.Startup.DeadNodeCleanupGraceSeconds = 300
|
||||
}
|
||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
||||
}
|
||||
@ -243,12 +221,6 @@ func (c *Config) applyDefaults() {
|
||||
if c.UPS.TelemetryTimeoutSeconds <= 0 {
|
||||
c.UPS.TelemetryTimeoutSeconds = 90
|
||||
}
|
||||
if c.Startup.SchedulingStormEventThreshold <= 0 {
|
||||
c.Startup.SchedulingStormEventThreshold = 30
|
||||
}
|
||||
if c.Startup.SchedulingStormWindowSeconds <= 0 {
|
||||
c.Startup.SchedulingStormWindowSeconds = 180
|
||||
}
|
||||
if c.Coordination.ForwardShutdownConfig == "" {
|
||||
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
||||
}
|
||||
|
||||
@ -39,25 +39,24 @@ func defaults() Config {
|
||||
"maintenance",
|
||||
},
|
||||
Startup: Startup{
|
||||
APIWaitSeconds: 1200,
|
||||
APIPollSeconds: 2,
|
||||
ShutdownCooldownSeconds: 45,
|
||||
RequireNodeInventoryReach: true,
|
||||
NodeInventoryReachWaitSeconds: 300,
|
||||
NodeInventoryReachPollSeconds: 5,
|
||||
NodeInventoryReachRequiredNodes: []string{},
|
||||
RequireTimeSync: true,
|
||||
TimeSyncWaitSeconds: 240,
|
||||
TimeSyncPollSeconds: 5,
|
||||
TimeSyncMode: "quorum",
|
||||
TimeSyncQuorum: 2,
|
||||
ReconcileAccessOnBoot: true,
|
||||
AutoEtcdRestoreOnAPIFailure: true,
|
||||
EtcdRestoreControlPlane: "titan-0a",
|
||||
RequireStorageReady: true,
|
||||
StorageReadyWaitSeconds: 420,
|
||||
StorageReadyPollSeconds: 5,
|
||||
StorageMinReadyNodes: 2,
|
||||
APIWaitSeconds: 1200,
|
||||
APIPollSeconds: 2,
|
||||
ShutdownCooldownSeconds: 45,
|
||||
RequireNodeInventoryReach: true,
|
||||
NodeInventoryReachWaitSeconds: 300,
|
||||
NodeInventoryReachPollSeconds: 5,
|
||||
RequireTimeSync: true,
|
||||
TimeSyncWaitSeconds: 240,
|
||||
TimeSyncPollSeconds: 5,
|
||||
TimeSyncMode: "quorum",
|
||||
TimeSyncQuorum: 2,
|
||||
ReconcileAccessOnBoot: true,
|
||||
AutoEtcdRestoreOnAPIFailure: true,
|
||||
EtcdRestoreControlPlane: "titan-0a",
|
||||
RequireStorageReady: true,
|
||||
StorageReadyWaitSeconds: 420,
|
||||
StorageReadyPollSeconds: 5,
|
||||
StorageMinReadyNodes: 2,
|
||||
StorageCriticalPVCs: []string{
|
||||
"vault/data-vault-0",
|
||||
"postgres/postgres-data-postgres-0",
|
||||
@ -92,36 +91,33 @@ func defaults() Config {
|
||||
AdminSecretUsernameKey: "username",
|
||||
AdminSecretPasswordKey: "password",
|
||||
},
|
||||
ServiceChecklist: defaultServiceChecklist(),
|
||||
RequireCriticalServiceEndpoints: true,
|
||||
CriticalServiceEndpointWaitSec: 420,
|
||||
CriticalServiceEndpointPollSec: 5,
|
||||
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
||||
RequireIngressChecklist: true,
|
||||
IngressChecklistWaitSeconds: 420,
|
||||
IngressChecklistPollSeconds: 5,
|
||||
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
||||
IngressChecklistIgnoreHosts: []string{},
|
||||
RequireNodeSSHAuth: true,
|
||||
NodeSSHAuthWaitSeconds: 240,
|
||||
NodeSSHAuthPollSeconds: 5,
|
||||
NodeSSHAuthRequiredNodes: []string{},
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
FluxHealthPollSeconds: 5,
|
||||
FluxHealthRequiredKustomizations: []string{},
|
||||
IgnoreFluxKustomizations: []string{},
|
||||
RequireWorkloadConvergence: true,
|
||||
WorkloadConvergenceWaitSeconds: 900,
|
||||
WorkloadConvergencePollSeconds: 5,
|
||||
WorkloadConvergenceRequiredNamespaces: []string{},
|
||||
IgnoreWorkloadNamespaces: []string{},
|
||||
IgnoreWorkloads: []string{},
|
||||
IgnoreUnavailableNodes: []string{},
|
||||
AutoRecycleStuckPods: true,
|
||||
StuckPodGraceSeconds: 180,
|
||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||
VaultUnsealBreakglassTimeout: 15,
|
||||
ServiceChecklist: defaultServiceChecklist(),
|
||||
RequireCriticalServiceEndpoints: true,
|
||||
CriticalServiceEndpointWaitSec: 420,
|
||||
CriticalServiceEndpointPollSec: 5,
|
||||
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
||||
RequireIngressChecklist: true,
|
||||
IngressChecklistWaitSeconds: 420,
|
||||
IngressChecklistPollSeconds: 5,
|
||||
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
||||
IngressChecklistIgnoreHosts: []string{},
|
||||
RequireNodeSSHAuth: true,
|
||||
NodeSSHAuthWaitSeconds: 240,
|
||||
NodeSSHAuthPollSeconds: 5,
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
FluxHealthPollSeconds: 5,
|
||||
IgnoreFluxKustomizations: []string{},
|
||||
RequireWorkloadConvergence: true,
|
||||
WorkloadConvergenceWaitSeconds: 900,
|
||||
WorkloadConvergencePollSeconds: 5,
|
||||
IgnoreWorkloadNamespaces: []string{},
|
||||
IgnoreWorkloads: []string{},
|
||||
IgnoreUnavailableNodes: []string{},
|
||||
AutoRecycleStuckPods: true,
|
||||
StuckPodGraceSeconds: 180,
|
||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||
VaultUnsealBreakglassTimeout: 15,
|
||||
},
|
||||
Shutdown: Shutdown{
|
||||
DefaultBudgetSeconds: 1380,
|
||||
|
||||
@ -51,41 +51,3 @@ startup:
|
||||
t.Fatalf("expected validation failure")
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step.
|
||||
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
|
||||
// Why: host recovery configs must be able to keep a narrow, explicit checklist
|
||||
// without silently inheriting the full default service catalog.
|
||||
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
|
||||
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
|
||||
raw := `
|
||||
control_planes: [titan-0a]
|
||||
expected_flux_branch: main
|
||||
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
iac_repo_path: /opt/titan-iac
|
||||
startup:
|
||||
service_checklist_explicit_only: true
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
accepted_statuses: [200]
|
||||
body_contains: pass
|
||||
timeout_seconds: 12
|
||||
ups:
|
||||
enabled: false
|
||||
`
|
||||
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
|
||||
t.Fatalf("write config: %v", err)
|
||||
}
|
||||
|
||||
cfg, err := Load(cfgPath)
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if len(cfg.Startup.ServiceChecklist) != 1 {
|
||||
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
|
||||
}
|
||||
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
|
||||
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
|
||||
}
|
||||
}
|
||||
|
||||
@ -27,75 +27,65 @@ type Config struct {
|
||||
}
|
||||
|
||||
type Startup struct {
|
||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||
PostStartProbes []string `yaml:"post_start_probes"`
|
||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
||||
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
|
||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
||||
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
||||
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
||||
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
||||
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
||||
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
||||
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
||||
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
|
||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
|
||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
|
||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||
AutoQuarantineSchedulingStorms bool `yaml:"auto_quarantine_scheduling_storms"`
|
||||
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
|
||||
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
|
||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
|
||||
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
|
||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||
PostStartProbes []string `yaml:"post_start_probes"`
|
||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
||||
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
||||
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
||||
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
||||
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
||||
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
||||
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
||||
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||
}
|
||||
|
||||
type ServiceChecklistCheck struct {
|
||||
@ -146,7 +136,6 @@ type UPS struct {
|
||||
Targets []UPSTarget `yaml:"targets"`
|
||||
PollSeconds int `yaml:"poll_seconds"`
|
||||
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
||||
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
|
||||
DebounceCount int `yaml:"debounce_count"`
|
||||
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
||||
}
|
||||
|
||||
@ -61,11 +61,6 @@ func (c Config) Validate() error {
|
||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
|
||||
}
|
||||
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||
@ -238,46 +233,21 @@ func (c Config) Validate() error {
|
||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
||||
}
|
||||
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
||||
}
|
||||
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
||||
item = strings.TrimSpace(item)
|
||||
if item == "" {
|
||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
|
||||
}
|
||||
if strings.Count(item, "/") != 1 {
|
||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
|
||||
}
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
||||
}
|
||||
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
||||
if strings.TrimSpace(ns) == "" {
|
||||
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
|
||||
}
|
||||
}
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.PostStartAutoHealSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
|
||||
}
|
||||
for _, probe := range c.Startup.PostStartProbes {
|
||||
if strings.TrimSpace(probe) == "" {
|
||||
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
||||
@ -307,16 +277,6 @@ func (c Config) Validate() error {
|
||||
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
||||
}
|
||||
}
|
||||
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
||||
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
|
||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
|
||||
}
|
||||
}
|
||||
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
||||
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
|
||||
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
|
||||
}
|
||||
}
|
||||
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
||||
@ -332,9 +292,6 @@ func (c Config) Validate() error {
|
||||
if c.UPS.Provider == "" {
|
||||
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
||||
}
|
||||
if c.UPS.OnBatteryGraceSeconds < 0 {
|
||||
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
|
||||
}
|
||||
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
||||
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
||||
}
|
||||
@ -349,14 +306,6 @@ func (c Config) Validate() error {
|
||||
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
||||
}
|
||||
}
|
||||
if c.Startup.AutoQuarantineSchedulingStorms {
|
||||
if c.Startup.SchedulingStormEventThreshold <= 0 {
|
||||
return fmt.Errorf("config.startup.scheduling_storm_event_threshold must be > 0 when auto_quarantine_scheduling_storms is enabled")
|
||||
}
|
||||
if c.Startup.SchedulingStormWindowSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.scheduling_storm_window_seconds must be > 0 when auto_quarantine_scheduling_storms is enabled")
|
||||
}
|
||||
}
|
||||
for _, peer := range c.Coordination.PeerHosts {
|
||||
if strings.TrimSpace(peer) == "" {
|
||||
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
|
||||
@ -379,20 +328,3 @@ func (c Config) Validate() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// containsTrimmed runs one orchestration or CLI step.
|
||||
// Signature: containsTrimmed(entries []string, needle string) bool.
|
||||
// Why: startup config now supports both required and ignored recovery scopes, so
|
||||
// validation needs a single normalized overlap check for those lists.
|
||||
func containsTrimmed(entries []string, needle string) bool {
|
||||
needle = strings.TrimSpace(needle)
|
||||
if needle == "" {
|
||||
return false
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if strings.TrimSpace(entry) == needle {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@ -30,7 +30,6 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
||||
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
|
||||
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
||||
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
||||
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
|
||||
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
||||
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
||||
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
||||
@ -69,42 +68,19 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
||||
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
|
||||
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
|
||||
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
|
||||
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
|
||||
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
|
||||
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
|
||||
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
|
||||
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
|
||||
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
|
||||
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
||||
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
|
||||
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
||||
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
|
||||
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
|
||||
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
||||
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
||||
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
||||
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
|
||||
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
|
||||
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
|
||||
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
|
||||
}},
|
||||
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
|
||||
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
||||
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
|
||||
}},
|
||||
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
|
||||
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
||||
{"bad_scheduling_storm_threshold", func(c *Config) {
|
||||
c.Startup.AutoQuarantineSchedulingStorms = true
|
||||
c.Startup.SchedulingStormEventThreshold = 0
|
||||
}},
|
||||
{"bad_scheduling_storm_window", func(c *Config) {
|
||||
c.Startup.AutoQuarantineSchedulingStorms = true
|
||||
c.Startup.SchedulingStormWindowSeconds = 0
|
||||
}},
|
||||
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
||||
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
|
||||
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
|
||||
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
|
||||
{"bad_ups_targets_item_empty", func(c *Config) {
|
||||
c.UPS.Enabled = true
|
||||
@ -145,13 +121,6 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
|
||||
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
||||
t.Fatalf("expected startup defaults to be set")
|
||||
}
|
||||
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
||||
t.Fatalf("expected post-start auto-heal defaults to be set")
|
||||
}
|
||||
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
|
||||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||
t.Fatalf("expected startup recovery scope slices to be initialized")
|
||||
}
|
||||
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
|
||||
t.Fatalf("expected critical service endpoint timing defaults to be set")
|
||||
}
|
||||
|
||||
@ -32,8 +32,6 @@ type Daemon struct {
|
||||
targets []Target
|
||||
log *log.Logger
|
||||
exporter *metrics.Exporter
|
||||
|
||||
postStartAutoHealOverride func(context.Context) error
|
||||
}
|
||||
|
||||
var sshConfigCandidates = []string{
|
||||
@ -94,9 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
|
||||
lastGood := map[string]time.Time{}
|
||||
lastOnBattery := map[string]bool{}
|
||||
onBatterySince := map[string]time.Time{}
|
||||
breachCount := map[string]int{}
|
||||
lastAutoHeal := time.Time{}
|
||||
for _, t := range d.targets {
|
||||
lastGood[t.Name] = time.Now()
|
||||
}
|
||||
@ -111,16 +107,12 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
case <-t.C:
|
||||
budget := d.orch.EstimatedEmergencyShutdownSeconds()
|
||||
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
||||
anyOnBattery := false
|
||||
|
||||
d.exporter.UpdateBudget(budget)
|
||||
|
||||
for _, target := range d.targets {
|
||||
sample, err := target.Provider.Read(ctx)
|
||||
if err != nil {
|
||||
if lastOnBattery[target.Name] {
|
||||
anyOnBattery = true
|
||||
}
|
||||
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
|
||||
d.exporter.UpdateSample(metrics.Sample{
|
||||
Name: target.Name,
|
||||
@ -139,45 +131,17 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
}
|
||||
|
||||
lastGood[target.Name] = time.Now()
|
||||
if sample.OnBattery {
|
||||
anyOnBattery = true
|
||||
}
|
||||
wasOnBattery := lastOnBattery[target.Name]
|
||||
if sample.OnBattery {
|
||||
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
|
||||
onBatterySince[target.Name] = time.Now()
|
||||
}
|
||||
} else {
|
||||
onBatterySince[target.Name] = time.Time{}
|
||||
}
|
||||
lastOnBattery[target.Name] = sample.OnBattery
|
||||
|
||||
onBatteryElapsed := 0
|
||||
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
|
||||
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
|
||||
}
|
||||
|
||||
trigger := false
|
||||
triggerReason := ""
|
||||
switch {
|
||||
case sample.LowBattery:
|
||||
trigger = true
|
||||
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
|
||||
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
|
||||
trigger = true
|
||||
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
|
||||
trigger = true
|
||||
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
|
||||
}
|
||||
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
|
||||
if trigger {
|
||||
breachCount[target.Name]++
|
||||
} else {
|
||||
breachCount[target.Name] = 0
|
||||
}
|
||||
|
||||
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||
|
||||
d.exporter.UpdateSample(metrics.Sample{
|
||||
Name: target.Name,
|
||||
@ -196,54 +160,14 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
})
|
||||
|
||||
if breachCount[target.Name] >= debounce {
|
||||
return d.triggerShutdown(ctx, triggerReason)
|
||||
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||
return d.triggerShutdown(ctx, reason)
|
||||
}
|
||||
}
|
||||
|
||||
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// maybeRunPostStartAutoHeal runs one orchestration or CLI step.
|
||||
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
|
||||
// Why: gives the long-running daemon a bounded path to repair post-start drift
|
||||
// like a later Vault reseal or stale dead-node deletions without waiting for a
|
||||
// fresh bootstrap run.
|
||||
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
|
||||
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
|
||||
if interval <= 0 || anyOnBattery {
|
||||
return
|
||||
}
|
||||
if d.orch == nil && d.postStartAutoHealOverride == nil {
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
|
||||
return
|
||||
}
|
||||
if lastRun != nil {
|
||||
*lastRun = now
|
||||
}
|
||||
if err := d.runPostStartAutoHeal(ctx); err != nil {
|
||||
d.log.Printf("warning: post-start auto-heal: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// runPostStartAutoHeal runs one orchestration or CLI step.
|
||||
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
|
||||
// Why: keeps the daemon loop readable while allowing unit tests to inject a
|
||||
// deterministic repair hook without a live cluster.
|
||||
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
|
||||
if d.postStartAutoHealOverride != nil {
|
||||
return d.postStartAutoHealOverride(ctx)
|
||||
}
|
||||
if d.orch == nil {
|
||||
return nil
|
||||
}
|
||||
return d.orch.RunPostStartAutoHeal(ctx)
|
||||
}
|
||||
|
||||
// triggerShutdown runs one orchestration or CLI step.
|
||||
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
|
||||
@ -165,50 +165,6 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace runs one orchestration or CLI step.
|
||||
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
|
||||
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
|
||||
// the only path to a graceful shutdown during abrupt power loss.
|
||||
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
|
||||
stateDir := t.TempDir()
|
||||
orch := newDaemonTestOrchestrator(t, stateDir)
|
||||
d := &Daemon{
|
||||
cfg: config.Config{
|
||||
UPS: config.UPS{
|
||||
Enabled: true,
|
||||
PollSeconds: 1,
|
||||
DebounceCount: 1,
|
||||
RuntimeSafetyFactor: 1.0,
|
||||
OnBatteryGraceSeconds: 1,
|
||||
},
|
||||
State: config.State{
|
||||
IntentPath: filepath.Join(stateDir, "intent.json"),
|
||||
},
|
||||
Shutdown: config.Shutdown{
|
||||
EmergencySkipDrain: true,
|
||||
EmergencySkipEtcd: true,
|
||||
},
|
||||
},
|
||||
orch: orch,
|
||||
targets: []Target{
|
||||
{
|
||||
Name: "Pyrphoros",
|
||||
Target: "pyrphoros@localhost",
|
||||
Provider: &daemonFakeProvider{
|
||||
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
log: log.New(io.Discard, "", 0),
|
||||
exporter: metrics.New(),
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := d.Run(ctx); err != nil {
|
||||
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
|
||||
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
|
||||
// Why: covers forward-shutdown SSH execution path.
|
||||
|
||||
@ -1,51 +0,0 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
)
|
||||
|
||||
// TestDaemonMaybeRunPostStartAutoHeal runs one orchestration or CLI step.
|
||||
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
|
||||
// Why: covers the daemon-side interval and on-battery guards for the new
|
||||
// post-start repair loop.
|
||||
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
|
||||
calls := 0
|
||||
d := &Daemon{
|
||||
cfg: config.Config{
|
||||
Startup: config.Startup{
|
||||
PostStartAutoHealSeconds: 10,
|
||||
},
|
||||
},
|
||||
postStartAutoHealOverride: func(context.Context) error {
|
||||
calls++
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
var last time.Time
|
||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
||||
if calls != 1 {
|
||||
t.Fatalf("expected first auto-heal invocation, got %d", calls)
|
||||
}
|
||||
|
||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
||||
if calls != 1 {
|
||||
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
|
||||
}
|
||||
|
||||
last = time.Now().Add(-11 * time.Second)
|
||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
|
||||
if calls != 1 {
|
||||
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
|
||||
}
|
||||
|
||||
last = time.Now().Add(-11 * time.Second)
|
||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
||||
if calls != 2 {
|
||||
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
|
||||
}
|
||||
}
|
||||
@ -22,23 +22,12 @@ type Intent struct {
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
var (
|
||||
readIntentImpl = readIntentDefault
|
||||
writeIntentImpl = writeIntentDefault
|
||||
)
|
||||
var writeIntentImpl = writeIntentDefault
|
||||
|
||||
// ReadIntent runs one orchestration or CLI step.
|
||||
// Signature: ReadIntent(path string) (Intent, error).
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
func ReadIntent(path string) (Intent, error) {
|
||||
return readIntentImpl(path)
|
||||
}
|
||||
|
||||
// readIntentDefault runs one orchestration or CLI step.
|
||||
// Signature: readIntentDefault(path string) (Intent, error).
|
||||
// Why: keeps production read behavior available while tests can override intent
|
||||
// reads deterministically without racing background file mutations.
|
||||
func readIntentDefault(path string) (Intent, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
|
||||
@ -22,34 +22,6 @@ func TestHookWriteIntentDefault(path string, in Intent) error {
|
||||
return writeIntentDefault(path, in)
|
||||
}
|
||||
|
||||
// TestHookReadIntentDefault runs one orchestration or CLI step.
|
||||
// Signature: TestHookReadIntentDefault(path string) (Intent, error).
|
||||
// Why: lets top-level tests delegate to production ReadIntent behavior while
|
||||
// selectively forcing deterministic read sequences for lifecycle branches.
|
||||
func TestHookReadIntentDefault(path string) (Intent, error) {
|
||||
return readIntentDefault(path)
|
||||
}
|
||||
|
||||
// TestHookSetReadIntentOverride runs one orchestration or CLI step.
|
||||
// Signature: TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()).
|
||||
// Why: enables deterministic intent-read failure injection without sleeping
|
||||
// goroutines that race slower CI agents.
|
||||
func TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()) {
|
||||
testHookOverrideMu.Lock()
|
||||
prev := readIntentImpl
|
||||
if fn == nil {
|
||||
readIntentImpl = readIntentDefault
|
||||
} else {
|
||||
readIntentImpl = fn
|
||||
}
|
||||
testHookOverrideMu.Unlock()
|
||||
return func() {
|
||||
testHookOverrideMu.Lock()
|
||||
readIntentImpl = prev
|
||||
testHookOverrideMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// TestHookSetWriteIntentOverride runs one orchestration or CLI step.
|
||||
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
|
||||
// Why: enables deterministic intent-write failure injection from the top-level
|
||||
|
||||
@ -1,116 +0,0 @@
|
||||
# Binary, config template, and systemd artifact helpers for the installer.
|
||||
|
||||
resolve_build_target() {
|
||||
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
|
||||
echo "./cmd/ananke"
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
install_config_template() {
|
||||
local template="$1"
|
||||
local dest="$2"
|
||||
local src legacy
|
||||
local -a modern_candidates=()
|
||||
local -a legacy_candidates=()
|
||||
|
||||
case "${template}" in
|
||||
coordinator)
|
||||
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
|
||||
legacy_candidates=("configs/hecate.titan-db.yaml")
|
||||
;;
|
||||
peer)
|
||||
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
|
||||
legacy_candidates=("configs/hecate.tethys.yaml")
|
||||
;;
|
||||
example)
|
||||
modern_candidates=("configs/ananke.example.yaml")
|
||||
legacy_candidates=("configs/hecate.example.yaml")
|
||||
;;
|
||||
*)
|
||||
echo "[install] unknown config template key: ${template}" >&2
|
||||
return 1
|
||||
;;
|
||||
esac
|
||||
|
||||
for src in "${modern_candidates[@]}"; do
|
||||
if [[ -f "${src}" ]]; then
|
||||
install -m 0640 "${src}" "${dest}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
for legacy in "${legacy_candidates[@]}"; do
|
||||
if [[ -f "${legacy}" ]]; then
|
||||
src="$(mktemp)"
|
||||
legacy_path_rewrite "${legacy}" "${src}"
|
||||
install -m 0640 "${src}" "${dest}"
|
||||
rm -f "${src}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
install_systemd_units() {
|
||||
local tmp
|
||||
|
||||
while IFS='|' read -r target_name modern_name legacy_name; do
|
||||
local modern_src="deploy/systemd/${modern_name}"
|
||||
local legacy_src="deploy/systemd/${legacy_name}"
|
||||
local target="${SYSTEMD_DIR}/${target_name}"
|
||||
|
||||
if [[ -f "${modern_src}" ]]; then
|
||||
install -m 0644 "${modern_src}" "${target}"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ -f "${legacy_src}" ]]; then
|
||||
tmp="$(mktemp)"
|
||||
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
||||
install -m 0644 "${tmp}" "${target}"
|
||||
rm -f "${tmp}"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
|
||||
return 1
|
||||
done <<'EOF_UNITS'
|
||||
ananke.service|ananke.service|hecate.service
|
||||
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
|
||||
ananke-update.service|ananke-update.service|hecate-update.service
|
||||
ananke-update.timer|ananke-update.timer|hecate-update.timer
|
||||
EOF_UNITS
|
||||
}
|
||||
|
||||
install_self_update_script() {
|
||||
local modern_src="scripts/ananke-self-update.sh"
|
||||
local legacy_src="scripts/hecate-self-update.sh"
|
||||
local target="${LIB_DIR}/ananke-self-update.sh"
|
||||
local tmp
|
||||
|
||||
if [[ -f "${modern_src}" ]]; then
|
||||
install -m 0755 "${modern_src}" "${target}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -f "${legacy_src}" ]]; then
|
||||
tmp="$(mktemp)"
|
||||
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
||||
sed -Ei \
|
||||
-e 's/HECATE_/ANANKE_/g' \
|
||||
-e 's/hecate-self-update/ananke-self-update/g' \
|
||||
-e 's#/opt/hecate#/opt/ananke#g' \
|
||||
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
|
||||
"${tmp}"
|
||||
install -m 0755 "${tmp}" "${target}"
|
||||
rm -f "${tmp}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
echo "[install] missing both modern and legacy self-update scripts." >&2
|
||||
return 1
|
||||
}
|
||||
@ -1,334 +0,0 @@
|
||||
# Config migration helpers for the Ananke host installer.
|
||||
|
||||
read_ananke_role() {
|
||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||
echo "coordinator"
|
||||
return 0
|
||||
fi
|
||||
local role
|
||||
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
||||
if [[ -z "${role}" ]]; then
|
||||
role="coordinator"
|
||||
fi
|
||||
echo "${role}"
|
||||
}
|
||||
|
||||
migration_yaml_lookup() {
|
||||
local key="$1"
|
||||
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||
}
|
||||
|
||||
first_control_plane_name() {
|
||||
awk '
|
||||
/^control_planes:[[:space:]]*$/ {in_list=1; next}
|
||||
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
|
||||
in_list && /^[^[:space:]]/ {in_list=0}
|
||||
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||
}
|
||||
|
||||
lookup_node_host() {
|
||||
local node="$1"
|
||||
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||
}
|
||||
|
||||
migrate_ananke_config() {
|
||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
local changed=0
|
||||
local role_hint
|
||||
role_hint="$(read_ananke_role)"
|
||||
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] migrated ssh_node_users titan-24 override to atlas"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei \
|
||||
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
|
||||
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
|
||||
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
|
||||
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
|
||||
"${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] removed deprecated host-poweroff shutdown config keys"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup node inventory reachability gate defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added state.reports_dir default"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
|
||||
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
local peer_host
|
||||
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
||||
if [[ -n "${peer_host}" ]]; then
|
||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
|
||||
changed=1
|
||||
fi
|
||||
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
|
||||
changed=1
|
||||
else
|
||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.peer_hosts empty default"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
local default_restore_cp
|
||||
default_restore_cp="$(first_control_plane_name)"
|
||||
if [[ -z "${default_restore_cp}" ]]; then
|
||||
default_restore_cp="titan-0a"
|
||||
fi
|
||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup time sync + access reconciliation defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup time sync quorum defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup storage readiness defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup post-start probe + vault key fallback defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
||||
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup.vault_unseal_key_file default"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup break-glass fallback defaults"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
|
||||
install_cluster_inventory_defaults "${role_hint}" && changed=1
|
||||
|
||||
if [[ "${changed}" -eq 1 ]]; then
|
||||
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
|
||||
fi
|
||||
}
|
||||
|
||||
install_cluster_inventory_defaults() {
|
||||
local role="$1"
|
||||
local changed=0
|
||||
local inventory_block=""
|
||||
local managed_block=""
|
||||
local workers_block
|
||||
workers_block='workers:
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
|
||||
if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
|
||||
inventory_block='ssh_node_hosts:
|
||||
titan-db: 192.168.22.10
|
||||
titan-0a: 192.168.22.11
|
||||
titan-0b: 192.168.22.12
|
||||
titan-0c: 192.168.22.13
|
||||
titan-04: 192.168.22.30
|
||||
titan-05: 192.168.22.31
|
||||
titan-06: 192.168.22.32
|
||||
titan-07: 192.168.22.33
|
||||
titan-08: 192.168.22.34
|
||||
titan-09: 192.168.22.35
|
||||
titan-10: 192.168.22.36
|
||||
titan-11: 192.168.22.37
|
||||
titan-12: 192.168.22.40
|
||||
titan-13: 192.168.22.41
|
||||
titan-14: 192.168.22.42
|
||||
titan-15: 192.168.22.43
|
||||
titan-17: 192.168.22.45
|
||||
titan-18: 192.168.22.46
|
||||
titan-19: 192.168.22.47
|
||||
titan-20: 192.168.22.20
|
||||
titan-21: 192.168.22.21
|
||||
titan-22: 192.168.22.22
|
||||
titan-24: 192.168.22.26'
|
||||
managed_block='ssh_managed_nodes:
|
||||
- titan-db
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
fi
|
||||
|
||||
if [[ -n "${inventory_block}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
if [[ -n "${managed_block}" ]]; then
|
||||
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "${role}" == "peer" ]]; then
|
||||
install_peer_inventory_defaults && changed=1
|
||||
fi
|
||||
|
||||
[[ "${changed}" -eq 1 ]]
|
||||
}
|
||||
|
||||
install_peer_inventory_defaults() {
|
||||
local changed=0
|
||||
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||
changed=1
|
||||
fi
|
||||
[[ "${changed}" -eq 1 ]]
|
||||
}
|
||||
|
||||
sanitize_migrated_ananke_config() {
|
||||
local cfg="${CONF_DIR}/ananke.yaml"
|
||||
[[ -f "${cfg}" ]] || return 0
|
||||
|
||||
local tmp changed=0
|
||||
tmp="$(mktemp)"
|
||||
|
||||
# If a legacy migration bug appended root-level node entries after
|
||||
# ssh_managed_nodes, drop those orphan entries until the next top-level key.
|
||||
awk '
|
||||
BEGIN {in_managed=0}
|
||||
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
|
||||
{
|
||||
if (in_managed) {
|
||||
if ($0 ~ /^ - /) {print; next}
|
||||
if ($0 ~ /^- /) {next}
|
||||
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
|
||||
}
|
||||
print
|
||||
}
|
||||
' "${cfg}" > "${tmp}"
|
||||
|
||||
if ! cmp -s "${cfg}" "${tmp}"; then
|
||||
mv "${tmp}" "${cfg}"
|
||||
changed=1
|
||||
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
|
||||
else
|
||||
rm -f "${tmp}"
|
||||
fi
|
||||
|
||||
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
|
||||
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
|
||||
changed=1
|
||||
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
|
||||
fi
|
||||
|
||||
if [[ "${changed}" -eq 1 ]]; then
|
||||
chmod 0640 "${cfg}" || true
|
||||
fi
|
||||
}
|
||||
@ -1,239 +0,0 @@
|
||||
# Host bootstrap helpers for the Ananke installer.
|
||||
|
||||
resolve_nut_ups_name() {
|
||||
if [[ -n "${NUT_UPS_NAME}" ]]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||
local target=""
|
||||
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
|
||||
if [[ -n "${target}" ]]; then
|
||||
NUT_UPS_NAME="${target%@localhost}"
|
||||
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
NUT_UPS_NAME="pyrphoros"
|
||||
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
||||
}
|
||||
|
||||
ensure_ananke_kubeconfig() {
|
||||
local kubeconfig_path
|
||||
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
|
||||
if [[ -z "${kubeconfig_path}" ]]; then
|
||||
kubeconfig_path="/etc/ananke/kubeconfig"
|
||||
fi
|
||||
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
|
||||
|
||||
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
|
||||
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
|
||||
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
|
||||
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
|
||||
cp_name="$(first_control_plane_name)"
|
||||
if [[ -z "${cp_name}" ]]; then
|
||||
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
|
||||
return 0
|
||||
fi
|
||||
cp_host="$(lookup_node_host "${cp_name}")"
|
||||
if [[ -z "${cp_host}" ]]; then
|
||||
cp_host="${cp_name}"
|
||||
fi
|
||||
ssh_user="$(migration_yaml_lookup "ssh_user")"
|
||||
ssh_port="$(migration_yaml_lookup "ssh_port")"
|
||||
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
|
||||
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
|
||||
if [[ -z "${ssh_port}" ]]; then
|
||||
ssh_port="2277"
|
||||
fi
|
||||
|
||||
local target
|
||||
target="${cp_host}"
|
||||
if [[ -n "${ssh_user}" ]]; then
|
||||
target="${ssh_user}@${cp_host}"
|
||||
fi
|
||||
local ssh_args=(
|
||||
-o BatchMode=yes
|
||||
-o ConnectTimeout=8
|
||||
-o StrictHostKeyChecking=accept-new
|
||||
)
|
||||
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
|
||||
ssh_args+=(-F "${ssh_cfg}")
|
||||
fi
|
||||
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
|
||||
ssh_args+=(-i "${ssh_key}")
|
||||
fi
|
||||
if [[ -n "${ssh_port}" ]]; then
|
||||
ssh_args+=(-p "${ssh_port}")
|
||||
fi
|
||||
|
||||
local remote_cfg
|
||||
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
|
||||
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
|
||||
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
|
||||
chmod 0600 "${kubeconfig_path}"
|
||||
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
|
||||
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
else
|
||||
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
|
||||
fi
|
||||
|
||||
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
|
||||
}
|
||||
|
||||
ensure_ananke_ssh_identity() {
|
||||
local key_path key_dir key_user key_comment
|
||||
key_path="$(migration_yaml_lookup "ssh_identity_file")"
|
||||
if [[ -z "${key_path}" ]]; then
|
||||
key_path="/home/atlas/.ssh/id_ed25519"
|
||||
fi
|
||||
key_dir="$(dirname "${key_path}")"
|
||||
key_comment="ananke-$(hostname)-forward"
|
||||
|
||||
key_user="root"
|
||||
if [[ "${key_path}" == /home/*/* ]]; then
|
||||
key_user="${key_path#/home/}"
|
||||
key_user="${key_user%%/*}"
|
||||
fi
|
||||
|
||||
if ! id "${key_user}" >/dev/null 2>&1; then
|
||||
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
|
||||
if [[ ! -s "${key_path}" ]]; then
|
||||
echo "[install] generating missing SSH identity at ${key_path}"
|
||||
if [[ "${key_user}" == "root" ]]; then
|
||||
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
||||
else
|
||||
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
||||
fi
|
||||
fi
|
||||
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
|
||||
chmod 0600 "${key_path}" || true
|
||||
chmod 0644 "${key_path}.pub" || true
|
||||
}
|
||||
|
||||
ensure_apt_packages() {
|
||||
local missing=()
|
||||
for pkg in "$@"; do
|
||||
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
|
||||
missing+=("${pkg}")
|
||||
fi
|
||||
done
|
||||
if [[ ${#missing[@]} -eq 0 ]]; then
|
||||
return 0
|
||||
fi
|
||||
echo "[install] apt install: ${missing[*]}"
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
apt-get update -y
|
||||
apt-get install -y "${missing[@]}"
|
||||
}
|
||||
|
||||
install_kubectl_if_missing() {
|
||||
if command -v kubectl >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
ensure_apt_packages kubernetes-client || true
|
||||
if command -v kubectl >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
echo "[install] installing kubectl via upstream binary"
|
||||
local arch
|
||||
arch="$(uname -m)"
|
||||
case "${arch}" in
|
||||
x86_64) arch="amd64" ;;
|
||||
aarch64|arm64) arch="arm64" ;;
|
||||
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
|
||||
esac
|
||||
local version
|
||||
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
|
||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
|
||||
chmod 0755 /usr/local/bin/kubectl
|
||||
}
|
||||
|
||||
ensure_dependencies() {
|
||||
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
|
||||
echo "[install] skipping dependency installation"
|
||||
return 0
|
||||
fi
|
||||
if ! command -v apt-get >/dev/null 2>&1; then
|
||||
echo "This installer currently supports apt-based hosts only." >&2
|
||||
exit 1
|
||||
fi
|
||||
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
|
||||
install_kubectl_if_missing
|
||||
}
|
||||
|
||||
configure_nut() {
|
||||
if [[ "${MANAGE_NUT}" != "1" ]]; then
|
||||
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
|
||||
return 0
|
||||
fi
|
||||
|
||||
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
|
||||
install -d -m 0755 /etc/nut /etc/udev/rules.d
|
||||
|
||||
cat > /etc/nut/nut.conf <<EOF
|
||||
MODE=standalone
|
||||
EOF
|
||||
|
||||
cat > /etc/nut/ups.conf <<EOF
|
||||
[${NUT_UPS_NAME}]
|
||||
driver = usbhid-ups
|
||||
port = auto
|
||||
vendorid = ${NUT_VENDOR_ID}
|
||||
productid = ${NUT_PRODUCT_ID}
|
||||
pollinterval = 5
|
||||
EOF
|
||||
|
||||
cat > /etc/nut/upsd.users <<EOF
|
||||
[${NUT_MONITOR_USER}]
|
||||
password = ${NUT_MONITOR_PASSWORD}
|
||||
upsmon primary
|
||||
EOF
|
||||
chmod 0640 /etc/nut/upsd.users
|
||||
if getent group nut >/dev/null 2>&1; then
|
||||
chown root:nut /etc/nut/upsd.users
|
||||
else
|
||||
chown root:root /etc/nut/upsd.users
|
||||
fi
|
||||
|
||||
cat > /etc/nut/upsmon.conf <<EOF
|
||||
RUN_AS_USER nut
|
||||
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
|
||||
MINSUPPLIES 1
|
||||
SHUTDOWNCMD "/sbin/shutdown -h +0"
|
||||
POLLFREQ 5
|
||||
POLLFREQALERT 5
|
||||
HOSTSYNC 15
|
||||
DEADTIME 15
|
||||
POWERDOWNFLAG /etc/killpower
|
||||
EOF
|
||||
|
||||
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
|
||||
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
|
||||
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
|
||||
EOF
|
||||
|
||||
udevadm control --reload-rules || true
|
||||
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
|
||||
|
||||
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
||||
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
|
||||
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
|
||||
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
||||
}
|
||||
@ -1,98 +0,0 @@
|
||||
# Legacy Hecate migration helpers for the Ananke installer.
|
||||
|
||||
legacy_path_rewrite() {
|
||||
local src="$1"
|
||||
local dst="$2"
|
||||
sed \
|
||||
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
|
||||
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
|
||||
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
|
||||
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
|
||||
-e 's#/opt/hecate#/opt/ananke#g' \
|
||||
-e 's#/etc/hecate#/etc/ananke#g' \
|
||||
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
|
||||
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
|
||||
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
|
||||
-e 's/hecate.yaml/ananke.yaml/g' \
|
||||
-e 's/hecate.lock/ananke.lock/g' \
|
||||
-e 's/hecate/ananke/g' \
|
||||
-e 's/Hecate/Ananke/g' \
|
||||
-e 's#hecate\.lock#ananke.lock#g' \
|
||||
"${src}" > "${dst}"
|
||||
}
|
||||
|
||||
migrate_legacy_hecate_install() {
|
||||
local legacy_conf_dir="/etc/hecate"
|
||||
local legacy_state_dir="/var/lib/hecate"
|
||||
local legacy_systemd_dir="/etc/systemd/system"
|
||||
|
||||
install -d -m 0750 "${CONF_DIR}"
|
||||
install -d -m 0750 "${STATE_DIR}"
|
||||
|
||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
|
||||
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
|
||||
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
|
||||
chmod 0640 "${CONF_DIR}/ananke.yaml"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
|
||||
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
|
||||
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
|
||||
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
|
||||
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
|
||||
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
|
||||
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
|
||||
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
|
||||
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
|
||||
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
|
||||
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
|
||||
fi
|
||||
|
||||
if [[ -d "${legacy_systemd_dir}" ]]; then
|
||||
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
|
||||
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
retire_legacy_hecate_install() {
|
||||
local ts backup_dir
|
||||
ts="$(date +%Y%m%d%H%M%S)"
|
||||
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
|
||||
|
||||
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
|
||||
systemctl stop hecate-update.service >/dev/null 2>&1 || true
|
||||
|
||||
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
|
||||
install -d -m 0750 "${backup_dir}"
|
||||
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
|
||||
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
|
||||
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
|
||||
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
|
||||
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
|
||||
echo "[install] backed up legacy hecate assets to ${backup_dir}"
|
||||
fi
|
||||
|
||||
rm -f \
|
||||
/etc/systemd/system/hecate.service \
|
||||
/etc/systemd/system/hecate-bootstrap.service \
|
||||
/etc/systemd/system/hecate-update.service \
|
||||
/etc/systemd/system/hecate-update.timer
|
||||
rm -f /usr/local/bin/hecate
|
||||
rm -rf /usr/local/lib/hecate
|
||||
rm -rf /opt/hecate
|
||||
rm -rf /etc/hecate
|
||||
rm -rf /var/lib/hecate
|
||||
}
|
||||
@ -41,10 +41,829 @@ while [[ $# -gt 0 ]]; do
|
||||
esac
|
||||
done
|
||||
|
||||
source "${REPO_DIR}/scripts/install-config-migration.sh"
|
||||
source "${REPO_DIR}/scripts/install-host-bootstrap.sh"
|
||||
source "${REPO_DIR}/scripts/install-legacy-migration.sh"
|
||||
source "${REPO_DIR}/scripts/install-artifacts.sh"
|
||||
resolve_nut_ups_name() {
|
||||
if [[ -n "${NUT_UPS_NAME}" ]]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||
local target=""
|
||||
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
|
||||
if [[ -n "${target}" ]]; then
|
||||
NUT_UPS_NAME="${target%@localhost}"
|
||||
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
NUT_UPS_NAME="pyrphoros"
|
||||
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
||||
}
|
||||
|
||||
read_ananke_role() {
|
||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||
echo "coordinator"
|
||||
return 0
|
||||
fi
|
||||
local role
|
||||
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
||||
if [[ -z "${role}" ]]; then
|
||||
role="coordinator"
|
||||
fi
|
||||
echo "${role}"
|
||||
}
|
||||
|
||||
migration_yaml_lookup() {
|
||||
local key="$1"
|
||||
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||
}
|
||||
|
||||
first_control_plane_name() {
|
||||
awk '
|
||||
/^control_planes:[[:space:]]*$/ {in_list=1; next}
|
||||
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
|
||||
in_list && /^[^[:space:]]/ {in_list=0}
|
||||
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||
}
|
||||
|
||||
lookup_node_host() {
|
||||
local node="$1"
|
||||
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||
}
|
||||
|
||||
ensure_ananke_kubeconfig() {
|
||||
local kubeconfig_path
|
||||
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
|
||||
if [[ -z "${kubeconfig_path}" ]]; then
|
||||
kubeconfig_path="/etc/ananke/kubeconfig"
|
||||
fi
|
||||
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
|
||||
|
||||
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
|
||||
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
|
||||
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
|
||||
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
|
||||
cp_name="$(first_control_plane_name)"
|
||||
if [[ -z "${cp_name}" ]]; then
|
||||
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
|
||||
return 0
|
||||
fi
|
||||
cp_host="$(lookup_node_host "${cp_name}")"
|
||||
if [[ -z "${cp_host}" ]]; then
|
||||
cp_host="${cp_name}"
|
||||
fi
|
||||
ssh_user="$(migration_yaml_lookup "ssh_user")"
|
||||
ssh_port="$(migration_yaml_lookup "ssh_port")"
|
||||
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
|
||||
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
|
||||
if [[ -z "${ssh_port}" ]]; then
|
||||
ssh_port="2277"
|
||||
fi
|
||||
|
||||
local target
|
||||
target="${cp_host}"
|
||||
if [[ -n "${ssh_user}" ]]; then
|
||||
target="${ssh_user}@${cp_host}"
|
||||
fi
|
||||
local ssh_args=(
|
||||
-o BatchMode=yes
|
||||
-o ConnectTimeout=8
|
||||
-o StrictHostKeyChecking=accept-new
|
||||
)
|
||||
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
|
||||
ssh_args+=(-F "${ssh_cfg}")
|
||||
fi
|
||||
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
|
||||
ssh_args+=(-i "${ssh_key}")
|
||||
fi
|
||||
if [[ -n "${ssh_port}" ]]; then
|
||||
ssh_args+=(-p "${ssh_port}")
|
||||
fi
|
||||
|
||||
local remote_cfg
|
||||
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
|
||||
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
|
||||
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
|
||||
chmod 0600 "${kubeconfig_path}"
|
||||
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
|
||||
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
else
|
||||
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
|
||||
fi
|
||||
|
||||
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
|
||||
}
|
||||
|
||||
ensure_ananke_ssh_identity() {
|
||||
local key_path key_dir key_user key_comment
|
||||
key_path="$(migration_yaml_lookup "ssh_identity_file")"
|
||||
if [[ -z "${key_path}" ]]; then
|
||||
key_path="/home/atlas/.ssh/id_ed25519"
|
||||
fi
|
||||
key_dir="$(dirname "${key_path}")"
|
||||
key_comment="ananke-$(hostname)-forward"
|
||||
|
||||
key_user="root"
|
||||
if [[ "${key_path}" == /home/*/* ]]; then
|
||||
key_user="${key_path#/home/}"
|
||||
key_user="${key_user%%/*}"
|
||||
fi
|
||||
|
||||
if ! id "${key_user}" >/dev/null 2>&1; then
|
||||
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
|
||||
if [[ ! -s "${key_path}" ]]; then
|
||||
echo "[install] generating missing SSH identity at ${key_path}"
|
||||
if [[ "${key_user}" == "root" ]]; then
|
||||
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
||||
else
|
||||
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
||||
fi
|
||||
fi
|
||||
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
|
||||
chmod 0600 "${key_path}" || true
|
||||
chmod 0644 "${key_path}.pub" || true
|
||||
}
|
||||
|
||||
migrate_ananke_config() {
|
||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
local changed=0
|
||||
local role_hint
|
||||
role_hint="$(read_ananke_role)"
|
||||
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] migrated ssh_node_users titan-24 override to atlas"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei \
|
||||
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
|
||||
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
|
||||
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
|
||||
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
|
||||
"${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] removed deprecated host-poweroff shutdown config keys"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup node inventory reachability gate defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added state.reports_dir default"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
|
||||
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
local peer_host
|
||||
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
||||
if [[ -n "${peer_host}" ]]; then
|
||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
|
||||
changed=1
|
||||
fi
|
||||
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
|
||||
changed=1
|
||||
else
|
||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added coordination.peer_hosts empty default"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
local default_restore_cp
|
||||
default_restore_cp="$(first_control_plane_name)"
|
||||
if [[ -z "${default_restore_cp}" ]]; then
|
||||
default_restore_cp="titan-0a"
|
||||
fi
|
||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup time sync + access reconciliation defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup time sync quorum defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup storage readiness defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup post-start probe + vault key fallback defaults"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
||||
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup.vault_unseal_key_file default"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
||||
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] added startup break-glass fallback defaults"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
|
||||
local role
|
||||
role="$(read_ananke_role)"
|
||||
local inventory_block
|
||||
local managed_block
|
||||
local workers_block
|
||||
workers_block='workers:
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
if [[ "${role}" == "coordinator" ]]; then
|
||||
inventory_block='ssh_node_hosts:
|
||||
titan-db: 192.168.22.10
|
||||
titan-0a: 192.168.22.11
|
||||
titan-0b: 192.168.22.12
|
||||
titan-0c: 192.168.22.13
|
||||
titan-04: 192.168.22.30
|
||||
titan-05: 192.168.22.31
|
||||
titan-06: 192.168.22.32
|
||||
titan-07: 192.168.22.33
|
||||
titan-08: 192.168.22.34
|
||||
titan-09: 192.168.22.35
|
||||
titan-10: 192.168.22.36
|
||||
titan-11: 192.168.22.37
|
||||
titan-12: 192.168.22.40
|
||||
titan-13: 192.168.22.41
|
||||
titan-14: 192.168.22.42
|
||||
titan-15: 192.168.22.43
|
||||
titan-17: 192.168.22.45
|
||||
titan-18: 192.168.22.46
|
||||
titan-19: 192.168.22.47
|
||||
titan-20: 192.168.22.20
|
||||
titan-21: 192.168.22.21
|
||||
titan-22: 192.168.22.22
|
||||
titan-24: 192.168.22.26'
|
||||
managed_block='ssh_managed_nodes:
|
||||
- titan-db
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
elif [[ "${role}" == "peer" ]]; then
|
||||
inventory_block='ssh_node_hosts:
|
||||
titan-db: 192.168.22.10
|
||||
titan-0a: 192.168.22.11
|
||||
titan-0b: 192.168.22.12
|
||||
titan-0c: 192.168.22.13
|
||||
titan-04: 192.168.22.30
|
||||
titan-05: 192.168.22.31
|
||||
titan-06: 192.168.22.32
|
||||
titan-07: 192.168.22.33
|
||||
titan-08: 192.168.22.34
|
||||
titan-09: 192.168.22.35
|
||||
titan-10: 192.168.22.36
|
||||
titan-11: 192.168.22.37
|
||||
titan-12: 192.168.22.40
|
||||
titan-13: 192.168.22.41
|
||||
titan-14: 192.168.22.42
|
||||
titan-15: 192.168.22.43
|
||||
titan-17: 192.168.22.45
|
||||
titan-18: 192.168.22.46
|
||||
titan-19: 192.168.22.47
|
||||
titan-20: 192.168.22.20
|
||||
titan-21: 192.168.22.21
|
||||
titan-22: 192.168.22.22
|
||||
titan-24: 192.168.22.26'
|
||||
managed_block='ssh_managed_nodes:
|
||||
- titan-db
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
fi
|
||||
|
||||
if [[ -n "${inventory_block}" ]]; then
|
||||
if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
if [[ -n "${managed_block}" ]]; then
|
||||
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "${role}" == "peer" ]]; then
|
||||
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
|
||||
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
|
||||
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
|
||||
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
if [[ "${changed}" -eq 1 ]]; then
|
||||
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
|
||||
fi
|
||||
}
|
||||
|
||||
sanitize_migrated_ananke_config() {
|
||||
local cfg="${CONF_DIR}/ananke.yaml"
|
||||
[[ -f "${cfg}" ]] || return 0
|
||||
|
||||
local tmp changed=0
|
||||
tmp="$(mktemp)"
|
||||
|
||||
# Legacy migration bug guard:
|
||||
# If root-level "- node" entries were accidentally appended after ssh_managed_nodes,
|
||||
# drop those orphan entries until the next top-level key.
|
||||
awk '
|
||||
BEGIN {in_managed=0}
|
||||
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
|
||||
{
|
||||
if (in_managed) {
|
||||
if ($0 ~ /^ - /) {print; next}
|
||||
if ($0 ~ /^- /) {next}
|
||||
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
|
||||
}
|
||||
print
|
||||
}
|
||||
' "${cfg}" > "${tmp}"
|
||||
|
||||
if ! cmp -s "${cfg}" "${tmp}"; then
|
||||
mv "${tmp}" "${cfg}"
|
||||
changed=1
|
||||
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
|
||||
else
|
||||
rm -f "${tmp}"
|
||||
fi
|
||||
|
||||
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
|
||||
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
|
||||
changed=1
|
||||
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
|
||||
fi
|
||||
|
||||
if [[ "${changed}" -eq 1 ]]; then
|
||||
chmod 0640 "${cfg}" || true
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_apt_packages() {
|
||||
local missing=()
|
||||
for pkg in "$@"; do
|
||||
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
|
||||
missing+=("${pkg}")
|
||||
fi
|
||||
done
|
||||
if [[ ${#missing[@]} -eq 0 ]]; then
|
||||
return 0
|
||||
fi
|
||||
echo "[install] apt install: ${missing[*]}"
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
apt-get update -y
|
||||
apt-get install -y "${missing[@]}"
|
||||
}
|
||||
|
||||
install_kubectl_if_missing() {
|
||||
if command -v kubectl >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
ensure_apt_packages kubernetes-client || true
|
||||
if command -v kubectl >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
echo "[install] installing kubectl via upstream binary"
|
||||
local arch
|
||||
arch="$(uname -m)"
|
||||
case "${arch}" in
|
||||
x86_64) arch="amd64" ;;
|
||||
aarch64|arm64) arch="arm64" ;;
|
||||
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
|
||||
esac
|
||||
local version
|
||||
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
|
||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
|
||||
chmod 0755 /usr/local/bin/kubectl
|
||||
}
|
||||
|
||||
ensure_dependencies() {
|
||||
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
|
||||
echo "[install] skipping dependency installation"
|
||||
return 0
|
||||
fi
|
||||
if ! command -v apt-get >/dev/null 2>&1; then
|
||||
echo "This installer currently supports apt-based hosts only." >&2
|
||||
exit 1
|
||||
fi
|
||||
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
|
||||
install_kubectl_if_missing
|
||||
}
|
||||
|
||||
legacy_path_rewrite() {
|
||||
local src="$1"
|
||||
local dst="$2"
|
||||
sed \
|
||||
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
|
||||
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
|
||||
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
|
||||
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
|
||||
-e 's#/opt/hecate#/opt/ananke#g' \
|
||||
-e 's#/etc/hecate#/etc/ananke#g' \
|
||||
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
|
||||
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
|
||||
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
|
||||
-e 's/hecate.yaml/ananke.yaml/g' \
|
||||
-e 's/hecate.lock/ananke.lock/g' \
|
||||
-e 's/hecate/ananke/g' \
|
||||
-e 's/Hecate/Ananke/g' \
|
||||
-e 's#hecate\.lock#ananke.lock#g' \
|
||||
"${src}" > "${dst}"
|
||||
}
|
||||
|
||||
migrate_legacy_hecate_install() {
|
||||
local legacy_conf_dir="/etc/hecate"
|
||||
local legacy_state_dir="/var/lib/hecate"
|
||||
local legacy_systemd_dir="/etc/systemd/system"
|
||||
|
||||
install -d -m 0750 "${CONF_DIR}"
|
||||
install -d -m 0750 "${STATE_DIR}"
|
||||
|
||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
|
||||
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
|
||||
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
|
||||
chmod 0640 "${CONF_DIR}/ananke.yaml"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
|
||||
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
|
||||
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
|
||||
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
|
||||
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
|
||||
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
|
||||
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
|
||||
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
|
||||
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
|
||||
fi
|
||||
|
||||
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
|
||||
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
|
||||
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
|
||||
fi
|
||||
|
||||
if [[ -d "${legacy_systemd_dir}" ]]; then
|
||||
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
|
||||
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
retire_legacy_hecate_install() {
|
||||
local ts backup_dir
|
||||
ts="$(date +%Y%m%d%H%M%S)"
|
||||
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
|
||||
|
||||
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
|
||||
systemctl stop hecate-update.service >/dev/null 2>&1 || true
|
||||
|
||||
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
|
||||
install -d -m 0750 "${backup_dir}"
|
||||
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
|
||||
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
|
||||
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
|
||||
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
|
||||
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
|
||||
echo "[install] backed up legacy hecate assets to ${backup_dir}"
|
||||
fi
|
||||
|
||||
rm -f \
|
||||
/etc/systemd/system/hecate.service \
|
||||
/etc/systemd/system/hecate-bootstrap.service \
|
||||
/etc/systemd/system/hecate-update.service \
|
||||
/etc/systemd/system/hecate-update.timer
|
||||
rm -f /usr/local/bin/hecate
|
||||
rm -rf /usr/local/lib/hecate
|
||||
rm -rf /opt/hecate
|
||||
rm -rf /etc/hecate
|
||||
rm -rf /var/lib/hecate
|
||||
}
|
||||
|
||||
resolve_build_target() {
|
||||
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
|
||||
echo "./cmd/ananke"
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
install_config_template() {
|
||||
local template="$1"
|
||||
local dest="$2"
|
||||
local src legacy
|
||||
local -a modern_candidates=()
|
||||
local -a legacy_candidates=()
|
||||
|
||||
case "${template}" in
|
||||
coordinator)
|
||||
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
|
||||
legacy_candidates=("configs/hecate.titan-db.yaml")
|
||||
;;
|
||||
peer)
|
||||
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
|
||||
legacy_candidates=("configs/hecate.tethys.yaml")
|
||||
;;
|
||||
example)
|
||||
modern_candidates=("configs/ananke.example.yaml")
|
||||
legacy_candidates=("configs/hecate.example.yaml")
|
||||
;;
|
||||
*)
|
||||
echo "[install] unknown config template key: ${template}" >&2
|
||||
return 1
|
||||
;;
|
||||
esac
|
||||
|
||||
for src in "${modern_candidates[@]}"; do
|
||||
if [[ -f "${src}" ]]; then
|
||||
install -m 0640 "${src}" "${dest}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
for legacy in "${legacy_candidates[@]}"; do
|
||||
if [[ -f "${legacy}" ]]; then
|
||||
src="$(mktemp)"
|
||||
legacy_path_rewrite "${legacy}" "${src}"
|
||||
install -m 0640 "${src}" "${dest}"
|
||||
rm -f "${src}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
install_systemd_units() {
|
||||
local source_map
|
||||
local tmp
|
||||
|
||||
while IFS='|' read -r target_name modern_name legacy_name; do
|
||||
local modern_src="deploy/systemd/${modern_name}"
|
||||
local legacy_src="deploy/systemd/${legacy_name}"
|
||||
local target="${SYSTEMD_DIR}/${target_name}"
|
||||
|
||||
if [[ -f "${modern_src}" ]]; then
|
||||
install -m 0644 "${modern_src}" "${target}"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ -f "${legacy_src}" ]]; then
|
||||
tmp="$(mktemp)"
|
||||
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
||||
install -m 0644 "${tmp}" "${target}"
|
||||
rm -f "${tmp}"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
|
||||
return 1
|
||||
done <<'EOF_UNITS'
|
||||
ananke.service|ananke.service|hecate.service
|
||||
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
|
||||
ananke-update.service|ananke-update.service|hecate-update.service
|
||||
ananke-update.timer|ananke-update.timer|hecate-update.timer
|
||||
EOF_UNITS
|
||||
}
|
||||
|
||||
install_self_update_script() {
|
||||
local modern_src="scripts/ananke-self-update.sh"
|
||||
local legacy_src="scripts/hecate-self-update.sh"
|
||||
local target="${LIB_DIR}/ananke-self-update.sh"
|
||||
local tmp
|
||||
|
||||
if [[ -f "${modern_src}" ]]; then
|
||||
install -m 0755 "${modern_src}" "${target}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -f "${legacy_src}" ]]; then
|
||||
tmp="$(mktemp)"
|
||||
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
||||
sed -Ei \
|
||||
-e 's/HECATE_/ANANKE_/g' \
|
||||
-e 's/hecate-self-update/ananke-self-update/g' \
|
||||
-e 's#/opt/hecate#/opt/ananke#g' \
|
||||
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
|
||||
"${tmp}"
|
||||
install -m 0755 "${tmp}" "${target}"
|
||||
rm -f "${tmp}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
echo "[install] missing both modern and legacy self-update scripts." >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
configure_nut() {
|
||||
if [[ "${MANAGE_NUT}" != "1" ]]; then
|
||||
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
|
||||
return 0
|
||||
fi
|
||||
|
||||
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
|
||||
install -d -m 0755 /etc/nut /etc/udev/rules.d
|
||||
|
||||
cat > /etc/nut/nut.conf <<EOF
|
||||
MODE=standalone
|
||||
EOF
|
||||
|
||||
cat > /etc/nut/ups.conf <<EOF
|
||||
[${NUT_UPS_NAME}]
|
||||
driver = usbhid-ups
|
||||
port = auto
|
||||
vendorid = ${NUT_VENDOR_ID}
|
||||
productid = ${NUT_PRODUCT_ID}
|
||||
pollinterval = 5
|
||||
EOF
|
||||
|
||||
cat > /etc/nut/upsd.users <<EOF
|
||||
[${NUT_MONITOR_USER}]
|
||||
password = ${NUT_MONITOR_PASSWORD}
|
||||
upsmon primary
|
||||
EOF
|
||||
chmod 0640 /etc/nut/upsd.users
|
||||
if getent group nut >/dev/null 2>&1; then
|
||||
chown root:nut /etc/nut/upsd.users
|
||||
else
|
||||
chown root:root /etc/nut/upsd.users
|
||||
fi
|
||||
|
||||
cat > /etc/nut/upsmon.conf <<EOF
|
||||
RUN_AS_USER nut
|
||||
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
|
||||
MINSUPPLIES 1
|
||||
SHUTDOWNCMD "/sbin/shutdown -h +0"
|
||||
POLLFREQ 5
|
||||
POLLFREQALERT 5
|
||||
HOSTSYNC 15
|
||||
DEADTIME 15
|
||||
POWERDOWNFLAG /etc/killpower
|
||||
EOF
|
||||
|
||||
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
|
||||
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
|
||||
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
|
||||
EOF
|
||||
|
||||
udevadm control --reload-rules || true
|
||||
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
|
||||
|
||||
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
||||
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
|
||||
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
|
||||
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
ensure_dependencies
|
||||
migrate_legacy_hecate_install
|
||||
|
||||
@ -6,28 +6,9 @@ cd "${REPO_DIR}"
|
||||
export PATH="$(go env GOPATH)/bin:${PATH}"
|
||||
STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
|
||||
|
||||
run_with_retry() {
|
||||
local attempts="$1"
|
||||
shift
|
||||
local try=1
|
||||
local delay=3
|
||||
local rc=0
|
||||
while true; do
|
||||
"$@" && return 0
|
||||
rc=$?
|
||||
if [[ "${try}" -ge "${attempts}" ]]; then
|
||||
return "${rc}"
|
||||
fi
|
||||
echo "[lint] retry ${try}/${attempts} after rc=${rc}: $*" >&2
|
||||
sleep "${delay}"
|
||||
delay=$((delay * 2))
|
||||
try=$((try + 1))
|
||||
done
|
||||
}
|
||||
|
||||
if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
|
||||
echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
|
||||
run_with_retry 4 go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
|
||||
go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
|
||||
fi
|
||||
|
||||
echo "[lint] go vet"
|
||||
|
||||
@ -77,17 +77,6 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
|
||||
return 0.0
|
||||
|
||||
|
||||
def _series_exists(pushgateway_url: str, metric: str, labels: dict[str, str], timeout_seconds: float) -> bool:
|
||||
"""Return whether Pushgateway already has a series for this build."""
|
||||
text = _read_http(f"{pushgateway_url.rstrip('/')}/metrics", timeout_seconds)
|
||||
for line in text.splitlines():
|
||||
if not line.startswith(metric + "{"):
|
||||
continue
|
||||
if all(f'{key}="{value}"' in line for key, value in labels.items()):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _build_payload(
|
||||
suite: str,
|
||||
trigger: str,
|
||||
@ -100,25 +89,9 @@ def _build_payload(
|
||||
tests_skipped: int,
|
||||
test_cases: list[tuple[str, str]],
|
||||
coverage_percent: float,
|
||||
source_files_total: int,
|
||||
source_lines_over_500: int,
|
||||
branch: str,
|
||||
build_number: str,
|
||||
jenkins_job: str,
|
||||
checks: dict[str, str],
|
||||
) -> str:
|
||||
build_labels = {
|
||||
"suite": suite,
|
||||
"branch": branch,
|
||||
"build_number": build_number or "unknown",
|
||||
"jenkins_job": jenkins_job,
|
||||
}
|
||||
test_case_base_labels = {
|
||||
"suite": suite,
|
||||
"branch": branch,
|
||||
"build_number": build_number or "unknown",
|
||||
"jenkins_job": jenkins_job,
|
||||
}
|
||||
lines = [
|
||||
"# TYPE platform_quality_gate_runs_total counter",
|
||||
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
|
||||
@ -132,30 +105,21 @@ def _build_payload(
|
||||
f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
|
||||
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
|
||||
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
|
||||
"# TYPE platform_quality_gate_source_files_total gauge",
|
||||
f'platform_quality_gate_source_files_total{{suite="{suite}"}} {source_files_total}',
|
||||
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
|
||||
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
|
||||
"# TYPE platform_quality_gate_build_info gauge",
|
||||
f"platform_quality_gate_build_info{_label_str(build_labels)} 1",
|
||||
"# TYPE platform_quality_gate_test_case_result gauge",
|
||||
"# TYPE ananke_quality_gate_checks_total gauge",
|
||||
"# TYPE ananke_quality_gate_publish_info gauge",
|
||||
f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
|
||||
]
|
||||
lines.extend(
|
||||
f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
|
||||
for test_name, test_status in test_cases
|
||||
)
|
||||
lines.extend(
|
||||
f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
|
||||
for check_name, check_status in checks.items()
|
||||
)
|
||||
lines.append("# TYPE platform_quality_gate_test_case_result gauge")
|
||||
if test_cases:
|
||||
lines.extend(
|
||||
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
|
||||
for test_name, test_status in test_cases
|
||||
)
|
||||
else:
|
||||
lines.append(
|
||||
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
|
||||
)
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
@ -172,7 +136,8 @@ def _read_coverage_percent(path: str) -> float:
|
||||
return 0.0
|
||||
|
||||
|
||||
def _iter_source_files(repo_root: Path):
|
||||
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
|
||||
count = 0
|
||||
for rel_root in SOURCE_SCAN_ROOTS:
|
||||
base = repo_root / rel_root
|
||||
if not base.exists():
|
||||
@ -182,37 +147,12 @@ def _iter_source_files(repo_root: Path):
|
||||
continue
|
||||
if path.suffix not in SOURCE_EXTENSIONS:
|
||||
continue
|
||||
if path.name.endswith("_test.go") or path.name.endswith(".test.py"):
|
||||
continue
|
||||
yield path
|
||||
|
||||
|
||||
def _count_source_files(repo_root: Path) -> int:
|
||||
return sum(1 for _ in _iter_source_files(repo_root))
|
||||
|
||||
|
||||
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
|
||||
count = 0
|
||||
for path in _iter_source_files(repo_root):
|
||||
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
|
||||
if lines > max_lines:
|
||||
count += 1
|
||||
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
|
||||
if lines > max_lines:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def _unit_tests_failed(output_path: Path, coverage_percent: float) -> bool:
|
||||
if coverage_percent <= 0 or not output_path.exists():
|
||||
return True
|
||||
text = output_path.read_text(encoding="utf-8", errors="ignore")
|
||||
start_marker = "[quality] unit tests + workspace coverage profile"
|
||||
end_marker = "[quality] hygiene: doc contracts"
|
||||
if start_marker in text:
|
||||
text = text.split(start_marker, 1)[1]
|
||||
if end_marker in text:
|
||||
text = text.split(end_marker, 1)[0]
|
||||
return bool(re.search(r"^(--- FAIL:|FAIL\\b)", text, flags=re.M))
|
||||
|
||||
|
||||
def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
|
||||
if not output_path.exists():
|
||||
return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
|
||||
@ -226,37 +166,14 @@ def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
|
||||
|
||||
|
||||
def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
|
||||
"""Parse per-test status records from go test output text."""
|
||||
if not output_path.exists():
|
||||
return []
|
||||
text = output_path.read_text(encoding="utf-8", errors="ignore")
|
||||
cases: list[tuple[str, str]] = []
|
||||
patterns = {
|
||||
"passed": re.compile(r"^--- PASS: ([^\s(]+)", flags=re.M),
|
||||
"failed": re.compile(r"^--- FAIL: ([^\s(]+)", flags=re.M),
|
||||
"skipped": re.compile(r"^--- SKIP: ([^\s(]+)", flags=re.M),
|
||||
}
|
||||
for status, pattern in patterns.items():
|
||||
for test_name in pattern.findall(text):
|
||||
cleaned = str(test_name).strip()
|
||||
if cleaned:
|
||||
cases.append((cleaned, status))
|
||||
if cases:
|
||||
return cases
|
||||
|
||||
# Fallback for non-verbose `go test` output where individual test names are absent.
|
||||
package_cases: list[tuple[str, str]] = []
|
||||
for package_name in re.findall(r"^ok\s+([^\s]+)", text, flags=re.M):
|
||||
cleaned = str(package_name).strip()
|
||||
if cleaned:
|
||||
package_cases.append((f"package::{cleaned}", "passed"))
|
||||
for package_name in re.findall(r"^FAIL\s+([^\s]+)", text, flags=re.M):
|
||||
cleaned = str(package_name).strip()
|
||||
if cleaned:
|
||||
package_cases.append((f"package::{cleaned}", "failed"))
|
||||
if package_cases:
|
||||
deduped = list(dict.fromkeys(package_cases))
|
||||
return deduped
|
||||
for match in re.finditer(r"^---\s+(PASS|FAIL|SKIP):\s+(\S+)", text, flags=re.M):
|
||||
raw_status, test_name = match.groups()
|
||||
status = {"PASS": "passed", "FAIL": "failed", "SKIP": "skipped"}.get(raw_status, "error")
|
||||
cases.append((test_name.strip(), status))
|
||||
return cases
|
||||
|
||||
|
||||
@ -307,23 +224,17 @@ def _sonarqube_check_status(build_dir: Path) -> str:
|
||||
|
||||
|
||||
def _supply_chain_check_status(build_dir: Path) -> str:
|
||||
required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
|
||||
report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
|
||||
if not report:
|
||||
return "failed" if required else "not_applicable"
|
||||
return "not_applicable"
|
||||
compliant = report.get("compliant")
|
||||
if isinstance(compliant, bool):
|
||||
return "ok" if compliant else "failed"
|
||||
status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
|
||||
for value in status_candidates:
|
||||
if isinstance(value, str):
|
||||
normalized = value.strip().lower()
|
||||
if normalized in QUALITY_SUCCESS_STATES:
|
||||
return "ok"
|
||||
if normalized in {"n/a", "na", "not_applicable", "not-applicable", "skipped", "skip"}:
|
||||
return "failed" if required else "not_applicable"
|
||||
return "failed" if required else "not_applicable"
|
||||
return "failed" if required else "not_applicable"
|
||||
return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
|
||||
return "failed"
|
||||
|
||||
|
||||
def parse_args(argv: list[str]) -> argparse.Namespace:
|
||||
@ -367,19 +278,10 @@ def main(argv: list[str] | None = None) -> int:
|
||||
args = parse_args(argv or sys.argv[1:])
|
||||
repo_root = Path(__file__).resolve().parents[1]
|
||||
build_dir = repo_root / "build"
|
||||
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
|
||||
current_ok = 1 if gate_rc == 0 else 0
|
||||
current_failed = 0 if gate_rc == 0 else 1
|
||||
|
||||
branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
|
||||
if branch.startswith("origin/"):
|
||||
branch = branch[len("origin/") :]
|
||||
build_number = os.getenv("BUILD_NUMBER", "")
|
||||
jenkins_job = os.getenv("JOB_NAME", "ananke")
|
||||
remote_ok = 0
|
||||
remote_failed = 0
|
||||
remote_error = ""
|
||||
already_recorded = False
|
||||
try:
|
||||
remote_ok = int(
|
||||
_fetch_existing_counter(
|
||||
@ -397,39 +299,21 @@ def main(argv: list[str] | None = None) -> int:
|
||||
args.timeout_seconds,
|
||||
)
|
||||
)
|
||||
already_recorded = bool(build_number) and _series_exists(
|
||||
args.pushgateway_url,
|
||||
"platform_quality_gate_build_info",
|
||||
{
|
||||
"job": args.job_name,
|
||||
"suite": args.suite,
|
||||
"branch": branch or "unknown",
|
||||
"build_number": build_number or "unknown",
|
||||
"jenkins_job": jenkins_job,
|
||||
},
|
||||
args.timeout_seconds,
|
||||
)
|
||||
except Exception as exc:
|
||||
remote_error = str(exc)
|
||||
|
||||
resolved_ok = remote_ok
|
||||
resolved_failed = remote_failed
|
||||
if remote_error:
|
||||
resolved_ok = args.local_ok
|
||||
resolved_failed = args.local_failed
|
||||
elif not already_recorded:
|
||||
resolved_ok += current_ok
|
||||
resolved_failed += current_failed
|
||||
resolved_ok = max(args.local_ok, remote_ok)
|
||||
resolved_failed = max(args.local_failed, remote_failed)
|
||||
coverage_percent = _read_coverage_percent(args.coverage_percent_file)
|
||||
source_files_total = _count_source_files(repo_root)
|
||||
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
|
||||
quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
|
||||
tests = _parse_go_test_counts(quality_output)
|
||||
test_cases = _parse_go_test_cases(quality_output)
|
||||
test_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
|
||||
tests = _parse_go_test_counts(test_output)
|
||||
test_cases = _parse_go_test_cases(test_output)
|
||||
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
|
||||
docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
|
||||
unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
|
||||
gate_failed = gate_rc != 0
|
||||
checks = {
|
||||
"tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
|
||||
"tests": "failed" if gate_failed or tests["failed"] > 0 else "ok",
|
||||
"coverage": "ok" if coverage_percent >= 95.0 else "failed",
|
||||
"loc": "ok" if source_lines_over_500 == 0 else "failed",
|
||||
"docs_naming": docs_status,
|
||||
@ -448,11 +332,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
tests_skipped=tests["skipped"],
|
||||
test_cases=test_cases,
|
||||
coverage_percent=coverage_percent,
|
||||
source_files_total=source_files_total,
|
||||
source_lines_over_500=source_lines_over_500,
|
||||
branch=branch,
|
||||
build_number=build_number,
|
||||
jenkins_job=jenkins_job,
|
||||
checks=checks,
|
||||
)
|
||||
|
||||
@ -465,8 +345,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
|
||||
summary = (
|
||||
f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
|
||||
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_files_total={source_files_total} "
|
||||
f"source_lines_over_500={source_lines_over_500}"
|
||||
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_lines_over_500={source_lines_over_500}"
|
||||
)
|
||||
if remote_error:
|
||||
summary += f" remote_read_error={remote_error}"
|
||||
|
||||
@ -3,11 +3,8 @@
from __future__ import annotations

import http.server
from pathlib import Path
import socketserver
import tempfile
import threading
from unittest import mock
import unittest

import publish_quality_metrics as publisher
@ -61,19 +58,7 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.server.server_close()
self.thread.join(timeout=5)

def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
tmp_dir = tempfile.TemporaryDirectory()
self.addCleanup(tmp_dir.cleanup)
rc_path = Path(tmp_dir.name) / "quality-gate.rc"
rc_path.write_text(f"{status}\n", encoding="utf-8")
return {
"ANANKE_QUALITY_EXIT_CODE_PATH": str(rc_path),
"ANANKE_QUALITY_COVERAGE_PERCENT_FILE": str(Path(tmp_dir.name) / "coverage.txt"),
"ANANKE_QUALITY_OUTPUT_FILE": str(Path(tmp_dir.name) / "quality-gate.out"),
"ANANKE_QUALITY_DOCS_STATUS_PATH": str(Path(tmp_dir.name) / "docs-naming.status"),
}

def test_publish_adds_current_run_to_remote_counters(self) -> None:
def test_publish_uses_remote_high_water_mark(self) -> None:
_GatewayHandler.metrics_text = "\n".join(
[
'# TYPE platform_quality_gate_runs_total counter',
@ -82,93 +67,51 @@ class PublishQualityMetricsTest(unittest.TestCase):
]
)

with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"5",
"--local-failed",
"2",
]
)
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"5",
"--local-failed",
"2",
]
)

self.assertEqual(exit_code, 0)
self.assertEqual(len(_GatewayHandler.posts), 1)
path, body = _GatewayHandler.posts[0]
self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 8', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 2', body)
self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)

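The assertions above pin down two publisher behaviours: metrics land on the per-suite grouping key `/metrics/job/platform-quality-ci/suite/ananke`, and the run counters are reconciled against what the Pushgateway already reports instead of being blindly incremented. A minimal sketch of that high-water-mark merge, written in Go purely for illustration (the real logic lives in the Python publisher under test, and the helper names here are hypothetical):

```go
package main

import "fmt"

// mergeCounter reconciles a locally tracked run counter with the value the
// Pushgateway already exposes. Taking the maximum keeps a re-published build
// from walking the series backwards or double-counting earlier runs.
// Hypothetical helper: the actual merge is implemented in publish_quality_metrics.py.
func mergeCounter(local, remote float64) float64 {
	if remote > local {
		return remote
	}
	return local
}

// buildPushPath assembles the grouping-key path asserted in the test above.
func buildPushPath(job, suite string) string {
	return fmt.Sprintf("/metrics/job/%s/suite/%s", job, suite)
}

func main() {
	// Assuming, consistent with the expected values in the assertions, that the
	// gateway already reports ok=7 while the local invocation only saw ok=5.
	fmt.Println(mergeCounter(5, 7))                               // 7 — remote high-water mark wins
	fmt.Println(buildPushPath("platform-quality-ci", "ananke"))   // /metrics/job/platform-quality-ci/suite/ananke
}
```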
def test_publish_does_not_double_count_same_build(self) -> None:
_GatewayHandler.metrics_text = "\n".join(
[
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="ok"} 7',
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="failed"} 1',
'platform_quality_gate_build_info{job="platform-quality-ci",suite="ananke",branch="main",build_number="78",jenkins_job="ananke"} 1',
]
)
with mock.patch.dict(
"os.environ",
{
**self._env_for_gate_status(0),
"BRANCH_NAME": "main",
"BUILD_NUMBER": "78",
"JOB_NAME": "ananke",
},
):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"1",
"--local-failed",
"0",
]
)

self.assertEqual(exit_code, 0)
_, body = _GatewayHandler.posts[0]
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)

def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
_GatewayHandler.fail_metrics_read = True

with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--local-ok",
"11",
"--local-failed",
"3",
]
)
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--local-ok",
"11",
"--local-failed",
"3",
]
)

self.assertEqual(exit_code, 0)
self.assertEqual(len(_GatewayHandler.posts), 1)
@ -176,7 +119,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)

@ -158,9 +158,15 @@ mkdir -p "${BUILD_DIR}"
rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"

echo "[quality] dependency download"
echo "[quality] unit tests + workspace coverage profile"
export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
run_with_retry 4 go mod download
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
if [[ -z "${coverage_percent}" ]]; then
coverage_percent="0"
fi
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"

echo "[quality] hygiene: doc contracts"
cd testing
@ -183,14 +189,6 @@ echo "[quality] lint"
echo "[quality] installer template contracts"
./scripts/verify_install_templates.sh

echo "[quality] unit tests + workspace coverage profile"
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
if [[ -z "${coverage_percent}" ]]; then
coverage_percent="0"
fi
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"

echo "[quality] per-file coverage gate (95%)"
cd testing
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v

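Both the old and new placements of the coverage step derive the workspace percentage from the `total:` row of `go tool cover -func` output. A rough Go equivalent of that awk extraction, illustrative only (the pipeline keeps the shell version shown above):

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

// totalCoverage pulls the percentage out of `go tool cover -func=coverage.out`
// output, mirroring the awk '/^total:/ {gsub("%","",$3); print $3}' step above.
func totalCoverage(funcOutput string) string {
	scanner := bufio.NewScanner(strings.NewReader(funcOutput))
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "total:") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) >= 3 {
			return strings.TrimSuffix(fields[2], "%")
		}
	}
	return "0" // same fallback the script uses when no total row is found
}

func main() {
	// Hypothetical sample of `go tool cover -func` output.
	out := "scm.bstein.dev/bstein/ananke/internal/state/state.go:12:\tWriteIntent\t96.0%\ntotal:\t(statements)\t95.4%\n"
	fmt.Println(totalCoverage(out)) // 95.4
}
```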
@ -17,12 +17,6 @@ import (
const maxGoFileLOC = 500

var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
var genericFileNameTokens = map[string]struct{}{
"chunk": {},
"part": {},
"piece": {},
"split": {},
}

func repoRoot(tb testing.TB) string {
tb.Helper()
@ -67,16 +61,13 @@ func collectGoFiles(tb testing.TB, roots ...string) []string {
func TestHygieneContracts(t *testing.T) {
root := repoRoot(t)
files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
namingFiles := append([]string{}, files...)
namingFiles = append(namingFiles, collectGoFiles(t, filepath.Join(root, "testing"))...)
sort.Strings(files)
sort.Strings(namingFiles)

t.Run("doc_contract", func(t *testing.T) {
checkDocContracts(t, files)
})
t.Run("naming_contract", func(t *testing.T) {
checkNamingContracts(t, namingFiles)
checkNamingContracts(t, files)
})
t.Run("loc_limit", func(t *testing.T) {
checkFileLOCLimits(t, files)
@ -130,19 +121,9 @@ func checkNamingContracts(t *testing.T, files []string) {
if !goFileNamePattern.MatchString(base) {
t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
}
for _, token := range filenameTokens(base) {
if _, ok := genericFileNameTokens[token]; ok {
t.Errorf("%s: filename %q uses generic split-file token %q", file, base, token)
}
}
}
}

func filenameTokens(name string) []string {
trimmed := strings.TrimSuffix(strings.TrimSuffix(name, ".go"), "_test")
return strings.Split(trimmed, "_")
}

// checkFileLOCLimits runs one orchestration or CLI step.
// Signature: checkFileLOCLimits(t *testing.T, files []string).
// Why: A strict LOC cap forces focused files and keeps refactors manageable.

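The naming contract removed above had two layers: a snake_case filename regex and a ban on generic split-file tokens such as `part` or `chunk`. A compact, self-contained sketch of the same check, reusing the pattern and token list from the test (the `namingViolations` helper name is illustrative, not part of the repo):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Pattern and token set copied from the hygiene test shown in the diff above.
var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)

var genericFileNameTokens = map[string]struct{}{
	"chunk": {}, "part": {}, "piece": {}, "split": {},
}

// namingViolations reports why a filename would fail the naming contract.
func namingViolations(base string) []string {
	var problems []string
	if !goFileNamePattern.MatchString(base) {
		problems = append(problems, "not a snake_case .go filename")
	}
	trimmed := strings.TrimSuffix(strings.TrimSuffix(base, ".go"), "_test")
	for _, token := range strings.Split(trimmed, "_") {
		if _, ok := genericFileNameTokens[token]; ok {
			problems = append(problems, "generic split-file token "+token)
		}
	}
	return problems
}

func main() {
	fmt.Println(namingViolations("orchestrator_vault_test.go")) // []
	fmt.Println(namingViolations("orchestrator_part_test.go"))  // [generic split-file token part]
	fmt.Println(namingViolations("Orchestrator-New.go"))        // [not a snake_case .go filename]
}
```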
@ -13,8 +13,6 @@ cmd/ananke/power_safety_test.go
cmd/ananke/test_helpers_test.go
internal/cluster/orchestrator_inventory_test.go
internal/cluster/orchestrator_report_test.go
internal/cluster/orchestrator_autorepair_test.go
internal/cluster/orchestrator_autorepair_cleanup_test.go
internal/cluster/orchestrator_test.go
internal/cluster/orchestrator_unit_additional_test.go
internal/cluster/orchestrator_vault_test.go
@ -23,7 +21,6 @@ internal/config/load_additional_test.go
internal/config/validate_matrix_test.go
internal/service/daemon_additional_test.go
internal/service/daemon_coverage_closeout_test.go
internal/service/daemon_poststart_autorepair_test.go
internal/service/daemon_quality_branches_test.go
internal/service/daemon_test.go
internal/sshutil/repair_test.go

@ -363,3 +363,4 @@ func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
}
})
}

@ -79,29 +79,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})

t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}

run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
return `{"items":[
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
})

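The subtest above stubs kubectl so the required `flux-system/monitoring` kustomization is Ready while an optional `jellyfin` entry is not, and expects the health gate to pass anyway. A small sketch of the scoping rule this implies (the real check lives in the orchestrator's flux-health helper; the type and function here are hypothetical):

```go
package main

import "fmt"

// kustomization is a trimmed view of the fields the stubbed kubectl JSON exposes.
type kustomization struct {
	Namespace string
	Name      string
	Ready     bool
}

// requiredReady reports readiness only for the configured required set; failures
// outside that set (optional workloads such as jellyfin) are ignored.
func requiredReady(items []kustomization, required []string) bool {
	want := map[string]struct{}{}
	for _, r := range required {
		want[r] = struct{}{}
	}
	for _, k := range items {
		key := k.Namespace + "/" + k.Name
		if _, ok := want[key]; ok && !k.Ready {
			return false
		}
	}
	return true
}

func main() {
	items := []kustomization{
		{Namespace: "flux-system", Name: "monitoring", Ready: true},
		{Namespace: "flux-system", Name: "jellyfin", Ready: false},
	}
	fmt.Println(requiredReady(items, []string{"flux-system/monitoring"})) // true
}
```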
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
||||
@ -168,42 +145,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
||||
cfg.Startup.StuckPodGraceSeconds = 1
|
||||
|
||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
|
||||
return `{"items":[
|
||||
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
|
||||
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
|
||||
]}`, nil
|
||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
||||
return `{"items":[
|
||||
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
|
||||
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
|
||||
]}`, nil
|
||||
default:
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||
ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
|
||||
if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
|
||||
t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
|
||||
}
|
||||
failures, err := orch.TestHookStartupFailurePods(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("startup failure pod query: %v", err)
|
||||
}
|
||||
if len(failures) != 0 {
|
||||
t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
|
||||
}
|
||||
})
|
||||
|
||||
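Similarly, this subtest expects controller convergence and the startup-failure pod scan to consider only the required `monitoring` namespace, so the crash-looping pod in the optional `jellyfin` namespace neither blocks readiness nor shows up as a failure. A minimal sketch of that namespace scoping, assuming a hypothetical helper (the production logic sits in the convergence and stuck-pod paths exercised here):

```go
package main

import "fmt"

// failingPod is a trimmed view of what the startup-failure scan reports.
type failingPod struct {
	Namespace string
	Name      string
	Reason    string
}

// filterToRequiredNamespaces drops failures outside the required namespace set,
// matching the expectation that jellyfin/pegasus-0 is ignored while monitoring
// pods would still count.
func filterToRequiredNamespaces(pods []failingPod, required []string) []failingPod {
	want := map[string]struct{}{}
	for _, ns := range required {
		want[ns] = struct{}{}
	}
	var kept []failingPod
	for _, p := range pods {
		if _, ok := want[p.Namespace]; ok {
			kept = append(kept, p)
		}
	}
	return kept
}

func main() {
	pods := []failingPod{{Namespace: "jellyfin", Name: "pegasus-0", Reason: "CrashLoopBackOff"}}
	fmt.Println(len(filterToRequiredNamespaces(pods, []string{"monitoring"}))) // 0
}
```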
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
|
||||
@ -19,11 +19,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// newHookOrchestratorWithRunnerMode runs one orchestration or CLI step.
|
||||
// Signature: newHookOrchestratorWithRunnerMode(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
|
||||
// Why: these scenarios need dry-run and non-dry-run variants while keeping
|
||||
// newHookOrchestratorAdvanced runs one orchestration or CLI step.
|
||||
// Signature: newHookOrchestratorAdvanced(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
|
||||
// Why: this part10 matrix needs dry-run and non-dry-run variants while keeping
|
||||
// command dispatch deterministic from the top-level testing module.
|
||||
func newHookOrchestratorWithRunnerMode(
|
||||
func newHookOrchestratorAdvanced(
|
||||
t *testing.T,
|
||||
cfg config.Config,
|
||||
dryRun bool,
|
||||
@ -49,11 +49,11 @@ func newHookOrchestratorWithRunnerMode(
|
||||
return orch, recorder
|
||||
}
|
||||
|
||||
// TestHookVaultLifecycleBranchMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookVaultLifecycleBranchMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart10LowFileClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart10LowFileClosure(t *testing.T).
|
||||
// Why: closes remaining branch gaps on low-coverage orchestrator files using
|
||||
// targeted hook-level scenarios instead of brittle full-drill reruns.
|
||||
func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart10LowFileClosure(t *testing.T) {
|
||||
t.Run("critical-vault-low-branches", func(t *testing.T) {
|
||||
t.Run("vault-sealed-parse-error", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
@ -64,7 +64,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
|
||||
t.Fatalf("expected vault status parse error branch, got %v", err)
|
||||
}
|
||||
@ -81,7 +81,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
|
||||
t.Fatalf("expected empty decoded unseal key branch, got %v", err)
|
||||
}
|
||||
@ -90,7 +90,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.VaultUnsealKeyFile = t.TempDir()
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
|
||||
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
|
||||
t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
|
||||
}
|
||||
@ -105,7 +105,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchNoValue, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runNoValue, runNoValue)
|
||||
orchNoValue, _ := newHookOrchestratorAdvanced(t, cfg, false, runNoValue, runNoValue)
|
||||
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
|
||||
if err != nil || ready {
|
||||
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
|
||||
@ -124,7 +124,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchEnsureErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runEnsureErr, runEnsureErr)
|
||||
orchEnsureErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runEnsureErr, runEnsureErr)
|
||||
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
|
||||
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
|
||||
}
|
||||
@ -139,7 +139,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchPhase, _ := newHookOrchestratorWithRunnerMode(t, cfgPhase, false, runPhase, runPhase)
|
||||
orchPhase, _ := newHookOrchestratorAdvanced(t, cfgPhase, false, runPhase, runPhase)
|
||||
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
|
||||
t.Fatalf("expected pod phase guard branch, got %v", err)
|
||||
}
|
||||
@ -170,7 +170,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return runFollowup(ctx, timeout, name, args...)
|
||||
}
|
||||
orchFollowup, _ := newHookOrchestratorWithRunnerMode(t, cfgFollowup, false, runFollowup, runSensitive)
|
||||
orchFollowup, _ := newHookOrchestratorAdvanced(t, cfgFollowup, false, runFollowup, runSensitive)
|
||||
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
|
||||
t.Fatalf("expected follow-up sealed status error branch, got %v", err)
|
||||
}
|
||||
@ -204,7 +204,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
err := orch.TestHookDrainWorkers(context.Background(), workers)
|
||||
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
|
||||
t.Fatalf("expected drain aggregation branch, got %v", err)
|
||||
@ -217,7 +217,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
cfg.SSHManagedNodes = []string{"titan-db"}
|
||||
rec := &commandRecorder{}
|
||||
base := lifecycleDispatcher(rec)
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
|
||||
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
|
||||
if !rec.contains("atlas@titan-db echo ok") {
|
||||
t.Fatalf("expected managed ssh execution branch")
|
||||
@ -233,7 +233,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
|
||||
t.Fatalf("expected empty snapshot-list branch, got %v", err)
|
||||
}
|
||||
@ -250,7 +250,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchWorkers, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runWorkers, runWorkers)
|
||||
orchWorkers, _ := newHookOrchestratorAdvanced(t, cfg, false, runWorkers, runWorkers)
|
||||
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
|
||||
if err != nil || len(workers) == 0 {
|
||||
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
|
||||
@ -273,7 +273,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchWrite, _ := newHookOrchestratorWithRunnerMode(t, cfgWrite, false, runWrite, runWrite)
|
||||
orchWrite, _ := newHookOrchestratorAdvanced(t, cfgWrite, false, runWrite, runWrite)
|
||||
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
|
||||
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
|
||||
}
|
||||
@ -294,7 +294,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchReady, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runReady, runReady)
|
||||
orchReady, _ := newHookOrchestratorAdvanced(t, cfg, false, runReady, runReady)
|
||||
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
|
||||
if err != nil || ready || !strings.Contains(detail, "ready=false") {
|
||||
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
|
||||
@ -319,7 +319,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
|
||||
@ -336,7 +336,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
rec := &commandRecorder{}
|
||||
base := lifecycleDispatcher(rec)
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
|
||||
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
||||
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
|
||||
}
|
||||
@ -347,7 +347,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
|
||||
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
|
||||
cfgDry := lifecycleConfig(t)
|
||||
orchDry, _ := newHookOrchestratorWithRunnerMode(t, cfgDry, true, nil, nil)
|
||||
orchDry, _ := newHookOrchestratorAdvanced(t, cfgDry, true, nil, nil)
|
||||
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
|
||||
t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
|
||||
}
|
||||
@ -365,7 +365,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchFail, _ := newHookOrchestratorWithRunnerMode(t, cfgFail, false, run, run)
|
||||
orchFail, _ := newHookOrchestratorAdvanced(t, cfgFail, false, run, run)
|
||||
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
|
||||
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
|
||||
}
|
||||
@ -373,7 +373,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
|
||||
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
orchEmpty, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
|
||||
orchEmpty, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
|
||||
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
|
||||
if err != nil || len(namespaces) != 0 {
|
||||
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
|
||||
@ -386,7 +386,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runErr, runErr)
|
||||
orchErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runErr, runErr)
|
||||
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
|
||||
t.Fatalf("expected ingress query error branch, got %v", err)
|
||||
}
|
||||
@ -412,7 +412,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
URL: "http://" + listener.Addr().String() + "/health",
|
||||
AcceptedStatuses: []int{200},
|
||||
}}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
|
||||
ready, detail := orch.TestHookServiceChecklistReady(context.Background())
|
||||
if ready || !strings.Contains(detail, "http://") {
|
||||
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
|
||||
@ -435,7 +435,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
|
||||
@ -456,7 +456,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
|
||||
@ -478,7 +478,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
|
||||
t.Fatalf("expected resume flux warning-only branch, got %v", err)
|
||||
}
|
||||
@ -505,7 +505,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
|
||||
@ -532,14 +532,14 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
||||
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
|
||||
t.Fatalf("expected workload convergence default-branch success, got %v", err)
|
||||
}
|
||||
|
||||
cfgIgnore := lifecycleConfig(t)
|
||||
cfgIgnore.Startup.AutoRecycleStuckPods = false
|
||||
orchIgnoreDry, _ := newHookOrchestratorWithRunnerMode(t, cfgIgnore, true, run, run)
|
||||
orchIgnoreDry, _ := newHookOrchestratorAdvanced(t, cfgIgnore, true, run, run)
|
||||
now := time.Now().UTC().Add(-time.Hour)
|
||||
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
|
||||
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
|
||||
@ -551,7 +551,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchHealErr, _ := newHookOrchestratorWithRunnerMode(t, lifecycleConfig(t), false, runHealErr, runHealErr)
|
||||
orchHealErr, _ := newHookOrchestratorAdvanced(t, lifecycleConfig(t), false, runHealErr, runHealErr)
|
||||
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
|
||||
t.Fatalf("expected critical workload heal query-error branch, got %v", err)
|
||||
}
|
||||
@ -20,7 +20,7 @@ import (
|
||||
|
||||
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
|
||||
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
|
||||
// Why: lifecycle cleanup scenarios need direct control over runner dry-run and kubeconfig branches.
|
||||
// Why: part11 needs direct control over runner dry-run and kubeconfig branches.
|
||||
func newLifecycleMatrixOrchestrator(
|
||||
t *testing.T,
|
||||
cfg config.Config,
|
||||
@ -49,11 +49,11 @@ func newLifecycleMatrixOrchestrator(
|
||||
return orch
|
||||
}
|
||||
|
||||
// TestHookLifecycleCleanupRemainingClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookLifecycleCleanupRemainingClosure(t *testing.T).
|
||||
// TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
|
||||
// Why: closes final branch gaps for lifecycle + remaining near-threshold
|
||||
// orchestrator files so per-file coverage reaches the enforced 95% target.
|
||||
func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
|
||||
func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
|
||||
t.Run("critical-vault-final-closures", func(t *testing.T) {
|
||||
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||
apiVersionCalls++
|
||||
if apiVersionCalls <= 2 {
|
||||
if apiVersionCalls == 1 {
|
||||
return "", errors.New("api down")
|
||||
}
|
||||
return "v1.31.0", nil
|
||||
@ -17,11 +17,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookTimesyncAndStabilityMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookTimesyncAndStabilityMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
|
||||
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
|
||||
// branches from the top-level testing module.
|
||||
func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
|
||||
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
|
||||
cases := []struct {
|
||||
line string
|
||||
@ -162,11 +162,11 @@ func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookFluxScalingReportMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookFluxScalingReportMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
|
||||
// Why: targets low branch density in flux-health, scaling snapshot handling,
|
||||
// and report sanitization helpers.
|
||||
func TestHookFluxScalingReportMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
|
||||
t.Run("flux-helper-matrix", func(t *testing.T) {
|
||||
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
|
||||
t.Fatalf("expected immutable matcher true for uppercase+job variant")
|
||||
@ -241,11 +241,11 @@ func TestHookFluxScalingReportMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookVaultAndCoordinationMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookVaultAndCoordinationMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
|
||||
// Why: raises branch coverage on vault/key and coordination helpers without
|
||||
// requiring package-local tests.
|
||||
func TestHookVaultAndCoordinationMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
|
||||
t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.VaultUnsealKeyFile = ""
|
||||
@ -296,11 +296,11 @@ func TestHookVaultAndCoordinationMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookWorkloadIgnoreMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookWorkloadIgnoreMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
|
||||
// Why: expands low branch coverage in workload ignore helpers and startup-failure
|
||||
// pod classification.
|
||||
func TestHookWorkloadIgnoreMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
|
||||
t.Run("ignored-node-helper-matrix", func(t *testing.T) {
|
||||
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
|
||||
t.Fatalf("expected selector-host ignored match")
|
||||
@ -11,11 +11,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||
)
|
||||
|
||||
// TestHookConvergenceAndStabilityMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookConvergenceAndStabilityMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart3ConvergenceAndStability runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
|
||||
// Why: raises coverage for startup convergence orchestration and stability gates
|
||||
// that determine whether startup is considered truly complete.
|
||||
func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
|
||||
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
|
||||
cfgIngress := lifecycleConfig(t)
|
||||
cfgIngress.Startup.RequireIngressChecklist = true
|
||||
@ -108,11 +108,11 @@ func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookLifecycleRestoreShutdownMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookLifecycleRestoreShutdownMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart3LifecycleRestoreShutdown runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
|
||||
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in
|
||||
// failure-focused drill tests.
|
||||
func TestHookLifecycleRestoreShutdownMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
|
||||
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
|
||||
cfgDry := lifecycleConfig(t)
|
||||
dry := newDryRunHookOrchestrator(t, cfgDry, nil)
|
||||
@ -19,11 +19,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookCoordinationAndReachabilityMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookCoordinationAndReachabilityMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
|
||||
// Why: closes remaining coordination/reachability low branches with deterministic
|
||||
// command responses and short timeouts.
|
||||
func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
|
||||
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Coordination.PeerHosts = []string{"titan-24"}
|
||||
@ -136,11 +136,11 @@ func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookIngressServiceAndPostStartMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookIngressServiceAndPostStartMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
|
||||
// Why: drives ingress/service checklist and post-start branches that were still
|
||||
// under-covered after drill-focused matrix tests.
|
||||
func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
|
||||
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
||||
@ -194,11 +194,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||||
ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
|
||||
Name: "forbidden-marker",
|
||||
URL: srv.URL,
|
||||
Name: "forbidden-marker",
|
||||
URL: srv.URL,
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyNotContains: "marker",
|
||||
TimeoutSeconds: 2,
|
||||
BodyNotContains: "marker",
|
||||
TimeoutSeconds: 2,
|
||||
})
|
||||
if ok || !strings.Contains(detail, "forbidden marker") {
|
||||
t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
|
||||
@ -233,11 +233,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookReportScalingStorageDrainMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookReportScalingStorageDrainMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
|
||||
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
|
||||
// are difficult to hit from happy-path lifecycle drills.
|
||||
func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
|
||||
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
|
||||
@ -339,11 +339,11 @@ func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookTimesyncLifecycleAndAccessMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
|
||||
// Why: closes remaining timing/access/lifecycle branches that still sat below
|
||||
// target after the earlier matrices.
|
||||
func TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
|
||||
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.TimeSyncMode = "quorum"
|
||||
@ -20,11 +20,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookEndpointHealingCoverageClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookEndpointHealingCoverageClosure(t *testing.T).
|
||||
// TestHookGapMatrixPart5CoverageClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart5CoverageClosure(t *testing.T).
|
||||
// Why: closes branch gaps that still remained after drill-style tests by driving
|
||||
// low-coverage orchestrator internals through the exported top-level hook surface.
|
||||
func TestHookEndpointHealingCoverageClosure(t *testing.T) {
|
||||
func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
|
||||
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
|
||||
t.Run("empty-namespace-service-noop", func(t *testing.T) {
|
||||
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
|
||||
@ -491,10 +491,10 @@ func httpStatusHandler(code int, body string) func(http.ResponseWriter, *http.Re
|
||||
}
|
||||
}
|
||||
|
||||
// TestHookIngressHostMappingRegression runs one orchestration or CLI step.
|
||||
// Signature: TestHookIngressHostMappingRegression(t *testing.T).
|
||||
// TestHookGapMatrixPart5IngressHostMappingRegression runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T).
|
||||
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
|
||||
func TestHookIngressHostMappingRegression(t *testing.T) {
|
||||
func TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
||||
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},
|
||||
@ -16,11 +16,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookVaultPostStartBranchMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookVaultPostStartBranchMatrix(t *testing.T).
|
||||
// Why: targets the remaining low branch paths after endpoint-healing coverage so per-file coverage
|
||||
// TestHookGapMatrixPart6CoverageClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart6CoverageClosure(t *testing.T).
|
||||
// Why: targets the remaining low branch paths after part5 so per-file coverage
|
||||
// can move toward the strict 95% quality gate.
|
||||
func TestHookVaultPostStartBranchMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart6CoverageClosure(t *testing.T) {
|
||||
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
|
||||
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
@ -14,11 +14,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookWorkloadStorageAccessMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookWorkloadStorageAccessMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart7CoverageClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
|
||||
// Why: closes additional low-coverage branches in convergence, storage, access,
|
||||
// flux, lifecycle, and sensitive command wrappers.
|
||||
func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
|
||||
t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
||||
@ -165,32 +165,6 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
|
||||
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.RequireNodeSSHAuth = true
|
||||
cfg.Startup.NodeSSHAuthWaitSeconds = 1
|
||||
cfg.Startup.NodeSSHAuthPollSeconds = 1
|
||||
cfg.Startup.NodeInventoryReachWaitSeconds = 1
|
||||
cfg.Startup.NodeInventoryReachPollSeconds = 1
|
||||
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
|
||||
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
|
||||
|
||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
|
||||
return "", errors.New("no route to host")
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
|
||||
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
|
||||
}
|
||||
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
|
||||
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
|
||||
@ -19,11 +19,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookAccessVaultLifecycleMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookAccessVaultLifecycleMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
|
||||
// Why: closes additional low-coverage branches in access, vault, lifecycle,
|
||||
// ingress/service stability, and timesync/inventory orchestration paths.
|
||||
func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
|
||||
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.RequireNodeSSHAuth = true
|
||||
@ -331,11 +331,11 @@ func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestHookLifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
|
||||
// Signature: TestHookLifecycleStartupAutoRestoreBranch(t *testing.T).
|
||||
// TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
|
||||
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise
|
||||
// hard to exercise in deterministic top-level tests.
|
||||
func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
|
||||
func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
||||
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
||||
@ -384,7 +384,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lifecycle-auto-restore"})
|
||||
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
|
||||
if err != nil {
|
||||
t.Fatalf("expected startup auto-restore path success, got %v", err)
|
||||
}
|
||||
@ -394,7 +394,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
|
||||
|
||||
cfgBadMode := lifecycleConfig(t)
|
||||
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
|
||||
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lifecycle", Mode: "unknown-mode"})
|
||||
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
|
||||
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
|
||||
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
|
||||
}
|
||||
@ -16,11 +16,11 @@ import (
|
||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||
)
|
||||
|
||||
// TestHookAccessCoordinationEndpointsMatrix runs one orchestration or CLI step.
|
||||
// Signature: TestHookAccessCoordinationEndpointsMatrix(t *testing.T).
|
||||
// TestHookGapMatrixPart9AccessCoordinationEndpoints runs one orchestration or CLI step.
|
||||
// Signature: TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T).
|
||||
// Why: closes uncovered statement ranges in access/fluxsource, coordination,
|
||||
// and critical-endpoint orchestration helpers.
|
||||
func TestHookAccessCoordinationEndpointsMatrix(t *testing.T) {
|
||||
func TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T) {
|
||||
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Shutdown.SSHParallelism = 0
|
||||
@ -53,48 +53,6 @@ func TestHookIngressServiceMatrix(t *testing.T) {
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||
},
|
||||
}
|
||||
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
|
||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
|
||||
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
||||
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||
},
|
||||
}
|
||||
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
|
||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
|
||||
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
||||
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
|
||||
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
|
||||
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"net"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@ -124,25 +125,20 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
|
||||
|
||||
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
|
||||
cfg := lifecycleFastConfig(t)
|
||||
cfg.Startup.RequireNodeInventoryReach = false
|
||||
cfg.Startup.ShutdownCooldownSeconds = 5
|
||||
reads := 0
|
||||
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
|
||||
if path != cfg.State.IntentPath {
|
||||
return state.TestHookReadIntentDefault(path)
|
||||
}
|
||||
reads++
|
||||
if reads == 1 {
|
||||
return state.Intent{
|
||||
State: state.IntentShutdownComplete,
|
||||
Reason: "recent",
|
||||
Source: "test",
|
||||
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
|
||||
}, nil
|
||||
}
|
||||
return state.Intent{}, errors.New("forced reread failure")
|
||||
})
|
||||
t.Cleanup(restoreRead)
|
||||
cfg.Startup.ShutdownCooldownSeconds = 1
|
||||
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
||||
State: state.IntentShutdownComplete,
|
||||
Reason: "recent",
|
||||
Source: "test",
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
t.Fatalf("seed cooldown intent: %v", err)
|
||||
}
|
||||
go func(intentPath string) {
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
_ = os.Remove(intentPath)
|
||||
_ = os.Mkdir(intentPath, 0o755)
|
||||
}(cfg.State.IntentPath)
|
||||
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||||
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
|
||||
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
|
||||
@ -152,30 +148,24 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {

t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
cfg.Startup.ShutdownCooldownSeconds = 1
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed cooldown intent: %v", err)
}
go func(intentPath string) {
time.Sleep(150 * time.Millisecond)
_ = state.WriteIntent(intentPath, state.Intent{
State: state.IntentShuttingDown,
Reason: "peer-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC(),
}, nil
})
t.Cleanup(restoreRead)
})
}(cfg.State.IntentPath)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {

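// Illustrative sketch (not part of the diff above): the startup cooldown flow the two
// cooldown subtests exercise, written against the state/time/fmt packages. The helper
// name and its exact wiring are assumptions; only the two asserted error strings are
// taken from the source.
func waitOutShutdownCooldown(intentPath string, cooldown time.Duration) error {
    in, err := state.ReadIntent(intentPath)
    if err != nil {
        return err
    }
    if in.State != state.IntentShutdownComplete {
        return nil
    }
    if remaining := cooldown - time.Since(in.UpdatedAt); remaining > 0 {
        time.Sleep(remaining) // wait out the remainder of the configured cooldown
    }
    again, err := state.ReadIntent(intentPath)
    if err != nil {
        return fmt.Errorf("re-read startup intent after cooldown wait: %w", err)
    }
    if again.State == state.IntentShuttingDown {
        return fmt.Errorf("shutdown intent became active during cooldown wait (reason=%q)", again.Reason)
    }
    return nil
}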
@ -1,432 +0,0 @@
package orchestrator

import (
"context"
"errors"
"strings"
"testing"
"time"

"scm.bstein.dev/bstein/ananke/internal/cluster"
)

// TestHookSchedulingStormHelpers runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormHelpers(t *testing.T).
// Why: keeps scheduling-storm helper coverage in the split top-level testing module
// required by the repo hygiene contract.
func TestHookSchedulingStormHelpers(t *testing.T) {
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "ollama-rs", "Deployment", "ollama"); !ok || got != "ai/deployment/ollama" {
t.Fatalf("unexpected deployment owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("storage", "StatefulSet", "nextcloud", "", ""); !ok || got != "storage/statefulset/nextcloud" {
t.Fatalf("unexpected statefulset owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "missing", "", ""); ok || got != "" {
t.Fatalf("expected missing replicaset owner lookup to fail, got=%q ok=%v", got, ok)
}

if got := cluster.TestHookEventObservationCount(3, 9); got != 9 {
t.Fatalf("expected series count to win, got %d", got)
}
if got := cluster.TestHookEventObservationCount(0, 0); got != 1 {
t.Fatalf("expected zero-count normalization to 1, got %d", got)
}

now := time.Now().UTC().Round(time.Second)
if got := cluster.TestHookEventLastObservedAt(now, now.Add(-time.Minute), now.Add(-2*time.Minute), now.Add(-3*time.Minute)); !got.Equal(now) {
t.Fatalf("expected series timestamp priority, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)); !got.Equal(now) {
t.Fatalf("expected lastTimestamp fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, now, now.Add(-time.Minute)); !got.Equal(now) {
t.Fatalf("expected eventTime fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, time.Time{}, now); !got.Equal(now) {
t.Fatalf("expected creationTimestamp fallback, got %s", got)
}
}

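// Illustrative sketch (not part of the diff above): the fallback order the helper
// assertions in TestHookSchedulingStormHelpers encode. Function names and signatures
// here are assumptions for readability; the asserted behavior is that a non-zero
// series count wins, zero counts normalize to 1, and the freshest available of
// series/lastTimestamp/eventTime/creationTimestamp is used as the observation time.
func observationCount(eventCount, seriesCount int) int {
    if seriesCount > 0 {
        return seriesCount
    }
    if eventCount > 0 {
        return eventCount
    }
    return 1
}

func lastObservedAt(series, lastTimestamp, eventTime, creationTimestamp time.Time) time.Time {
    for _, ts := range []time.Time{series, lastTimestamp, eventTime, creationTimestamp} {
        if !ts.IsZero() {
            return ts
        }
    }
    return time.Time{}
}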
// TestHookSchedulingStormQuarantine runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormQuarantine(t *testing.T).
// Why: verifies that only non-core workloads generating real scheduling storms
// are auto-quarantined, which prevents event/Kine churn from spiking control-plane CPU.
func TestHookSchedulingStormQuarantine(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 30
cfg.Startup.SchedulingStormWindowSeconds = 180
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault"}
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
scaledOllama := false

run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"vault","name":"vault-0","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"ignored-ns","name":"skip-pod","ownerReferences":[{"kind":"ReplicaSet","name":"skip-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignored-node-rs"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"running-pod","ownerReferences":[{"kind":"ReplicaSet","name":"running-rs"}]},"spec":{},"status":{"phase":"Running"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}},
{"metadata":{"namespace":"ignored-ns","name":"skip-rs","ownerReferences":[{"kind":"Deployment","name":"skip"}]}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-rs","ownerReferences":[{"kind":"Deployment","name":"ignore-me"}]}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-rs","ownerReferences":[{"kind":"Deployment","name":"ignored-node"}]}},
{"metadata":{"namespace":"monitoring","name":"running-rs","ownerReferences":[{"kind":"Deployment","name":"running"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"vault","name":"vault-0"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ignored-ns","name":"skip-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignore-me-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignored-node-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"running-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"stale-pod"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}},
{"kind":"StatefulSet","metadata":{"namespace":"vault","name":"vault"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignored-node"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"running"},"spec":{"replicas":1}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
scaledOllama = true
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}

orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm")
defer orch.TestHookFinalizeStartupReport(nil)

if err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background()); err != nil {
t.Fatalf("quarantine scheduling storm workloads: %v", err)
}
if !scaledOllama {
t.Fatalf("expected ollama deployment to be scaled to zero")
}
progress := readStartupProgress(t, orch)
if !strings.Contains(progress, "ollama") {
t.Fatalf("expected startup progress to mention ollama quarantine, payload=%s", progress)
}
if strings.Contains(progress, "vault") || strings.Contains(progress, "ignore-me") || strings.Contains(progress, "ignored-node") {
t.Fatalf("expected only the non-core eligible workload to be quarantined, payload=%s", progress)
}
}

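// Illustrative sketch (not part of the diff above): how a pod's ownerReferences can be
// collapsed into the "namespace/kind/name" workload key the quarantine test asserts on
// (for example "ai/deployment/ollama"). The function name and parameters are assumptions;
// the mapping (ReplicaSet resolved to its parent Deployment, StatefulSet kept as-is,
// ReplicaSets without a known parent rejected) follows the assertions above.
func ownerWorkloadKey(namespace, ownerKind, ownerName, rsOwnerKind, rsOwnerName string) (string, bool) {
    switch ownerKind {
    case "ReplicaSet":
        if rsOwnerKind == "" || rsOwnerName == "" {
            return "", false
        }
        return namespace + "/" + strings.ToLower(rsOwnerKind) + "/" + rsOwnerName, true
    case "Deployment", "StatefulSet":
        return namespace + "/" + strings.ToLower(ownerKind) + "/" + ownerName, true
    default:
        return "", false
    }
}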
// TestHookSchedulingStormTriggerGuards runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerGuards(t *testing.T).
// Why: covers dry-run/disabled/rate-limit guards so the scheduler-storm auto-heal
// only activates when the cluster is actually suffering this exact failure mode.
func TestHookSchedulingStormTriggerGuards(t *testing.T) {
cfgDisabled := lifecycleConfig(t)
orchDisabled, _ := newHookOrchestrator(t, cfgDisabled, nil, nil)
lastAttempt := time.Time{}
orchDisabled.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected disabled scheduling-storm trigger to be skipped")
}

cfgDry := lifecycleConfig(t)
cfgDry.Startup.AutoQuarantineSchedulingStorms = true
orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
orchDry.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected dry-run scheduling-storm trigger to be skipped")
}

cfgRate := lifecycleConfig(t)
cfgRate.Startup.AutoQuarantineSchedulingStorms = true
cfgRate.Startup.SchedulingStormEventThreshold = 5
cfgRate.Startup.SchedulingStormWindowSeconds = 60
recorder := &commandRecorder{}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
recorder.record(name, args)
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[]}`, nil
default:
return lifecycleDispatcher(recorder)(ctx, timeout, name, args...)
}
}
orchRate, _ := newHookOrchestrator(t, cfgRate, run, run)
lastAttempt = time.Now()
orchRate.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if recorder.contains("get pods -A -o json") {
t.Fatalf("expected rate-limited scheduling-storm trigger to skip kubectl scans")
}
}

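// Illustrative sketch (not part of the diff above): the three guards this subtest walks
// through before any kubectl scan is attempted. The type and field names are assumptions;
// the asserted behavior is that a disabled feature flag, dry-run mode, or a too-recent
// previous attempt all short-circuit without touching the cluster.
type stormScanner struct {
    autoQuarantineEnabled bool
    dryRun                bool
}

func (s *stormScanner) shouldScan(lastAttempt *time.Time, minInterval time.Duration) bool {
    if !s.autoQuarantineEnabled || s.dryRun {
        return false // feature disabled or dry-run: never mutate the cluster
    }
    if !lastAttempt.IsZero() && time.Since(*lastAttempt) < minInterval {
        return false // rate-limited: skip the kubectl scans entirely
    }
    *lastAttempt = time.Now() // record the attempt only when a real scan proceeds
    return true
}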
// TestHookSchedulingStormTriggerAndNoOpBranches runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T).
// Why: raises scheduling-storm branch coverage on the success/no-op paths so the
// auto-heal only acts on genuine event storms and stays quiet otherwise.
func TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 0
cfg.Startup.SchedulingStormWindowSeconds = 0

scanRan := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
scanRan = true
return `{"items":[
{"metadata":{"namespace":"","name":"missing"}},
{"metadata":{"namespace":"monitoring","name":"no-owner"},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"done","ownerReferences":[{"kind":"ReplicaSet","name":"done-rs"}]},"spec":{},"status":{"phase":"Running"}},
{"metadata":{"namespace":"monitoring","name":"zero-replicas","ownerReferences":[{"kind":"ReplicaSet","name":"zero-rs"}]},"spec":{},"status":{"phase":"Pending"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"","name":"bad-rs"}},
{"metadata":{"namespace":"monitoring","name":"done-rs","ownerReferences":[{"kind":"","name":"ignored"}]}},
{"metadata":{"namespace":"monitoring","name":"zero-rs","ownerReferences":[{"kind":"Deployment","name":"zero"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"normal"},"type":"Normal","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"wrong-reason"},"type":"Warning","reason":"SomeOtherReason","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Service","namespace":"monitoring","name":"wrong-kind"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"old"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"low-count"},"type":"Warning","reason":"FailedScheduling","count":1},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"missing-pod"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"done"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"no-owner"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"zero-replicas"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"","metadata":{"namespace":"monitoring","name":"blank-kind"}},
{"kind":"Job","metadata":{"namespace":"monitoring","name":"unsupported"}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"zero"},"spec":{"replicas":0}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}

orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm-noop")
defer orch.TestHookFinalizeStartupReport(nil)

lastAttempt := time.Time{}
orch.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if lastAttempt.IsZero() {
t.Fatalf("expected successful scheduling-storm trigger to update lastAttempt")
}
if !scanRan {
t.Fatalf("expected scheduling-storm scan to execute")
}
progress := readStartupProgress(t, orch)
if strings.Contains(progress, "quarantined scheduling storm workload") {
t.Fatalf("expected no-op scheduling-storm scan to avoid auto-heal output, payload=%s", progress)
}
}

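// Illustrative sketch (not part of the diff above): the event filter implied by the no-op
// branches above. Names are assumptions; the intent is that only fresh Warning events with
// reason FailedScheduling against Pods count toward the storm threshold, so Normal events,
// other reasons, non-Pod objects, and stale timestamps are all ignored.
func isStormEvent(kind, eventType, reason string, observed time.Time, window time.Duration) bool {
    if kind != "Pod" || eventType != "Warning" || reason != "FailedScheduling" {
        return false
    }
    return time.Since(observed) <= window
}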
// TestHookSchedulingStormErrorMatrix runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormErrorMatrix(t *testing.T).
// Why: covers malformed/error response branches in the scheduling-storm scan so
// Ananke can surface precise diagnostics when the API itself is part of the problem.
func TestHookSchedulingStormErrorMatrix(t *testing.T) {
cases := []struct {
name string
run func(context.Context, time.Duration, string, ...string) (string, error)
wantErr string
}{
{
name: "pods-query-error",
run: func(_ context.Context, _ time.Duration, name string, _ ...string) (string, error) {
if name == "kubectl" {
return "", errors.New("pods boom")
}
return "", nil
},
wantErr: "query pods for scheduling storm scan",
},
{
name: "pods-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get pods -A -o json") {
return "{", nil
}
return `{"items":[]}`, nil
},
wantErr: "decode pods for scheduling storm scan",
},
{
name: "replicasets-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "", errors.New("replicasets boom")
default:
return "", nil
}
},
wantErr: "query replicasets for scheduling storm scan",
},
{
name: "replicasets-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode replicasets for scheduling storm scan",
},
{
name: "events-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "", errors.New("events boom")
default:
return "", nil
}
},
wantErr: "query events for scheduling storm scan",
},
{
name: "events-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode events for scheduling storm scan",
},
{
name: "workloads-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "", errors.New("workloads boom")
default:
return "", nil
}
},
wantErr: "query workloads for scheduling storm scan",
},
{
name: "workloads-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "{", nil
default:
return "", nil
}
},
wantErr: "decode workloads for scheduling storm scan",
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
orch, _ := newHookOrchestrator(t, cfg, tc.run, tc.run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
t.Fatalf("expected error containing %q, got %v", tc.wantErr, err)
}
})
}
}

// TestHookSchedulingStormScaleError runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormScaleError(t *testing.T).
// Why: covers the final error path where Ananke detects a real storm but cannot
// scale the offending workload down.
func TestHookSchedulingStormScaleError(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 5
cfg.Startup.SchedulingStormWindowSeconds = 60

run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45}]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}}]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
return "", errors.New("scale denied")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}

orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), "scale scheduling storm workload ai/deployment/ollama to 0") {
t.Fatalf("expected scale error, got %v", err)
}
}
@ -1,222 +0,0 @@
package orchestrator

import (
"context"
"errors"
"os"
"strings"
"testing"
"time"

"scm.bstein.dev/bstein/ananke/internal/cluster"
)

// readStartupProgress runs one orchestration or CLI step.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
t.Helper()
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
if err != nil {
t.Fatalf("read startup progress: %v", err)
}
return string(payload)
}

// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
t.Run("startup-scope-helpers", func(t *testing.T) {
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
t.Fatalf("expected passthrough node list, got %v", got)
}
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
t.Fatalf("unexpected filtered node list: %v", got)
}

if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
t.Fatalf("expected trimmed node membership match")
}
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
t.Fatalf("expected blank node probe to be ignored")
}

cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
}

cfgScoped := lifecycleConfig(t)
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
t.Fatalf("expected control plane to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected inventory-scoped node to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
t.Fatalf("expected ssh-scoped node to remain strict")
}
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
t.Fatalf("expected non-core worker to stop being strict")
}

flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
if _, ok := flux["flux-system/core"]; !ok {
t.Fatalf("expected core flux kustomization in required set: %v", flux)
}
if _, ok := flux["flux-system/gitea"]; !ok {
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
}

namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
if _, ok := namespaces["vault"]; !ok {
t.Fatalf("expected vault namespace in required set: %v", namespaces)
}
if _, ok := namespaces["monitoring"]; !ok {
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
}
})

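// Illustrative sketch (not part of the diff above): the scope filtering the
// startup-scope-helpers assertions describe. Helper names are assumptions; the behavior is
// "an empty scope keeps every node required, otherwise keep only the scoped nodes", with
// whitespace-tolerant matching and blank probes ignored.
func requiredNodes(all, scoped []string) []string {
    if len(scoped) == 0 {
        return all
    }
    var out []string
    for _, node := range all {
        if containsNode(scoped, node) {
            out = append(out, node)
        }
    }
    return out
}

func containsNode(nodes []string, probe string) bool {
    probe = strings.TrimSpace(probe)
    if probe == "" {
        return false
    }
    for _, node := range nodes {
        if strings.TrimSpace(node) == probe {
            return true
        }
    }
    return false
}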
t.Run("startup-vault-helpers", func(t *testing.T) {
|
||||
t.Run("early-vault-unseal-paths", func(t *testing.T) {
|
||||
cfgAPI := lifecycleConfig(t)
|
||||
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
|
||||
return "", errors.New("api down")
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
|
||||
orchAPI.TestHookBeginStartupReport("startup-vault")
|
||||
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
|
||||
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
|
||||
}
|
||||
|
||||
cfgErr := lifecycleConfig(t)
|
||||
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||
return "v1.31.0", nil
|
||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||
return "", errors.New("phase probe failed")
|
||||
default:
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
|
||||
orchErr.TestHookBeginStartupReport("startup-vault")
|
||||
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
|
||||
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
|
||||
}
|
||||
|
||||
cfgDeferred := lifecycleConfig(t)
|
||||
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||
return "v1.31.0", nil
|
||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||
return "Pending", nil
|
||||
default:
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
|
||||
orchDeferred.TestHookBeginStartupReport("startup-vault")
|
||||
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
|
||||
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
|
||||
}
|
||||
|
||||
cfgSuccess := lifecycleConfig(t)
|
||||
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||
return "v1.31.0", nil
|
||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||
return "Running", nil
|
||||
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
|
||||
return `{"sealed":false,"initialized":true}`, nil
|
||||
default:
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
|
||||
orchSuccess.TestHookBeginStartupReport("startup-vault")
|
||||
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
|
||||
t.Fatalf("expected successful early vault check, payload=%s", payload)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("startup-vault-gate-paths", func(t *testing.T) {
|
||||
cfgErr := lifecycleConfig(t)
|
||||
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
||||
return "", errors.New("phase probe failed")
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
|
||||
orchErr.TestHookBeginStartupReport("startup-vault")
|
||||
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
|
||||
t.Fatalf("expected startup vault gate error, got %v", err)
|
||||
}
|
||||
|
||||
cfgDeferred := lifecycleConfig(t)
|
||||
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
||||
return "Pending", nil
|
||||
}
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
|
||||
orchDeferred.TestHookBeginStartupReport("startup-vault")
|
||||
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
|
||||
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
|
||||
}
|
||||
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
|
||||
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
|
||||
}
|
||||
|
||||
cfgSuccess := lifecycleConfig(t)
|
||||
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||
return "Running", nil
|
||||
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
|
||||
return `{"sealed":false,"initialized":true}`, nil
|
||||
default:
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
|
||||
orchSuccess.TestHookBeginStartupReport("startup-vault")
|
||||
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
|
||||
t.Fatalf("expected successful startup vault gate, got %v", err)
|
||||
}
|
||||
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
|
||||
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
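// Illustrative sketch (not part of the diff above): how the mocked
// "vault status -format=json" output above can be interpreted. The struct and function
// names are assumptions (and encoding/json is not imported by this test file); the tests
// only fix the JSON shape {"sealed":...,"initialized":...} and the resulting pass/defer
// decisions.
type vaultStatus struct {
    Sealed      bool `json:"sealed"`
    Initialized bool `json:"initialized"`
}

func vaultUnsealed(raw string) (bool, error) {
    var status vaultStatus
    if err := json.Unmarshal([]byte(raw), &status); err != nil {
        return false, err
    }
    return status.Initialized && !status.Sealed, nil
}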
@ -24,36 +24,12 @@ func TestStateTestHookOverrideSetters(t *testing.T) {
}
restoreWriteNil()

restoreReadNil := state.TestHookSetReadIntentOverride(nil)
readAfterNil, err := state.ReadIntent(intentPath)
if err != nil || readAfterNil.State != state.IntentNormal {
t.Fatalf("expected default read intent path after nil override, got %v / %v", readAfterNil, err)
}
restoreReadNil()

readOverrideCalled := false
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
readOverrideCalled = true
return state.Intent{}, errors.New("forced read override")
})
_, err = state.ReadIntent(intentPath)
if err == nil || !strings.Contains(err.Error(), "forced read override") {
t.Fatalf("expected forced read override error, got %v", err)
}
if !readOverrideCalled {
t.Fatalf("expected read override to be invoked")
}
restoreRead()
if _, err := state.TestHookReadIntentDefault(intentPath); err != nil {
t.Fatalf("expected explicit default read helper to succeed, got %v", err)
}

writeOverrideCalled := false
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
writeOverrideCalled = true
return errors.New("forced write override")
})
err = state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
if err == nil || !strings.Contains(err.Error(), "forced write override") {
t.Fatalf("expected forced write override error, got %v", err)
}
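// Illustrative sketch (not part of the diff above): the save-and-restore pattern the
// override-setter assertions rely on. The package-level variable and setter names are
// assumptions about the state package's internals; the tests only require that the setter
// swaps the hook and returns a function restoring the previous behavior, with nil meaning
// "fall back to the default implementation".
var readIntentOverride func(path string) (Intent, error)

func setReadIntentOverride(fn func(string) (Intent, error)) (restore func()) {
    previous := readIntentOverride
    readIntentOverride = fn
    return func() { readIntentOverride = previous }
}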