Compare commits
No commits in common. "main" and "codex/ananke-gate-platform-metrics" have entirely different histories.
main
...
codex/anan
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,6 +1,4 @@
|
|||||||
/bin/
|
/bin/
|
||||||
/build/
|
|
||||||
/dist/
|
/dist/
|
||||||
internal/state/.corrupt-*
|
|
||||||
*.log
|
*.log
|
||||||
*.tmp
|
*.tmp
|
||||||
|
|||||||
201
Jenkinsfile
vendored
201
Jenkinsfile
vendored
@ -1,59 +1,25 @@
|
|||||||
pipeline {
|
pipeline {
|
||||||
agent {
|
agent {
|
||||||
kubernetes {
|
kubernetes {
|
||||||
|
label 'ananke-quality'
|
||||||
defaultContainer 'go-tester'
|
defaultContainer 'go-tester'
|
||||||
yaml """
|
yaml """
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Pod
|
kind: Pod
|
||||||
spec:
|
spec:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
hardware: rpi5
|
|
||||||
kubernetes.io/arch: arm64
|
kubernetes.io/arch: arm64
|
||||||
node-role.kubernetes.io/worker: "true"
|
node-role.kubernetes.io/worker: "true"
|
||||||
affinity:
|
|
||||||
nodeAffinity:
|
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
nodeSelectorTerms:
|
|
||||||
- matchExpressions:
|
|
||||||
- key: kubernetes.io/hostname
|
|
||||||
operator: NotIn
|
|
||||||
values:
|
|
||||||
- titan-06
|
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- weight: 100
|
|
||||||
preference:
|
|
||||||
matchExpressions:
|
|
||||||
- key: kubernetes.io/hostname
|
|
||||||
operator: NotIn
|
|
||||||
values:
|
|
||||||
- titan-13
|
|
||||||
- titan-15
|
|
||||||
- titan-17
|
|
||||||
- titan-19
|
|
||||||
topologySpreadConstraints:
|
|
||||||
- maxSkew: 1
|
|
||||||
topologyKey: kubernetes.io/hostname
|
|
||||||
whenUnsatisfiable: ScheduleAnyway
|
|
||||||
labelSelector:
|
|
||||||
matchLabels:
|
|
||||||
jenkins/jenkins-jenkins-agent: "true"
|
|
||||||
containers:
|
containers:
|
||||||
- name: go-tester
|
- name: go-tester
|
||||||
image: registry.bstein.dev/bstein/golang:1.25-bookworm
|
image: golang:1.25-bookworm
|
||||||
command: ["cat"]
|
command: ["cat"]
|
||||||
tty: true
|
tty: true
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: workspace-volume
|
- name: workspace-volume
|
||||||
mountPath: /home/jenkins/agent
|
mountPath: /home/jenkins/agent
|
||||||
- name: publisher
|
- name: publisher
|
||||||
image: registry.bstein.dev/bstein/python:3.12-slim
|
image: python:3.12-slim
|
||||||
command: ["cat"]
|
|
||||||
tty: true
|
|
||||||
volumeMounts:
|
|
||||||
- name: workspace-volume
|
|
||||||
mountPath: /home/jenkins/agent
|
|
||||||
- name: quality-tools
|
|
||||||
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
|
|
||||||
command: ["cat"]
|
command: ["cat"]
|
||||||
tty: true
|
tty: true
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
@ -69,13 +35,7 @@ spec:
|
|||||||
environment {
|
environment {
|
||||||
SUITE_NAME = 'ananke'
|
SUITE_NAME = 'ananke'
|
||||||
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
|
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
|
||||||
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
|
|
||||||
SONARQUBE_PROJECT_KEY = 'ananke'
|
|
||||||
SONARQUBE_TOKEN = credentials('sonarqube-token')
|
|
||||||
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
|
|
||||||
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
|
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
|
||||||
QUALITY_GATE_IRONBANK_ENFORCE = '1'
|
|
||||||
QUALITY_GATE_IRONBANK_REQUIRED = '0'
|
|
||||||
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
|
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,27 +57,6 @@ spec:
|
|||||||
|
|
||||||
stage('Collect SonarQube evidence') {
|
stage('Collect SonarQube evidence') {
|
||||||
steps {
|
steps {
|
||||||
container('quality-tools') {
|
|
||||||
sh '''#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
mkdir -p build
|
|
||||||
args=(
|
|
||||||
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
|
|
||||||
"-Dsonar.login=${SONARQUBE_TOKEN}"
|
|
||||||
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
|
|
||||||
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
|
|
||||||
"-Dsonar.sources=."
|
|
||||||
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
|
|
||||||
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
|
|
||||||
)
|
|
||||||
[ -f build/coverage.out ] && args+=("-Dsonar.go.coverage.reportPaths=build/coverage.out")
|
|
||||||
set +e
|
|
||||||
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
|
|
||||||
rc=${PIPESTATUS[0]}
|
|
||||||
set -e
|
|
||||||
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
|
|
||||||
'''
|
|
||||||
}
|
|
||||||
container('publisher') {
|
container('publisher') {
|
||||||
sh '''
|
sh '''
|
||||||
set -eu
|
set -eu
|
||||||
@ -156,34 +95,6 @@ PY
|
|||||||
|
|
||||||
stage('Collect Supply Chain evidence') {
|
stage('Collect Supply Chain evidence') {
|
||||||
steps {
|
steps {
|
||||||
container('quality-tools') {
|
|
||||||
sh '''#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
mkdir -p build
|
|
||||||
set +e
|
|
||||||
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
|
|
||||||
trivy_rc=$?
|
|
||||||
set -e
|
|
||||||
if [ ! -s build/trivy-fs.json ]; then
|
|
||||||
cat > build/ironbank-compliance.json <<EOF
|
|
||||||
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
|
|
||||||
EOF
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
|
|
||||||
high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
|
|
||||||
secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
|
|
||||||
misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
|
|
||||||
status=ok
|
|
||||||
compliant=true
|
|
||||||
if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
|
|
||||||
status=failed
|
|
||||||
compliant=false
|
|
||||||
fi
|
|
||||||
jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
|
|
||||||
'{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
|
|
||||||
'''
|
|
||||||
}
|
|
||||||
container('publisher') {
|
container('publisher') {
|
||||||
sh '''
|
sh '''
|
||||||
set -eu
|
set -eu
|
||||||
@ -241,25 +152,13 @@ PY
|
|||||||
failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
|
failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
|
||||||
[ -n "${ok_runs}" ] || ok_runs=0
|
[ -n "${ok_runs}" ] || ok_runs=0
|
||||||
[ -n "${failed_runs}" ] || failed_runs=0
|
[ -n "${failed_runs}" ] || failed_runs=0
|
||||||
coverage_percent="$(python3 - <<'PY'
|
|
||||||
import re
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
log_path = Path("build/quality-gate.out")
|
|
||||||
text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
|
|
||||||
values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
|
|
||||||
print(values[-1] if values else 0.0)
|
|
||||||
PY
|
|
||||||
)"
|
|
||||||
printf '%s\n' "${coverage_percent}" > build/coverage-percent.txt
|
|
||||||
python3 scripts/publish_quality_metrics.py \
|
python3 scripts/publish_quality_metrics.py \
|
||||||
--pushgateway-url "${PUSHGATEWAY_URL}" \
|
--pushgateway-url "${PUSHGATEWAY_URL}" \
|
||||||
--job-name platform-quality-ci \
|
--job-name platform-quality-ci \
|
||||||
--suite "${SUITE_NAME}" \
|
--suite "${SUITE_NAME}" \
|
||||||
--trigger jenkins \
|
--trigger jenkins \
|
||||||
--local-ok "${ok_runs}" \
|
--local-ok "${ok_runs}" \
|
||||||
--local-failed "${failed_runs}" \
|
--local-failed "${failed_runs}"
|
||||||
--coverage-percent-file build/coverage-percent.txt
|
|
||||||
'''
|
'''
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -270,95 +169,7 @@ PY
|
|||||||
container('publisher') {
|
container('publisher') {
|
||||||
sh '''
|
sh '''
|
||||||
set -eu
|
set -eu
|
||||||
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
|
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
|
||||||
fail=0
|
|
||||||
if [ "${gate_rc}" -ne 0 ]; then
|
|
||||||
echo "quality gate failed with rc=${gate_rc}" >&2
|
|
||||||
fail=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
enabled() {
|
|
||||||
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
|
|
||||||
1|true|yes|on) return 0 ;;
|
|
||||||
*) return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
|
|
||||||
sonar_status="$(python3 - <<'PY'
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
path = Path("build/sonarqube-quality-gate.json")
|
|
||||||
if not path.exists():
|
|
||||||
print("missing")
|
|
||||||
raise SystemExit(0)
|
|
||||||
try:
|
|
||||||
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
||||||
except Exception: # noqa: BLE001
|
|
||||||
print("error")
|
|
||||||
raise SystemExit(0)
|
|
||||||
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
|
|
||||||
print(status or "missing")
|
|
||||||
PY
|
|
||||||
)"
|
|
||||||
case "${sonar_status}" in
|
|
||||||
ok|pass|passed|success) ;;
|
|
||||||
*)
|
|
||||||
echo "sonarqube gate failed: ${sonar_status}" >&2
|
|
||||||
fail=1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
fi
|
|
||||||
|
|
||||||
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
|
|
||||||
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
|
|
||||||
ironbank_required=1
|
|
||||||
fi
|
|
||||||
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
|
|
||||||
supply_status="$(python3 - <<'PY'
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
path = Path("build/ironbank-compliance.json")
|
|
||||||
if not path.exists():
|
|
||||||
print("missing")
|
|
||||||
raise SystemExit(0)
|
|
||||||
try:
|
|
||||||
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
||||||
except Exception: # noqa: BLE001
|
|
||||||
print("error")
|
|
||||||
raise SystemExit(0)
|
|
||||||
compliant = payload.get("compliant")
|
|
||||||
if compliant is True:
|
|
||||||
print("ok")
|
|
||||||
elif compliant is False:
|
|
||||||
print("failed")
|
|
||||||
else:
|
|
||||||
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
|
|
||||||
print(status or "missing")
|
|
||||||
PY
|
|
||||||
)"
|
|
||||||
case "${supply_status}" in
|
|
||||||
ok|pass|passed|success|compliant) ;;
|
|
||||||
not_applicable|na|n/a)
|
|
||||||
if enabled "${ironbank_required}"; then
|
|
||||||
echo "supply chain gate required but status=${supply_status}" >&2
|
|
||||||
fail=1
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
if enabled "${ironbank_required}"; then
|
|
||||||
echo "supply chain gate failed: ${supply_status}" >&2
|
|
||||||
fail=1
|
|
||||||
else
|
|
||||||
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit "${fail}"
|
|
||||||
'''
|
'''
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -367,7 +178,7 @@ PY
|
|||||||
|
|
||||||
post {
|
post {
|
||||||
always {
|
always {
|
||||||
archiveArtifacts artifacts: 'build/*.json,build/*.out,build/*.rc,build/*.txt,build/*.xml', allowEmptyArchive: true, fingerprint: true
|
archiveArtifacts artifacts: 'build/quality-gate.out,build/quality-gate.rc', allowEmptyArchive: true, fingerprint: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
14
README.md
14
README.md
@ -97,15 +97,10 @@ Primary config path:
|
|||||||
Keep these fields accurate:
|
Keep these fields accurate:
|
||||||
- `expected_flux_source_url`
|
- `expected_flux_source_url`
|
||||||
- `expected_flux_branch`
|
- `expected_flux_branch`
|
||||||
- `startup.service_checklist_explicit_only`
|
|
||||||
- `startup.service_checklist`
|
- `startup.service_checklist`
|
||||||
- `startup.critical_service_endpoints`
|
- `startup.critical_service_endpoints`
|
||||||
- `startup.require_ingress_checklist`
|
- `startup.require_ingress_checklist`
|
||||||
- `startup.require_node_inventory_reachability`
|
- `startup.require_node_inventory_reachability`
|
||||||
- `startup.node_inventory_reachability_required_nodes`
|
|
||||||
- `startup.node_ssh_auth_required_nodes`
|
|
||||||
- `startup.flux_health_required_kustomizations`
|
|
||||||
- `startup.workload_convergence_required_namespaces`
|
|
||||||
- `startup.ignore_unavailable_nodes`
|
- `startup.ignore_unavailable_nodes`
|
||||||
- `coordination.role`
|
- `coordination.role`
|
||||||
- `coordination.peer_hosts`
|
- `coordination.peer_hosts`
|
||||||
@ -139,10 +134,9 @@ Installer behavior:
|
|||||||
|
|
||||||
When adding nodes or services:
|
When adding nodes or services:
|
||||||
1. Update inventory and node mapping in config.
|
1. Update inventory and node mapping in config.
|
||||||
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
|
2. Add/adjust service checklist entries for anything user-facing or critical.
|
||||||
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
|
3. Add/adjust ingress expectations for exposed services.
|
||||||
4. Add/adjust ingress expectations for exposed services.
|
4. Use temporary ignores only when truly intentional, then remove them.
|
||||||
5. Use temporary ignores only when truly intentional, then remove them.
|
5. Run `scripts/quality_gate.sh` before host deployment.
|
||||||
6. Run `scripts/quality_gate.sh` before host deployment.
|
|
||||||
|
|
||||||
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
|
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
|
||||||
|
|||||||
@ -51,7 +51,6 @@ startup:
|
|||||||
require_node_inventory_reachability: true
|
require_node_inventory_reachability: true
|
||||||
node_inventory_reachability_wait_seconds: 300
|
node_inventory_reachability_wait_seconds: 300
|
||||||
node_inventory_reachability_poll_seconds: 5
|
node_inventory_reachability_poll_seconds: 5
|
||||||
node_inventory_reachability_required_nodes: []
|
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
titan-09:
|
titan-09:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
@ -91,7 +90,6 @@ startup:
|
|||||||
admin_secret_name: keycloak-admin
|
admin_secret_name: keycloak-admin
|
||||||
admin_secret_username_key: username
|
admin_secret_username_key: username
|
||||||
admin_secret_password_key: password
|
admin_secret_password_key: password
|
||||||
service_checklist_explicit_only: false
|
|
||||||
service_checklist:
|
service_checklist:
|
||||||
- name: gitea-api
|
- name: gitea-api
|
||||||
url: https://scm.bstein.dev/api/healthz
|
url: https://scm.bstein.dev/api/healthz
|
||||||
@ -136,26 +134,18 @@ startup:
|
|||||||
require_node_ssh_auth: true
|
require_node_ssh_auth: true
|
||||||
node_ssh_auth_wait_seconds: 240
|
node_ssh_auth_wait_seconds: 240
|
||||||
node_ssh_auth_poll_seconds: 5
|
node_ssh_auth_poll_seconds: 5
|
||||||
node_ssh_auth_required_nodes: []
|
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
flux_health_required_kustomizations: []
|
|
||||||
ignore_flux_kustomizations: []
|
ignore_flux_kustomizations: []
|
||||||
require_workload_convergence: true
|
require_workload_convergence: true
|
||||||
workload_convergence_wait_seconds: 900
|
workload_convergence_wait_seconds: 900
|
||||||
workload_convergence_poll_seconds: 5
|
workload_convergence_poll_seconds: 5
|
||||||
workload_convergence_required_namespaces: []
|
|
||||||
ignore_workload_namespaces: []
|
ignore_workload_namespaces: []
|
||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes: []
|
ignore_unavailable_nodes: []
|
||||||
auto_recycle_stuck_pods: true
|
auto_recycle_stuck_pods: true
|
||||||
auto_quarantine_scheduling_storms: false
|
|
||||||
scheduling_storm_event_threshold: 30
|
|
||||||
scheduling_storm_window_seconds: 180
|
|
||||||
stuck_pod_grace_seconds: 180
|
stuck_pod_grace_seconds: 180
|
||||||
post_start_auto_heal_seconds: 60
|
|
||||||
dead_node_cleanup_grace_seconds: 300
|
|
||||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: ""
|
vault_unseal_breakglass_command: ""
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
@ -180,7 +170,6 @@ ups:
|
|||||||
target: pyrphoros@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.25
|
runtime_safety_factor: 1.25
|
||||||
on_battery_grace_seconds: 90
|
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@ -117,52 +117,8 @@ startup:
|
|||||||
require_node_inventory_reachability: true
|
require_node_inventory_reachability: true
|
||||||
node_inventory_reachability_wait_seconds: 300
|
node_inventory_reachability_wait_seconds: 300
|
||||||
node_inventory_reachability_poll_seconds: 5
|
node_inventory_reachability_poll_seconds: 5
|
||||||
node_inventory_reachability_required_nodes:
|
|
||||||
- titan-0a
|
|
||||||
- titan-0b
|
|
||||||
- titan-0c
|
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
titan-04:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-05:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-06:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-07:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-08:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-11:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-12:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-13:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-14:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-15:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-17:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-18:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-19:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-09:
|
titan-09:
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
@ -200,7 +156,6 @@ startup:
|
|||||||
admin_secret_name: keycloak-admin
|
admin_secret_name: keycloak-admin
|
||||||
admin_secret_username_key: username
|
admin_secret_username_key: username
|
||||||
admin_secret_password_key: password
|
admin_secret_password_key: password
|
||||||
service_checklist_explicit_only: true
|
|
||||||
service_checklist:
|
service_checklist:
|
||||||
- name: gitea-api
|
- name: gitea-api
|
||||||
url: https://scm.bstein.dev/api/healthz
|
url: https://scm.bstein.dev/api/healthz
|
||||||
@ -245,49 +200,18 @@ startup:
|
|||||||
require_node_ssh_auth: true
|
require_node_ssh_auth: true
|
||||||
node_ssh_auth_wait_seconds: 240
|
node_ssh_auth_wait_seconds: 240
|
||||||
node_ssh_auth_poll_seconds: 5
|
node_ssh_auth_poll_seconds: 5
|
||||||
node_ssh_auth_required_nodes:
|
|
||||||
- titan-0a
|
|
||||||
- titan-0b
|
|
||||||
- titan-0c
|
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
flux_health_required_kustomizations:
|
|
||||||
- flux-system/core
|
|
||||||
- flux-system/helm
|
|
||||||
- flux-system/traefik
|
|
||||||
- flux-system/cert-manager
|
|
||||||
- flux-system/longhorn
|
|
||||||
- flux-system/vault-csi
|
|
||||||
- flux-system/vault-injector
|
|
||||||
- flux-system/postgres
|
|
||||||
- flux-system/vault
|
|
||||||
- flux-system/keycloak
|
|
||||||
- flux-system/oauth2-proxy
|
|
||||||
- flux-system/gitea
|
|
||||||
- flux-system/monitoring
|
|
||||||
- flux-system/harbor
|
|
||||||
ignore_flux_kustomizations: []
|
ignore_flux_kustomizations: []
|
||||||
require_workload_convergence: true
|
require_workload_convergence: true
|
||||||
workload_convergence_wait_seconds: 900
|
workload_convergence_wait_seconds: 900
|
||||||
workload_convergence_poll_seconds: 5
|
workload_convergence_poll_seconds: 5
|
||||||
workload_convergence_required_namespaces:
|
|
||||||
- vault
|
|
||||||
- postgres
|
|
||||||
- sso
|
|
||||||
- gitea
|
|
||||||
- monitoring
|
|
||||||
- harbor
|
|
||||||
ignore_workload_namespaces: []
|
ignore_workload_namespaces: []
|
||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes: []
|
ignore_unavailable_nodes: []
|
||||||
auto_recycle_stuck_pods: true
|
auto_recycle_stuck_pods: true
|
||||||
auto_quarantine_scheduling_storms: true
|
|
||||||
scheduling_storm_event_threshold: 30
|
|
||||||
scheduling_storm_window_seconds: 180
|
|
||||||
stuck_pod_grace_seconds: 180
|
stuck_pod_grace_seconds: 180
|
||||||
post_start_auto_heal_seconds: 60
|
|
||||||
dead_node_cleanup_grace_seconds: 300
|
|
||||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
@ -311,7 +235,6 @@ ups:
|
|||||||
target: statera@localhost
|
target: statera@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.25
|
runtime_safety_factor: 1.25
|
||||||
on_battery_grace_seconds: 90
|
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@ -117,52 +117,8 @@ startup:
|
|||||||
require_node_inventory_reachability: true
|
require_node_inventory_reachability: true
|
||||||
node_inventory_reachability_wait_seconds: 300
|
node_inventory_reachability_wait_seconds: 300
|
||||||
node_inventory_reachability_poll_seconds: 5
|
node_inventory_reachability_poll_seconds: 5
|
||||||
node_inventory_reachability_required_nodes:
|
|
||||||
- titan-0a
|
|
||||||
- titan-0b
|
|
||||||
- titan-0c
|
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
titan-04:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-05:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-06:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-07:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-08:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-11:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-12:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-13:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-14:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-15:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-17:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-18:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-19:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
longhorn-host: "true"
|
|
||||||
titan-09:
|
titan-09:
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
@ -200,7 +156,6 @@ startup:
|
|||||||
admin_secret_name: keycloak-admin
|
admin_secret_name: keycloak-admin
|
||||||
admin_secret_username_key: username
|
admin_secret_username_key: username
|
||||||
admin_secret_password_key: password
|
admin_secret_password_key: password
|
||||||
service_checklist_explicit_only: true
|
|
||||||
service_checklist:
|
service_checklist:
|
||||||
- name: gitea-api
|
- name: gitea-api
|
||||||
url: https://scm.bstein.dev/api/healthz
|
url: https://scm.bstein.dev/api/healthz
|
||||||
@ -245,49 +200,18 @@ startup:
|
|||||||
require_node_ssh_auth: true
|
require_node_ssh_auth: true
|
||||||
node_ssh_auth_wait_seconds: 240
|
node_ssh_auth_wait_seconds: 240
|
||||||
node_ssh_auth_poll_seconds: 5
|
node_ssh_auth_poll_seconds: 5
|
||||||
node_ssh_auth_required_nodes:
|
|
||||||
- titan-0a
|
|
||||||
- titan-0b
|
|
||||||
- titan-0c
|
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
flux_health_required_kustomizations:
|
|
||||||
- flux-system/core
|
|
||||||
- flux-system/helm
|
|
||||||
- flux-system/traefik
|
|
||||||
- flux-system/cert-manager
|
|
||||||
- flux-system/longhorn
|
|
||||||
- flux-system/vault-csi
|
|
||||||
- flux-system/vault-injector
|
|
||||||
- flux-system/postgres
|
|
||||||
- flux-system/vault
|
|
||||||
- flux-system/keycloak
|
|
||||||
- flux-system/oauth2-proxy
|
|
||||||
- flux-system/gitea
|
|
||||||
- flux-system/monitoring
|
|
||||||
- flux-system/harbor
|
|
||||||
ignore_flux_kustomizations: []
|
ignore_flux_kustomizations: []
|
||||||
require_workload_convergence: true
|
require_workload_convergence: true
|
||||||
workload_convergence_wait_seconds: 900
|
workload_convergence_wait_seconds: 900
|
||||||
workload_convergence_poll_seconds: 5
|
workload_convergence_poll_seconds: 5
|
||||||
workload_convergence_required_namespaces:
|
|
||||||
- vault
|
|
||||||
- postgres
|
|
||||||
- sso
|
|
||||||
- gitea
|
|
||||||
- monitoring
|
|
||||||
- harbor
|
|
||||||
ignore_workload_namespaces: []
|
ignore_workload_namespaces: []
|
||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes: []
|
ignore_unavailable_nodes: []
|
||||||
auto_recycle_stuck_pods: true
|
auto_recycle_stuck_pods: true
|
||||||
auto_quarantine_scheduling_storms: true
|
|
||||||
scheduling_storm_event_threshold: 30
|
|
||||||
scheduling_storm_window_seconds: 180
|
|
||||||
stuck_pod_grace_seconds: 180
|
stuck_pod_grace_seconds: 180
|
||||||
post_start_auto_heal_seconds: 60
|
|
||||||
dead_node_cleanup_grace_seconds: 300
|
|
||||||
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
|
||||||
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
|
||||||
vault_unseal_breakglass_timeout_seconds: 15
|
vault_unseal_breakglass_timeout_seconds: 15
|
||||||
@ -311,7 +235,6 @@ ups:
|
|||||||
target: pyrphoros@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.25
|
runtime_safety_factor: 1.25
|
||||||
on_battery_grace_seconds: 90
|
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
|
|||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
seen := map[string]struct{}{}
|
seen := map[string]struct{}{}
|
||||||
targets := make([]string, 0, len(nodes))
|
targets := make([]string, 0, len(nodes))
|
||||||
for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
|
for _, node := range nodes {
|
||||||
node = strings.TrimSpace(node)
|
node = strings.TrimSpace(node)
|
||||||
if node == "" {
|
if node == "" {
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -1,288 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
type nodeReadyList struct {
|
|
||||||
Items []struct {
|
|
||||||
Metadata struct {
|
|
||||||
Name string `json:"name"`
|
|
||||||
} `json:"metadata"`
|
|
||||||
Status struct {
|
|
||||||
Conditions []struct {
|
|
||||||
Type string `json:"type"`
|
|
||||||
Status string `json:"status"`
|
|
||||||
} `json:"conditions"`
|
|
||||||
} `json:"status"`
|
|
||||||
} `json:"items"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type podDeleteList struct {
|
|
||||||
Items []struct {
|
|
||||||
Metadata struct {
|
|
||||||
Namespace string `json:"namespace"`
|
|
||||||
Name string `json:"name"`
|
|
||||||
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
|
|
||||||
} `json:"metadata"`
|
|
||||||
Spec struct {
|
|
||||||
NodeName string `json:"nodeName"`
|
|
||||||
} `json:"spec"`
|
|
||||||
} `json:"items"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// RunPostStartAutoHeal runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
|
|
||||||
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
|
|
||||||
// post-start drift without rerunning the full startup flow.
|
|
||||||
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
|
|
||||||
return o.postStartAutoHeal(ctx)
|
|
||||||
}
|
|
||||||
|
|
||||||
// postStartAutoHeal runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
|
|
||||||
// Why: centralizes bounded post-start repair actions so recurring outage
|
|
||||||
// patterns only trigger the specific remediation they need.
|
|
||||||
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
|
|
||||||
if o.runner.DryRun {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
errs := []string{}
|
|
||||||
requestReconcile := false
|
|
||||||
|
|
||||||
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
|
||||||
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
|
|
||||||
}
|
|
||||||
|
|
||||||
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
|
|
||||||
if err != nil {
|
|
||||||
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
|
|
||||||
} else if vaultRecovered {
|
|
||||||
requestReconcile = true
|
|
||||||
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
|
|
||||||
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
|
|
||||||
if err != nil {
|
|
||||||
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
|
|
||||||
} else if cleaned > 0 {
|
|
||||||
requestReconcile = true
|
|
||||||
}
|
|
||||||
|
|
||||||
if requestReconcile {
|
|
||||||
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
|
|
||||||
return o.requestFluxReconcile(ctx)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(errs) > 0 {
|
|
||||||
return errors.New(strings.Join(errs, "; "))
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// autoRecoverSealedVault runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
|
|
||||||
// Why: lets the daemon repair a later Vault reseal without waiting for a new
|
|
||||||
// bootstrap run.
|
|
||||||
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
|
|
||||||
if o.runner.DryRun {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
|
||||||
if err != nil {
|
|
||||||
if isNotFoundErr(err) {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
return false, fmt.Errorf("vault pod phase check failed: %w", err)
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(phase) != "Running" {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
sealed, err := o.vaultSealed(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return false, err
|
|
||||||
}
|
|
||||||
if !sealed {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
|
|
||||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
|
||||||
return false, err
|
|
||||||
}
|
|
||||||
return true, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
|
|
||||||
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
|
|
||||||
// downstream secret consumers stop carrying stale failures from the sealed window.
|
|
||||||
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
|
|
||||||
if o.runner.DryRun {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
|
|
||||||
if _, err := o.kubectl(
|
|
||||||
ctx,
|
|
||||||
25*time.Second,
|
|
||||||
"-n", "vault",
|
|
||||||
"create", "job",
|
|
||||||
"--from=cronjob/vault-k8s-auth-config",
|
|
||||||
jobName,
|
|
||||||
); err != nil {
|
|
||||||
return fmt.Errorf("create job %s: %w", jobName, err)
|
|
||||||
}
|
|
||||||
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
|
|
||||||
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
|
|
||||||
// clear only that narrow failure class instead of leaving garbage behind forever.
|
|
||||||
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
|
|
||||||
if o.runner.DryRun {
|
|
||||||
return 0, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
unavailable, err := o.unavailableNodeSet(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
if len(unavailable) == 0 {
|
|
||||||
return 0, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
|
||||||
if err != nil {
|
|
||||||
return 0, fmt.Errorf("query pods: %w", err)
|
|
||||||
}
|
|
||||||
var pods podDeleteList
|
|
||||||
if err := json.Unmarshal([]byte(out), &pods); err != nil {
|
|
||||||
return 0, fmt.Errorf("decode pods: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
|
|
||||||
now := time.Now()
|
|
||||||
count := 0
|
|
||||||
for _, item := range pods.Items {
|
|
||||||
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
|
|
||||||
if _, err := o.kubectl(
|
|
||||||
ctx,
|
|
||||||
20*time.Second,
|
|
||||||
"-n", item.Metadata.Namespace,
|
|
||||||
"delete", "pod", item.Metadata.Name,
|
|
||||||
"--grace-period=0",
|
|
||||||
"--force",
|
|
||||||
"--wait=false",
|
|
||||||
); err != nil && !isNotFoundErr(err) {
|
|
||||||
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
|
|
||||||
}
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
if count > 0 {
|
|
||||||
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
|
|
||||||
}
|
|
||||||
return count, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// unavailableNodeSet runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
|
|
||||||
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
|
|
||||||
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
|
|
||||||
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("query nodes: %w", err)
|
|
||||||
}
|
|
||||||
var nodes nodeReadyList
|
|
||||||
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
|
|
||||||
return nil, fmt.Errorf("decode nodes: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
unavailable := map[string]struct{}{}
|
|
||||||
for _, item := range nodes.Items {
|
|
||||||
ready := ""
|
|
||||||
for _, cond := range item.Status.Conditions {
|
|
||||||
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
|
|
||||||
ready = strings.TrimSpace(cond.Status)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ready != "True" {
|
|
||||||
unavailable[item.Metadata.Name] = struct{}{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return unavailable, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// requestFluxReconcile runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
|
|
||||||
// Why: post-start repairs need a lightweight way to refresh GitOps health
|
|
||||||
// without reusing the broader startup flux-resume flow.
|
|
||||||
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
|
|
||||||
if o.runner.DryRun {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
now := time.Now().UTC().Format(time.RFC3339)
|
|
||||||
if _, err := o.kubectl(
|
|
||||||
ctx,
|
|
||||||
25*time.Second,
|
|
||||||
"-n", "flux-system",
|
|
||||||
"annotate", "gitrepository", "flux-system",
|
|
||||||
"reconcile.fluxcd.io/requestedAt="+now,
|
|
||||||
"--overwrite",
|
|
||||||
); err != nil {
|
|
||||||
return fmt.Errorf("annotate flux source reconcile: %w", err)
|
|
||||||
}
|
|
||||||
if _, err := o.kubectl(
|
|
||||||
ctx,
|
|
||||||
25*time.Second,
|
|
||||||
"-n", "flux-system",
|
|
||||||
"annotate",
|
|
||||||
"kustomizations.kustomize.toolkit.fluxcd.io",
|
|
||||||
"--all",
|
|
||||||
"reconcile.fluxcd.io/requestedAt="+now,
|
|
||||||
"--overwrite",
|
|
||||||
); err != nil {
|
|
||||||
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
|
|
||||||
}
|
|
||||||
if _, err := o.kubectl(
|
|
||||||
ctx,
|
|
||||||
25*time.Second,
|
|
||||||
"annotate",
|
|
||||||
"--all-namespaces",
|
|
||||||
"helmreleases.helm.toolkit.fluxcd.io",
|
|
||||||
"--all",
|
|
||||||
"reconcile.fluxcd.io/requestedAt="+now,
|
|
||||||
"--overwrite",
|
|
||||||
); err != nil {
|
|
||||||
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
|
|
||||||
}
|
|
||||||
if o.runOverride == nil && o.runner.CommandExists("flux") {
|
|
||||||
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
|
|
||||||
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
@ -1,296 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/execx"
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
|
|
||||||
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
|
|
||||||
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
|
|
||||||
// truly stranded pods and tolerates already-gone objects.
|
|
||||||
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
|
|
||||||
t.Run("dry run skips", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
|
||||||
orch.runner.DryRun = true
|
|
||||||
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
|
||||||
if err != nil || count != 0 {
|
|
||||||
t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
|
|
||||||
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
||||||
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
||||||
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
|
|
||||||
}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get nodes -o json"),
|
|
||||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get pods -A -o json"),
|
|
||||||
out: `{"items":[` +
|
|
||||||
`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
|
|
||||||
`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
|
|
||||||
`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
|
|
||||||
`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
|
|
||||||
err: errors.New("pod old-stale not found"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
|
|
||||||
}
|
|
||||||
if count != 1 {
|
|
||||||
t.Fatalf("expected one cleaned pod, got %d", count)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("query and decode errors surface", func(t *testing.T) {
|
|
||||||
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get nodes -o json"),
|
|
||||||
err: errors.New("nodes failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
|
|
||||||
t.Fatalf("expected node query error, got %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get nodes -o json"),
|
|
||||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get pods -A -o json"),
|
|
||||||
out: `{bad json`,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
|
|
||||||
t.Fatalf("expected pod decode error, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("delete hard error surfaces", func(t *testing.T) {
|
|
||||||
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
|
||||||
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
|
|
||||||
}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get nodes -o json"),
|
|
||||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get pods -A -o json"),
|
|
||||||
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
|
|
||||||
err: errors.New("delete failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
|
|
||||||
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
|
|
||||||
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
|
|
||||||
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
|
|
||||||
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
|
|
||||||
// Ready condition payloads need direct coverage too.
|
|
||||||
func TestUnavailableNodeSetBranches(t *testing.T) {
|
|
||||||
t.Run("decode error surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
|
|
||||||
})
|
|
||||||
if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
|
|
||||||
t.Fatalf("expected decode error, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get nodes -o json"),
|
|
||||||
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
nodes, err := orch.unavailableNodeSet(context.Background())
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("unavailableNodeSet failed: %v", err)
|
|
||||||
}
|
|
||||||
if _, ok := nodes["titan-22"]; !ok {
|
|
||||||
t.Fatalf("expected titan-22 to be treated as unavailable")
|
|
||||||
}
|
|
||||||
if _, ok := nodes["titan-07"]; ok {
|
|
||||||
t.Fatalf("did not expect titan-07 to be treated as unavailable")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
|
|
||||||
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
|
|
||||||
// Why: the post-start repair loop needs predictable Flux refresh behavior even
|
|
||||||
// when one annotation call is flaky.
|
|
||||||
func TestRequestFluxReconcileBranches(t *testing.T) {
|
|
||||||
t.Run("dry run skips", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
|
||||||
orch.runner.DryRun = true
|
|
||||||
if err := orch.requestFluxReconcile(context.Background()); err != nil {
|
|
||||||
t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("git source annotate error surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
|
|
||||||
err: errors.New("annotate failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
|
|
||||||
t.Fatalf("expected gitrepository annotate error, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
|
|
||||||
out: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
|
|
||||||
err: errors.New("annotate failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
|
|
||||||
t.Fatalf("expected kustomization annotate error, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
|
|
||||||
tmpDir := t.TempDir()
|
|
||||||
callLog := filepath.Join(tmpDir, "calls.log")
|
|
||||||
kubectlPath := filepath.Join(tmpDir, "kubectl")
|
|
||||||
fluxPath := filepath.Join(tmpDir, "flux")
|
|
||||||
|
|
||||||
kubectlScript := "#!/bin/sh\n" +
|
|
||||||
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
|
||||||
"case \"$*\" in\n" +
|
|
||||||
" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
|
|
||||||
"esac\n" +
|
|
||||||
"exit 0\n"
|
|
||||||
fluxScript := "#!/bin/sh\n" +
|
|
||||||
"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
|
||||||
"exit 0\n"
|
|
||||||
|
|
||||||
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
|
|
||||||
t.Fatalf("write fake kubectl: %v", err)
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
|
|
||||||
t.Fatalf("write fake flux: %v", err)
|
|
||||||
}
|
|
||||||
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
|
|
||||||
|
|
||||||
cfg := config.Config{
|
|
||||||
State: config.State{
|
|
||||||
Dir: t.TempDir(),
|
|
||||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
|
||||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
orch := &Orchestrator{
|
|
||||||
cfg: cfg,
|
|
||||||
runner: &execx.Runner{},
|
|
||||||
store: state.New(cfg.State.RunHistoryPath),
|
|
||||||
log: log.New(io.Discard, "", 0),
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := orch.requestFluxReconcile(context.Background()); err != nil {
|
|
||||||
t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
|
|
||||||
}
|
|
||||||
calls, err := os.ReadFile(callLog)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("read fake command log: %v", err)
|
|
||||||
}
|
|
||||||
logText := string(calls)
|
|
||||||
if !strings.Contains(logText, "annotate gitrepository flux-system") {
|
|
||||||
t.Fatalf("expected gitrepository annotate call, got %q", logText)
|
|
||||||
}
|
|
||||||
if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
|
|
||||||
t.Fatalf("expected kustomization annotate call, got %q", logText)
|
|
||||||
}
|
|
||||||
if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
|
|
||||||
t.Fatalf("expected flux reconcile command, got %q", logText)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("flux command failure is tolerated", func(t *testing.T) {
|
|
||||||
tmpDir := t.TempDir()
|
|
||||||
callLog := filepath.Join(tmpDir, "calls.log")
|
|
||||||
kubectlPath := filepath.Join(tmpDir, "kubectl")
|
|
||||||
fluxPath := filepath.Join(tmpDir, "flux")
|
|
||||||
|
|
||||||
kubectlScript := "#!/bin/sh\n" +
|
|
||||||
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
|
||||||
"exit 0\n"
|
|
||||||
fluxScript := "#!/bin/sh\n" +
|
|
||||||
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
|
|
||||||
"exit 1\n"
|
|
||||||
|
|
||||||
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
|
|
||||||
t.Fatalf("write fake kubectl: %v", err)
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
|
|
||||||
t.Fatalf("write fake flux: %v", err)
|
|
||||||
}
|
|
||||||
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
|
|
||||||
|
|
||||||
cfg := config.Config{
|
|
||||||
State: config.State{
|
|
||||||
Dir: t.TempDir(),
|
|
||||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
|
||||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
orch := &Orchestrator{
|
|
||||||
cfg: cfg,
|
|
||||||
runner: &execx.Runner{},
|
|
||||||
store: state.New(cfg.State.RunHistoryPath),
|
|
||||||
log: log.New(io.Discard, "", 0),
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := orch.requestFluxReconcile(context.Background()); err != nil {
|
|
||||||
t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
|
|
||||||
}
|
|
||||||
calls, err := os.ReadFile(callLog)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("read fake command log: %v", err)
|
|
||||||
}
|
|
||||||
if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
|
|
||||||
t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@ -1,382 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"encoding/base64"
|
|
||||||
"errors"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/execx"
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
|
|
||||||
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
|
|
||||||
// Why: covers the new daemon-triggered repair path for late Vault reseals and
|
|
||||||
// stale terminating pods anchored to unavailable nodes.
|
|
||||||
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
|
|
||||||
cfg := config.Config{
|
|
||||||
Startup: config.Startup{
|
|
||||||
DeadNodeCleanupGraceSeconds: 300,
|
|
||||||
RequiredNodeLabels: map[string]map[string]string{
|
|
||||||
"titan-07": {"node-role.kubernetes.io/worker": "true"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
State: config.State{
|
|
||||||
Dir: t.TempDir(),
|
|
||||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
|
||||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
orch := &Orchestrator{
|
|
||||||
cfg: cfg,
|
|
||||||
runner: &execx.Runner{},
|
|
||||||
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
|
|
||||||
log: log.New(io.Discard, "", 0),
|
|
||||||
}
|
|
||||||
|
|
||||||
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
|
||||||
unsealCalls := 0
|
|
||||||
jobCreated := false
|
|
||||||
reconciled := false
|
|
||||||
deleted := map[string]bool{}
|
|
||||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
if name != "kubectl" {
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
joined := strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
|
||||||
return "Running", nil
|
|
||||||
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
|
|
||||||
if unsealCalls == 0 {
|
|
||||||
return `{"initialized":true,"sealed":true}`, nil
|
|
||||||
}
|
|
||||||
return `{"initialized":true,"sealed":false}`, nil
|
|
||||||
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
|
|
||||||
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
|
|
||||||
case strings.Contains(joined, "vault operator unseal"):
|
|
||||||
unsealCalls++
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
|
|
||||||
jobCreated = true
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "get nodes -o json"):
|
|
||||||
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
|
||||||
case strings.Contains(joined, "get pods -A -o json"):
|
|
||||||
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
|
|
||||||
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
|
|
||||||
deleted["maintenance/stale-pod"] = true
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
|
|
||||||
reconciled = true
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
|
|
||||||
return "", nil
|
|
||||||
default:
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orch.SetCommandOverrides(dispatch, dispatch)
|
|
||||||
|
|
||||||
if err := orch.postStartAutoHeal(context.Background()); err != nil {
|
|
||||||
t.Fatalf("postStartAutoHeal failed: %v", err)
|
|
||||||
}
|
|
||||||
if unsealCalls != 1 {
|
|
||||||
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
|
|
||||||
}
|
|
||||||
if !jobCreated {
|
|
||||||
t.Fatalf("expected vault k8s auth config job to be created")
|
|
||||||
}
|
|
||||||
if !deleted["maintenance/stale-pod"] {
|
|
||||||
t.Fatalf("expected stale unavailable-node pod to be deleted")
|
|
||||||
}
|
|
||||||
if !reconciled {
|
|
||||||
t.Fatalf("expected flux reconcile request after repairs")
|
|
||||||
}
|
|
||||||
if deleted["logging/healthy-node-pod"] {
|
|
||||||
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
|
|
||||||
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
|
|
||||||
// Why: proves the new post-start repair loop stays quiet when the specific
|
|
||||||
// failure patterns are absent.
|
|
||||||
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
|
|
||||||
cfg := config.Config{
|
|
||||||
Startup: config.Startup{
|
|
||||||
DeadNodeCleanupGraceSeconds: 300,
|
|
||||||
},
|
|
||||||
State: config.State{
|
|
||||||
Dir: t.TempDir(),
|
|
||||||
ReportsDir: filepath.Join(t.TempDir(), "reports"),
|
|
||||||
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
orch := &Orchestrator{
|
|
||||||
cfg: cfg,
|
|
||||||
runner: &execx.Runner{},
|
|
||||||
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
|
|
||||||
log: log.New(io.Discard, "", 0),
|
|
||||||
}
|
|
||||||
|
|
||||||
unsealCalls := 0
|
|
||||||
jobCreated := false
|
|
||||||
reconciled := false
|
|
||||||
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
if name != "kubectl" {
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
joined := strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
|
||||||
return "Running", nil
|
|
||||||
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
|
|
||||||
return `{"initialized":true,"sealed":false}`, nil
|
|
||||||
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
|
|
||||||
jobCreated = true
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "vault operator unseal"):
|
|
||||||
unsealCalls++
|
|
||||||
return "", nil
|
|
||||||
case strings.Contains(joined, "get nodes -o json"):
|
|
||||||
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
|
|
||||||
case strings.Contains(joined, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
|
|
||||||
reconciled = true
|
|
||||||
return "", nil
|
|
||||||
default:
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orch.SetCommandOverrides(dispatch, dispatch)
|
|
||||||
|
|
||||||
if err := orch.postStartAutoHeal(context.Background()); err != nil {
|
|
||||||
t.Fatalf("postStartAutoHeal failed: %v", err)
|
|
||||||
}
|
|
||||||
if unsealCalls != 0 {
|
|
||||||
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
|
|
||||||
}
|
|
||||||
if jobCreated {
|
|
||||||
t.Fatalf("did not expect vault auth config job creation")
|
|
||||||
}
|
|
||||||
if reconciled {
|
|
||||||
t.Fatalf("did not expect flux reconcile request for healthy cluster")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
|
|
||||||
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
|
|
||||||
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
|
|
||||||
// auto-heal never mutates cluster state during rehearsal runs.
|
|
||||||
func TestRunPostStartAutoHealDryRun(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
|
||||||
orch.runner.DryRun = true
|
|
||||||
|
|
||||||
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
|
|
||||||
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
|
|
||||||
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
|
|
||||||
// Why: proves the daemon reports each failed sub-repair together instead of
|
|
||||||
// hiding later failures behind the first problem.
|
|
||||||
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
|
|
||||||
cfg := config.Config{
|
|
||||||
Startup: config.Startup{
|
|
||||||
DeadNodeCleanupGraceSeconds: 300,
|
|
||||||
RequiredNodeLabels: map[string]map[string]string{
|
|
||||||
"titan-07": {"node-role.kubernetes.io/worker": "true"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
|
|
||||||
err: errors.New("label failed"),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
err: errors.New("vault phase failed"),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "get nodes -o json"),
|
|
||||||
err: errors.New("node query failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
err := orch.postStartAutoHeal(context.Background())
|
|
||||||
if err == nil {
|
|
||||||
t.Fatalf("expected aggregated error")
|
|
||||||
}
|
|
||||||
msg := err.Error()
|
|
||||||
for _, want := range []string{
|
|
||||||
"required node labels:",
|
|
||||||
"vault auto-recovery:",
|
|
||||||
"dead-node terminating pod cleanup:",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(msg, want) {
|
|
||||||
t.Fatalf("expected %q in %q", want, msg)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
|
|
||||||
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
|
|
||||||
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
|
|
||||||
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
|
|
||||||
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
|
|
||||||
t.Run("dry run skips", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
|
||||||
orch.runner.DryRun = true
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if err != nil || recovered {
|
|
||||||
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("pod missing is quiet", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
err: errors.New("vault-0 not found"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if err != nil || recovered {
|
|
||||||
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("phase check error surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
err: errors.New("phase check failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
|
|
||||||
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("non-running pod defers", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
out: "Pending",
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if err != nil || recovered {
|
|
||||||
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("status parse failure surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
out: "Running",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
|
||||||
out: "garbage",
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
|
|
||||||
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("already unsealed stays quiet", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
out: "Running",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
|
||||||
out: `{"sealed":false}`,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if err != nil || recovered {
|
|
||||||
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("unseal failure surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
|
|
||||||
out: "Running",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
|
|
||||||
out: `{"sealed":true}`,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
|
|
||||||
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "vault operator unseal"),
|
|
||||||
err: errors.New("exec boom"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
recovered, err := orch.autoRecoverSealedVault(context.Background())
|
|
||||||
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
|
|
||||||
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
|
|
||||||
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
|
|
||||||
// Why: the post-unseal auth job is part of the production recovery chain, so
|
|
||||||
// dry-run and create-error behavior both need explicit coverage.
|
|
||||||
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
|
|
||||||
t.Run("dry run skips", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
|
|
||||||
orch.runner.DryRun = true
|
|
||||||
if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
|
|
||||||
t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("create error surfaces", func(t *testing.T) {
|
|
||||||
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
|
|
||||||
{
|
|
||||||
match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
|
|
||||||
err: errors.New("create failed"),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
err := orch.rerunVaultK8sAuthConfigJob(context.Background())
|
|
||||||
if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
|
|
||||||
t.Fatalf("expected create-job error, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@ -227,31 +227,6 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
|
|||||||
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
|
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
|
|
||||||
// Why: lets startup defer vault unseal until the pod is actually runnable, while
|
|
||||||
// keeping the direct unseal helper strict for explicit recovery paths and tests.
|
|
||||||
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
|
|
||||||
if o.runner.DryRun {
|
|
||||||
return false, "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
|
||||||
if err != nil {
|
|
||||||
if isNotFoundErr(err) {
|
|
||||||
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
|
|
||||||
}
|
|
||||||
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
trimmedPhase := strings.TrimSpace(phase)
|
|
||||||
if trimmedPhase != "Running" {
|
|
||||||
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return false, "", o.ensureVaultUnsealed(ctx)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ensureVaultUnsealed runs one orchestration or CLI step.
|
// ensureVaultUnsealed runs one orchestration or CLI step.
|
||||||
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
|
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
|
|||||||
@ -143,8 +143,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
|||||||
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
|
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
|
||||||
}
|
}
|
||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
|
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
|
||||||
required := o.startupRequiredFluxKustomizations()
|
|
||||||
requiredSeen := map[string]struct{}{}
|
|
||||||
notReady := []string{}
|
notReady := []string{}
|
||||||
for _, ks := range list.Items {
|
for _, ks := range list.Items {
|
||||||
ns := strings.TrimSpace(ks.Metadata.Namespace)
|
ns := strings.TrimSpace(ks.Metadata.Namespace)
|
||||||
@ -156,12 +154,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
|||||||
if ks.Spec.Suspend {
|
if ks.Spec.Suspend {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(required) > 0 {
|
|
||||||
if _, ok := required[full]; !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
requiredSeen[full] = struct{}{}
|
|
||||||
}
|
|
||||||
if _, ok := ignored[full]; ok {
|
if _, ok := ignored[full]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -181,25 +173,10 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
|||||||
}
|
}
|
||||||
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
|
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
|
||||||
}
|
}
|
||||||
if len(required) > 0 {
|
|
||||||
missing := []string{}
|
|
||||||
for full := range required {
|
|
||||||
if _, ok := requiredSeen[full]; !ok {
|
|
||||||
missing = append(missing, full+"(missing)")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(missing) > 0 {
|
|
||||||
sort.Strings(missing)
|
|
||||||
notReady = append(notReady, missing...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(notReady) > 0 {
|
if len(notReady) > 0 {
|
||||||
sort.Strings(notReady)
|
sort.Strings(notReady)
|
||||||
return false, "not ready: " + joinLimited(notReady, 6), nil
|
return false, "not ready: " + joinLimited(notReady, 6), nil
|
||||||
}
|
}
|
||||||
if len(required) > 0 {
|
|
||||||
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
|
|
||||||
}
|
|
||||||
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
|
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -19,7 +19,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
|||||||
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
|
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
|
||||||
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
|
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
|
||||||
for node := range o.cfg.Startup.RequiredNodeLabels {
|
for node := range o.cfg.Startup.RequiredNodeLabels {
|
||||||
node = strings.TrimSpace(node)
|
node = strings.TrimSpace(node)
|
||||||
@ -29,10 +28,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
sort.Strings(nodes)
|
sort.Strings(nodes)
|
||||||
for _, node := range nodes {
|
for _, node := range nodes {
|
||||||
if _, skip := ignored[node]; skip {
|
|
||||||
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
labels := o.cfg.Startup.RequiredNodeLabels[node]
|
labels := o.cfg.Startup.RequiredNodeLabels[node]
|
||||||
if len(labels) == 0 {
|
if len(labels) == 0 {
|
||||||
continue
|
continue
|
||||||
@ -60,11 +55,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
|
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
|
||||||
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
|
|
||||||
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
|
|
||||||
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
|
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
|
||||||
}
|
}
|
||||||
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
|
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
|
||||||
|
|||||||
@ -37,7 +37,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
return invErr
|
return invErr
|
||||||
}
|
}
|
||||||
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
|
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
|
||||||
o.maybeRunEarlyVaultUnseal(ctx)
|
|
||||||
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
|
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
|
||||||
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
|
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
|
||||||
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
|
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
|
||||||
@ -180,9 +179,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
|
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
|
||||||
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -480,3 +476,18 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
|
|||||||
o.log.Printf("shutdown flow complete")
|
o.log.Printf("shutdown flow complete")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
	mode := strings.TrimSpace(raw)
	// Empty and legacy "config" both collapse to the canonical mode.
	if mode == "" || mode == "config" || mode == "cluster-only" {
		return "cluster-only", nil
	}
	if mode == "poweroff" {
		return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
	}
	return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
|
||||||
|
|||||||
@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
|
|||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
|
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
|
||||||
seen := map[string]struct{}{}
|
seen := map[string]struct{}{}
|
||||||
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
|
for _, node := range o.inventoryNodesForValidation() {
|
||||||
node = strings.TrimSpace(node)
|
node = strings.TrimSpace(node)
|
||||||
if node == "" {
|
if node == "" {
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -1,261 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"sort"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
|
|
||||||
// Why: a non-core workload that cannot schedule can emit enough warning events to
|
|
||||||
// thrash the control plane datastore; quarantine keeps startup moving while
|
|
||||||
// preserving core services.
|
|
||||||
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
|
|
||||||
if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
now := time.Now()
|
|
||||||
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if lastAttempt != nil {
|
|
||||||
*lastAttempt = now
|
|
||||||
}
|
|
||||||
o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
|
|
||||||
return o.quarantineSchedulingStormWorkloads(ctx)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
|
|
||||||
// Why: limits startup-only mitigation to workloads proven to be generating a
|
|
||||||
// scheduling event storm, instead of scaling optional apps down blindly.
|
|
||||||
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
|
|
||||||
podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("query pods for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
var pods podList
|
|
||||||
if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
|
|
||||||
return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
var rsList replicaSetList
|
|
||||||
if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
|
|
||||||
return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("query events for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
var events eventList
|
|
||||||
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
|
|
||||||
return fmt.Errorf("decode events for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
var workloads workloadList
|
|
||||||
if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
|
|
||||||
return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
|
||||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
|
||||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
|
||||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
|
||||||
eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
|
|
||||||
if eventThreshold <= 0 {
|
|
||||||
eventThreshold = 30
|
|
||||||
}
|
|
||||||
window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
|
|
||||||
if window <= 0 {
|
|
||||||
window = 3 * time.Minute
|
|
||||||
}
|
|
||||||
|
|
||||||
podsByKey := map[string]podResource{}
|
|
||||||
for _, pod := range pods.Items {
|
|
||||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
|
||||||
name := strings.TrimSpace(pod.Metadata.Name)
|
|
||||||
if ns == "" || name == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
podsByKey[ns+"/"+name] = pod
|
|
||||||
}
|
|
||||||
|
|
||||||
rsOwners := map[string]ownerReference{}
|
|
||||||
for _, rs := range rsList.Items {
|
|
||||||
ns := strings.TrimSpace(rs.Metadata.Namespace)
|
|
||||||
name := strings.TrimSpace(rs.Metadata.Name)
|
|
||||||
if ns == "" || name == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
for _, owner := range rs.Metadata.OwnerReferences {
|
|
||||||
kind := strings.TrimSpace(owner.Kind)
|
|
||||||
ownerName := strings.TrimSpace(owner.Name)
|
|
||||||
if kind == "" || ownerName == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
rsOwners[ns+"/"+name] = owner
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
workloadDesired := map[string]int32{}
|
|
||||||
for _, item := range workloads.Items {
|
|
||||||
kind := strings.ToLower(strings.TrimSpace(item.Kind))
|
|
||||||
ns := strings.TrimSpace(item.Metadata.Namespace)
|
|
||||||
name := strings.TrimSpace(item.Metadata.Name)
|
|
||||||
if kind == "" || ns == "" || name == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
desired, _, ok := desiredReady(item)
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
workloadDesired[ns+"/"+kind+"/"+name] = desired
|
|
||||||
}
|
|
||||||
|
|
||||||
quarantined := []string{}
|
|
||||||
seen := map[string]struct{}{}
|
|
||||||
now := time.Now()
|
|
||||||
for _, event := range events.Items {
|
|
||||||
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(event.Reason) != "FailedScheduling" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
lastSeen := eventLastObservedAt(event)
|
|
||||||
if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
count := eventObservationCount(event)
|
|
||||||
if count < eventThreshold {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
|
|
||||||
pod, ok := podsByKey[podKey]
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
|
||||||
if _, ok := requiredNamespaces[ns]; ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if _, ok := ignoredNamespaces[ns]; ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if podTargetsIgnoredNode(pod, ignoredNodes) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
|
|
||||||
if _, done := seen[workloadKey]; done {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
desired := workloadDesired[workloadKey]
|
|
||||||
if desired <= 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
|
|
||||||
return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
|
|
||||||
}
|
|
||||||
seen[workloadKey] = struct{}{}
|
|
||||||
quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(quarantined) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
sort.Strings(quarantined)
|
|
||||||
detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
|
|
||||||
o.log.Printf("%s", detail)
|
|
||||||
o.noteStartupAutoHeal(detail)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// schedulingStormOwnerWorkload runs one orchestration or CLI step.
|
|
||||||
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
|
|
||||||
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
|
|
||||||
// operate on the owning deployment or statefulset.
|
|
||||||
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
|
|
||||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
|
||||||
for _, owner := range pod.Metadata.OwnerReferences {
|
|
||||||
switch strings.TrimSpace(owner.Kind) {
|
|
||||||
case "StatefulSet":
|
|
||||||
if name := strings.TrimSpace(owner.Name); name != "" {
|
|
||||||
return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
|
|
||||||
}
|
|
||||||
case "ReplicaSet":
|
|
||||||
rsName := strings.TrimSpace(owner.Name)
|
|
||||||
if rsName == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
rsOwner, ok := rsOwners[ns+"/"+rsName]
|
|
||||||
if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return startupWorkload{}, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// eventObservationCount runs one orchestration or CLI step.
|
|
||||||
// Signature: eventObservationCount(event eventResource) int.
|
|
||||||
// Why: event count can live either on the root event or in the series payload;
|
|
||||||
// using the max keeps detection stable across Kubernetes versions.
|
|
||||||
func eventObservationCount(event eventResource) int {
|
|
||||||
count := event.Count
|
|
||||||
if event.Series.Count > count {
|
|
||||||
count = event.Series.Count
|
|
||||||
}
|
|
||||||
if count < 1 {
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
return count
|
|
||||||
}
|
|
||||||
|
|
||||||
// eventLastObservedAt runs one orchestration or CLI step.
|
|
||||||
// Signature: eventLastObservedAt(event eventResource) time.Time.
|
|
||||||
// Why: event recency fields vary by cluster version; prefer the newest explicit
|
|
||||||
// observation time and fall back to creation time when needed.
|
|
||||||
func eventLastObservedAt(event eventResource) time.Time {
|
|
||||||
switch {
|
|
||||||
case !event.Series.LastObservedTime.IsZero():
|
|
||||||
return event.Series.LastObservedTime
|
|
||||||
case !event.LastTimestamp.IsZero():
|
|
||||||
return event.LastTimestamp
|
|
||||||
case !event.EventTime.IsZero():
|
|
||||||
return event.EventTime
|
|
||||||
default:
|
|
||||||
return event.Metadata.CreationTimestamp
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
	mode := strings.TrimSpace(raw)
	// Empty and legacy "config" both collapse to the canonical mode.
	if mode == "" || mode == "config" || mode == "cluster-only" {
		return "cluster-only", nil
	}
	if mode == "poweroff" {
		return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
	}
	return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
|
|
||||||
@ -1,81 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import "strings"
|
|
||||||
|
|
||||||
// startupRequiredNodes runs one orchestration or CLI step.
|
|
||||||
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
|
|
||||||
// Why: lets startup enforce a smaller core node set during outage recovery
|
|
||||||
// without losing the stricter all-nodes behavior when no override is configured.
|
|
||||||
func startupRequiredNodes(nodes []string, required []string) []string {
|
|
||||||
requiredSet := makeStringSet(required)
|
|
||||||
if len(requiredSet) == 0 {
|
|
||||||
return nodes
|
|
||||||
}
|
|
||||||
filtered := make([]string, 0, len(nodes))
|
|
||||||
for _, node := range nodes {
|
|
||||||
node = strings.TrimSpace(node)
|
|
||||||
if node == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if _, ok := requiredSet[node]; ok {
|
|
||||||
filtered = append(filtered, node)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return filtered
|
|
||||||
}
|
|
||||||
|
|
||||||
// startupNodeStrictlyRequired runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
|
|
||||||
// Why: absent or broken non-core nodes should not block recovery-only actions
|
|
||||||
// like label reconciliation once the operator has narrowed startup to core nodes.
|
|
||||||
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
|
|
||||||
node = strings.TrimSpace(node)
|
|
||||||
if node == "" {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
for _, controlPlane := range o.cfg.ControlPlanes {
|
|
||||||
if strings.TrimSpace(controlPlane) == node {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
|
|
||||||
}
|
|
||||||
|
|
||||||
// startupRequiredFluxKustomizations returns the configured core Flux
// kustomization names as a lookup set.
//
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
// optional stacks free to converge after bootstrap succeeds. An empty set means
// no narrowing is configured and callers fall back to broad checks.
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
	return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
}
|
|
||||||
|
|
||||||
// startupRequiredWorkloadNamespaces returns the configured core workload
// namespaces as a lookup set.
//
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: keeps workload readiness scoped to core namespaces during recovery while
// preserving broad convergence checks when no explicit core list is configured
// (callers treat an empty set as "no narrowing").
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
	return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
}
|
|
||||||
|
|
||||||
// containsNode reports whether needle appears in entries, comparing with
// surrounding whitespace stripped from both sides.
//
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
func containsNode(entries []string, needle string) bool {
	want := strings.TrimSpace(needle)
	// A blank needle never matches, even against blank entries.
	if want == "" {
		return false
	}
	for i := range entries {
		if strings.TrimSpace(entries[i]) == want {
			return true
		}
	}
	return false
}
|
|
||||||
@ -1,52 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
|
|
||||||
// Why: gives startup a best-effort Vault recovery path when the API is already
|
|
||||||
// live, without consuming the hard startup failure path before workloads recover.
|
|
||||||
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
|
|
||||||
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
|
|
||||||
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
|
||||||
if err != nil {
|
|
||||||
o.log.Printf("warning: early vault unseal deferred: %v", err)
|
|
||||||
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if deferred {
|
|
||||||
o.log.Printf("vault early unseal deferred: %s", detail)
|
|
||||||
o.noteStartupAutoHeal(detail)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
|
|
||||||
}
|
|
||||||
|
|
||||||
// runStartupVaultUnsealGate runs one orchestration or CLI step.
|
|
||||||
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
|
|
||||||
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
|
|
||||||
// defer cleanly until critical workload recovery when the pod is not runnable yet.
|
|
||||||
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
|
|
||||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
|
|
||||||
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
|
||||||
if err != nil {
|
|
||||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if deferred {
|
|
||||||
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
|
|
||||||
o.noteStartupAutoHeal(detail)
|
|
||||||
o.noteStartupCheck("vault-unseal", true, detail)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
@ -177,46 +177,6 @@ type jobConditionRef struct {
|
|||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// eventList mirrors the Kubernetes List envelope for Event objects; only the
// items slice is decoded.
type eventList struct {
	Items []eventResource `json:"items"`
}

// eventResource is the subset of a Kubernetes Event this package inspects:
// the involved object's identity, the human-readable type/reason/message, and
// the count/timestamp fields whose availability varies by cluster version
// (normalized by eventObservationCount and eventLastObservedAt).
type eventResource struct {
	Metadata struct {
		Namespace string `json:"namespace"`
		// CreationTimestamp is the last-resort recency signal when no
		// explicit observation time is set on the event.
		CreationTimestamp time.Time `json:"creationTimestamp"`
	} `json:"metadata"`
	InvolvedObject struct {
		Kind      string `json:"kind"`
		Namespace string `json:"namespace"`
		Name      string `json:"name"`
	} `json:"involvedObject"`
	Type    string `json:"type"`
	Reason  string `json:"reason"`
	Message string `json:"message"`
	// Count is the legacy repeat counter; newer servers report repeats via
	// Series.Count instead.
	Count         int         `json:"count"`
	EventTime     time.Time   `json:"eventTime"`
	LastTimestamp time.Time   `json:"lastTimestamp"`
	Series        eventSeries `json:"series"`
}

// eventSeries carries the series fields used to normalize repeat counts and
// observation times across Event API versions.
type eventSeries struct {
	Count            int       `json:"count"`
	LastObservedTime time.Time `json:"lastObservedTime"`
}

// replicaSetList mirrors the Kubernetes List envelope for ReplicaSets.
type replicaSetList struct {
	Items []replicaSetResource `json:"items"`
}

// replicaSetResource is the subset of a ReplicaSet needed to resolve pod
// ownership back to its controlling workload via ownerReferences.
type replicaSetResource struct {
	Metadata struct {
		Namespace       string           `json:"namespace"`
		Name            string           `json:"name"`
		OwnerReferences []ownerReference `json:"ownerReferences"`
	} `json:"metadata"`
}
|
|
||||||
|
|
||||||
type workloadResource struct {
|
type workloadResource struct {
|
||||||
Kind string `json:"kind"`
|
Kind string `json:"kind"`
|
||||||
Metadata struct {
|
Metadata struct {
|
||||||
@ -261,7 +221,6 @@ type podResource struct {
|
|||||||
|
|
||||||
type ownerReference struct {
|
type ownerReference struct {
|
||||||
Kind string `json:"kind"`
|
Kind string `json:"kind"`
|
||||||
Name string `json:"name"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type podContainerStatus struct {
|
type podContainerStatus struct {
|
||||||
|
|||||||
@ -26,12 +26,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
|
|||||||
lastLogged := time.Time{}
|
lastLogged := time.Time{}
|
||||||
lastRecycleAttempt := time.Time{}
|
lastRecycleAttempt := time.Time{}
|
||||||
lastReplicaHeal := time.Time{}
|
lastReplicaHeal := time.Time{}
|
||||||
lastSchedulingStormHeal := time.Time{}
|
|
||||||
for {
|
for {
|
||||||
prevFailure := lastFailure
|
prevFailure := lastFailure
|
||||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||||
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
||||||
o.maybeAutoQuarantineSchedulingStorms(ctx, &lastSchedulingStormHeal)
|
|
||||||
ready, detail, err := o.workloadConvergenceReady(ctx)
|
ready, detail, err := o.workloadConvergenceReady(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
lastFailure = err.Error()
|
lastFailure = err.Error()
|
||||||
@ -73,7 +71,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
|||||||
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
||||||
return false, "", fmt.Errorf("decode controllers: %w", err)
|
return false, "", fmt.Errorf("decode controllers: %w", err)
|
||||||
}
|
}
|
||||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
|
||||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
||||||
@ -87,11 +84,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
|||||||
if kind == "" || ns == "" || name == "" {
|
if kind == "" || ns == "" || name == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(requiredNamespaces) > 0 {
|
|
||||||
if _, ok := requiredNamespaces[ns]; !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if _, ok := ignoredNamespaces[ns]; ok {
|
if _, ok := ignoredNamespaces[ns]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@ -116,7 +116,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
|||||||
return nil, fmt.Errorf("decode pods: %w", err)
|
return nil, fmt.Errorf("decode pods: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
|
||||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
stuckReasons := map[string]struct{}{
|
stuckReasons := map[string]struct{}{
|
||||||
@ -139,11 +138,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
|||||||
if ns == "" || name == "" {
|
if ns == "" || name == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(requiredNamespaces) > 0 {
|
|
||||||
if _, ok := requiredNamespaces[ns]; !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if _, ok := ignoredNamespaces[ns]; ok {
|
if _, ok := ignoredNamespaces[ns]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,88 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestHookMaybeAutoQuarantineSchedulingStorms delegates to the unexported
// scheduling-storm trigger guard.
//
// Signature: (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: exposes the scheduling-storm trigger guard to the split top-level test module.
func (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
	o.maybeAutoQuarantineSchedulingStorms(ctx, lastAttempt)
}
|
|
||||||
|
|
||||||
// TestHookQuarantineSchedulingStormWorkloads delegates to the unexported
// scheduling-storm auto-heal body.
//
// Signature: (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: exposes the scheduling-storm auto-heal body to the split top-level test module.
func (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error {
	return o.quarantineSchedulingStormWorkloads(ctx)
}
|
|
||||||
|
|
||||||
// TestHookSchedulingStormOwnerWorkload runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormOwnerWorkload(namespace string, ownerKind string, ownerName string, rsOwnerKind string, rsOwnerName string) (string, bool).
|
|
||||||
// Why: exposes owner-resolution behavior without leaking internal workload types.
|
|
||||||
func TestHookSchedulingStormOwnerWorkload(
|
|
||||||
namespace string,
|
|
||||||
ownerKind string,
|
|
||||||
ownerName string,
|
|
||||||
rsOwnerKind string,
|
|
||||||
rsOwnerName string,
|
|
||||||
) (string, bool) {
|
|
||||||
var pod podResource
|
|
||||||
pod.Metadata.Namespace = strings.TrimSpace(namespace)
|
|
||||||
pod.Metadata.OwnerReferences = []ownerReference{{
|
|
||||||
Kind: strings.TrimSpace(ownerKind),
|
|
||||||
Name: strings.TrimSpace(ownerName),
|
|
||||||
}}
|
|
||||||
rsOwners := map[string]ownerReference{}
|
|
||||||
if rsName := strings.TrimSpace(ownerName); rsName != "" && strings.TrimSpace(ownerKind) == "ReplicaSet" {
|
|
||||||
rsOwners[pod.Metadata.Namespace+"/"+rsName] = ownerReference{
|
|
||||||
Kind: strings.TrimSpace(rsOwnerKind),
|
|
||||||
Name: strings.TrimSpace(rsOwnerName),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
|
|
||||||
if !ok {
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("%s/%s/%s", workload.Namespace, workload.Kind, workload.Name), true
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookEventObservationCount runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookEventObservationCount(count int, seriesCount int) int.
|
|
||||||
// Why: exposes event-count normalization used by scheduling-storm detection.
|
|
||||||
func TestHookEventObservationCount(count int, seriesCount int) int {
|
|
||||||
return eventObservationCount(eventResource{
|
|
||||||
Count: count,
|
|
||||||
Series: eventSeries{
|
|
||||||
Count: seriesCount,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookEventLastObservedAt runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookEventLastObservedAt(seriesLastObserved time.Time, lastTimestamp time.Time, eventTime time.Time, creationTimestamp time.Time) time.Time.
|
|
||||||
// Why: exposes event-time fallback behavior used by scheduling-storm detection.
|
|
||||||
func TestHookEventLastObservedAt(
|
|
||||||
seriesLastObserved time.Time,
|
|
||||||
lastTimestamp time.Time,
|
|
||||||
eventTime time.Time,
|
|
||||||
creationTimestamp time.Time,
|
|
||||||
) time.Time {
|
|
||||||
return eventLastObservedAt(eventResource{
|
|
||||||
LastTimestamp: lastTimestamp,
|
|
||||||
EventTime: eventTime,
|
|
||||||
Series: eventSeries{
|
|
||||||
LastObservedTime: seriesLastObserved,
|
|
||||||
},
|
|
||||||
Metadata: struct {
|
|
||||||
Namespace string `json:"namespace"`
|
|
||||||
CreationTimestamp time.Time `json:"creationTimestamp"`
|
|
||||||
}{
|
|
||||||
CreationTimestamp: creationTimestamp,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@ -1,55 +0,0 @@
|
|||||||
package cluster
|
|
||||||
|
|
||||||
import "context"
|
|
||||||
|
|
||||||
// TestHookStartupRequiredNodes delegates to the unexported startupRequiredNodes.
//
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
	return startupRequiredNodes(nodes, required)
}
|
|
||||||
|
|
||||||
// TestHookContainsNode delegates to the unexported containsNode helper.
//
// Signature: TestHookContainsNode(entries []string, needle string) bool.
// Why: exposes the small startup-scope membership helper to top-level tests.
func TestHookContainsNode(entries []string, needle string) bool {
	return containsNode(entries, needle)
}
|
|
||||||
|
|
||||||
// TestHookStartupNodeStrictlyRequired delegates to the unexported
// startupNodeStrictlyRequired.
//
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
// non-core nodes stop blocking bootstrap.
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
	return o.startupNodeStrictlyRequired(node)
}
|
|
||||||
|
|
||||||
// TestHookStartupRequiredFluxKustomizations delegates to the unexported
// startupRequiredFluxKustomizations.
//
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
// Why: exposes flux startup scoping so top-level tests can confirm only core
// kustomizations block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
	return o.startupRequiredFluxKustomizations()
}
|
|
||||||
|
|
||||||
// TestHookStartupRequiredWorkloadNamespaces delegates to the unexported
// startupRequiredWorkloadNamespaces.
//
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: exposes workload namespace startup scoping so top-level tests can
// confirm only core workloads block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
	return o.startupRequiredWorkloadNamespaces()
}
|
|
||||||
|
|
||||||
// TestHookMaybeRunEarlyVaultUnseal delegates to the unexported
// maybeRunEarlyVaultUnseal.
//
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
// Why: exposes the early startup Vault deferral helper to top-level tests.
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
	o.maybeRunEarlyVaultUnseal(ctx)
}
|
|
||||||
|
|
||||||
// TestHookRunStartupVaultUnsealGate delegates to the unexported
// runStartupVaultUnsealGate.
//
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
// Why: exposes the startup Vault gate helper to top-level tests.
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
	return o.runStartupVaultUnsealGate(ctx)
}
|
|
||||||
@ -33,9 +33,6 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||||
c.Startup.NodeInventoryReachPollSeconds = 5
|
c.Startup.NodeInventoryReachPollSeconds = 5
|
||||||
}
|
}
|
||||||
if c.Startup.NodeInventoryReachRequiredNodes == nil {
|
|
||||||
c.Startup.NodeInventoryReachRequiredNodes = []string{}
|
|
||||||
}
|
|
||||||
if c.Startup.RequiredNodeLabels == nil {
|
if c.Startup.RequiredNodeLabels == nil {
|
||||||
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||||
"titan-09": {
|
"titan-09": {
|
||||||
@ -124,11 +121,7 @@ func (c *Config) applyDefaults() {
|
|||||||
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
|
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
|
||||||
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
|
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
|
||||||
}
|
}
|
||||||
if c.Startup.ServiceChecklistExplicitOnly {
|
|
||||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
|
|
||||||
} else {
|
|
||||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||||
}
|
|
||||||
for i := range c.Startup.ServiceChecklist {
|
for i := range c.Startup.ServiceChecklist {
|
||||||
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||||
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||||
@ -159,18 +152,12 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||||
c.Startup.NodeSSHAuthPollSeconds = 5
|
c.Startup.NodeSSHAuthPollSeconds = 5
|
||||||
}
|
}
|
||||||
if c.Startup.NodeSSHAuthRequiredNodes == nil {
|
|
||||||
c.Startup.NodeSSHAuthRequiredNodes = []string{}
|
|
||||||
}
|
|
||||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||||
c.Startup.FluxHealthWaitSeconds = 900
|
c.Startup.FluxHealthWaitSeconds = 900
|
||||||
}
|
}
|
||||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||||
c.Startup.FluxHealthPollSeconds = 5
|
c.Startup.FluxHealthPollSeconds = 5
|
||||||
}
|
}
|
||||||
if c.Startup.FluxHealthRequiredKustomizations == nil {
|
|
||||||
c.Startup.FluxHealthRequiredKustomizations = []string{}
|
|
||||||
}
|
|
||||||
if c.Startup.IgnoreFluxKustomizations == nil {
|
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||||
c.Startup.IgnoreFluxKustomizations = []string{}
|
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||||
}
|
}
|
||||||
@ -180,9 +167,6 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||||
c.Startup.WorkloadConvergencePollSeconds = 5
|
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||||
}
|
}
|
||||||
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
|
||||||
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
|
|
||||||
}
|
|
||||||
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||||
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||||
}
|
}
|
||||||
@ -195,12 +179,6 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
c.Startup.StuckPodGraceSeconds = 180
|
c.Startup.StuckPodGraceSeconds = 180
|
||||||
}
|
}
|
||||||
if c.Startup.PostStartAutoHealSeconds <= 0 {
|
|
||||||
c.Startup.PostStartAutoHealSeconds = 60
|
|
||||||
}
|
|
||||||
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
|
||||||
c.Startup.DeadNodeCleanupGraceSeconds = 300
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
|
||||||
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
|
||||||
}
|
}
|
||||||
@ -243,12 +221,6 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.UPS.TelemetryTimeoutSeconds <= 0 {
|
if c.UPS.TelemetryTimeoutSeconds <= 0 {
|
||||||
c.UPS.TelemetryTimeoutSeconds = 90
|
c.UPS.TelemetryTimeoutSeconds = 90
|
||||||
}
|
}
|
||||||
if c.Startup.SchedulingStormEventThreshold <= 0 {
|
|
||||||
c.Startup.SchedulingStormEventThreshold = 30
|
|
||||||
}
|
|
||||||
if c.Startup.SchedulingStormWindowSeconds <= 0 {
|
|
||||||
c.Startup.SchedulingStormWindowSeconds = 180
|
|
||||||
}
|
|
||||||
if c.Coordination.ForwardShutdownConfig == "" {
|
if c.Coordination.ForwardShutdownConfig == "" {
|
||||||
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,7 +45,6 @@ func defaults() Config {
|
|||||||
RequireNodeInventoryReach: true,
|
RequireNodeInventoryReach: true,
|
||||||
NodeInventoryReachWaitSeconds: 300,
|
NodeInventoryReachWaitSeconds: 300,
|
||||||
NodeInventoryReachPollSeconds: 5,
|
NodeInventoryReachPollSeconds: 5,
|
||||||
NodeInventoryReachRequiredNodes: []string{},
|
|
||||||
RequireTimeSync: true,
|
RequireTimeSync: true,
|
||||||
TimeSyncWaitSeconds: 240,
|
TimeSyncWaitSeconds: 240,
|
||||||
TimeSyncPollSeconds: 5,
|
TimeSyncPollSeconds: 5,
|
||||||
@ -105,16 +104,13 @@ func defaults() Config {
|
|||||||
RequireNodeSSHAuth: true,
|
RequireNodeSSHAuth: true,
|
||||||
NodeSSHAuthWaitSeconds: 240,
|
NodeSSHAuthWaitSeconds: 240,
|
||||||
NodeSSHAuthPollSeconds: 5,
|
NodeSSHAuthPollSeconds: 5,
|
||||||
NodeSSHAuthRequiredNodes: []string{},
|
|
||||||
RequireFluxHealth: true,
|
RequireFluxHealth: true,
|
||||||
FluxHealthWaitSeconds: 900,
|
FluxHealthWaitSeconds: 900,
|
||||||
FluxHealthPollSeconds: 5,
|
FluxHealthPollSeconds: 5,
|
||||||
FluxHealthRequiredKustomizations: []string{},
|
|
||||||
IgnoreFluxKustomizations: []string{},
|
IgnoreFluxKustomizations: []string{},
|
||||||
RequireWorkloadConvergence: true,
|
RequireWorkloadConvergence: true,
|
||||||
WorkloadConvergenceWaitSeconds: 900,
|
WorkloadConvergenceWaitSeconds: 900,
|
||||||
WorkloadConvergencePollSeconds: 5,
|
WorkloadConvergencePollSeconds: 5,
|
||||||
WorkloadConvergenceRequiredNamespaces: []string{},
|
|
||||||
IgnoreWorkloadNamespaces: []string{},
|
IgnoreWorkloadNamespaces: []string{},
|
||||||
IgnoreWorkloads: []string{},
|
IgnoreWorkloads: []string{},
|
||||||
IgnoreUnavailableNodes: []string{},
|
IgnoreUnavailableNodes: []string{},
|
||||||
|
|||||||
@ -51,41 +51,3 @@ startup:
|
|||||||
t.Fatalf("expected validation failure")
|
t.Fatalf("expected validation failure")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestLoadKeepsExplicitServiceChecklist verifies that
// service_checklist_explicit_only suppresses the default service catalog.
//
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
// Why: host recovery configs must be able to keep a narrow, explicit checklist
// without silently inheriting the full default service catalog.
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
	// Minimal valid config: required top-level fields plus the explicit-only
	// flag and exactly one declared service check.
	// NOTE(review): YAML indentation reconstructed from a whitespace-mangled
	// source; verify nesting against the Startup config schema.
	raw := `
control_planes: [titan-0a]
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
iac_repo_path: /opt/titan-iac
startup:
  service_checklist_explicit_only: true
  service_checklist:
    - name: gitea-api
      url: https://scm.bstein.dev/api/healthz
      accepted_statuses: [200]
      body_contains: pass
      timeout_seconds: 12
ups:
  enabled: false
`
	if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
		t.Fatalf("write config: %v", err)
	}

	cfg, err := Load(cfgPath)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	// Exactly the one explicit check must survive default merging.
	if len(cfg.Startup.ServiceChecklist) != 1 {
		t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
	}
	if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
		t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
	}
}
|
|
||||||
|
|||||||
@ -34,7 +34,6 @@ type Startup struct {
|
|||||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||||
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
|
|
||||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||||
@ -58,7 +57,6 @@ type Startup struct {
|
|||||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||||
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
||||||
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
|
|
||||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||||
@ -73,26 +71,18 @@ type Startup struct {
|
|||||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||||
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
|
|
||||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||||
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
|
|
||||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||||
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
|
|
||||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||||
AutoQuarantineSchedulingStorms bool `yaml:"auto_quarantine_scheduling_storms"`
|
|
||||||
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
|
|
||||||
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
|
|
||||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||||
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
|
|
||||||
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
|
|
||||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||||
@ -146,7 +136,6 @@ type UPS struct {
|
|||||||
Targets []UPSTarget `yaml:"targets"`
|
Targets []UPSTarget `yaml:"targets"`
|
||||||
PollSeconds int `yaml:"poll_seconds"`
|
PollSeconds int `yaml:"poll_seconds"`
|
||||||
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
||||||
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
|
|
||||||
DebounceCount int `yaml:"debounce_count"`
|
DebounceCount int `yaml:"debounce_count"`
|
||||||
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
||||||
}
|
}
|
||||||
|
|||||||
@ -61,11 +61,6 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
|
|
||||||
if strings.TrimSpace(node) == "" {
|
|
||||||
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for node, labels := range c.Startup.RequiredNodeLabels {
|
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||||
if strings.TrimSpace(node) == "" {
|
if strings.TrimSpace(node) == "" {
|
||||||
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||||
@ -238,46 +233,21 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
|
|
||||||
if strings.TrimSpace(node) == "" {
|
|
||||||
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
||||||
}
|
}
|
||||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
|
||||||
item = strings.TrimSpace(item)
|
|
||||||
if item == "" {
|
|
||||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
|
|
||||||
}
|
|
||||||
if strings.Count(item, "/") != 1 {
|
|
||||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
||||||
}
|
}
|
||||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
|
||||||
if strings.TrimSpace(ns) == "" {
|
|
||||||
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||||
}
|
}
|
||||||
if c.Startup.PostStartAutoHealSeconds <= 0 {
|
|
||||||
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
|
|
||||||
}
|
|
||||||
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
|
||||||
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
|
|
||||||
}
|
|
||||||
for _, probe := range c.Startup.PostStartProbes {
|
for _, probe := range c.Startup.PostStartProbes {
|
||||||
if strings.TrimSpace(probe) == "" {
|
if strings.TrimSpace(probe) == "" {
|
||||||
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
|
||||||
@ -307,16 +277,6 @@ func (c Config) Validate() error {
|
|||||||
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
|
||||||
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
|
|
||||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
|
||||||
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
|
|
||||||
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
||||||
if strings.TrimSpace(node) == "" {
|
if strings.TrimSpace(node) == "" {
|
||||||
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
||||||
@ -332,9 +292,6 @@ func (c Config) Validate() error {
|
|||||||
if c.UPS.Provider == "" {
|
if c.UPS.Provider == "" {
|
||||||
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
||||||
}
|
}
|
||||||
if c.UPS.OnBatteryGraceSeconds < 0 {
|
|
||||||
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
|
|
||||||
}
|
|
||||||
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
||||||
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
||||||
}
|
}
|
||||||
@ -349,14 +306,6 @@ func (c Config) Validate() error {
|
|||||||
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if c.Startup.AutoQuarantineSchedulingStorms {
|
|
||||||
if c.Startup.SchedulingStormEventThreshold <= 0 {
|
|
||||||
return fmt.Errorf("config.startup.scheduling_storm_event_threshold must be > 0 when auto_quarantine_scheduling_storms is enabled")
|
|
||||||
}
|
|
||||||
if c.Startup.SchedulingStormWindowSeconds <= 0 {
|
|
||||||
return fmt.Errorf("config.startup.scheduling_storm_window_seconds must be > 0 when auto_quarantine_scheduling_storms is enabled")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, peer := range c.Coordination.PeerHosts {
|
for _, peer := range c.Coordination.PeerHosts {
|
||||||
if strings.TrimSpace(peer) == "" {
|
if strings.TrimSpace(peer) == "" {
|
||||||
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
|
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
|
||||||
@ -379,20 +328,3 @@ func (c Config) Validate() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// containsTrimmed runs one orchestration or CLI step.
|
|
||||||
// Signature: containsTrimmed(entries []string, needle string) bool.
|
|
||||||
// Why: startup config now supports both required and ignored recovery scopes, so
|
|
||||||
// validation needs a single normalized overlap check for those lists.
|
|
||||||
func containsTrimmed(entries []string, needle string) bool {
|
|
||||||
needle = strings.TrimSpace(needle)
|
|
||||||
if needle == "" {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
for _, entry := range entries {
|
|
||||||
if strings.TrimSpace(entry) == needle {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|||||||
@ -30,7 +30,6 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
|||||||
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
|
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
|
||||||
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
||||||
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
||||||
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
|
|
||||||
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
||||||
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
||||||
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
||||||
@ -69,42 +68,19 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
|||||||
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
|
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
|
||||||
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
|
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
|
||||||
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
|
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
|
||||||
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
|
|
||||||
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
|
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
|
||||||
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
|
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
|
||||||
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
|
|
||||||
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
|
|
||||||
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
|
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
|
||||||
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
||||||
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
|
|
||||||
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
||||||
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
|
|
||||||
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
|
|
||||||
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
||||||
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
||||||
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
||||||
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
|
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
|
||||||
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
|
|
||||||
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
|
|
||||||
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
|
|
||||||
}},
|
|
||||||
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
|
|
||||||
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
|
||||||
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
|
|
||||||
}},
|
|
||||||
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
|
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
|
||||||
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
||||||
{"bad_scheduling_storm_threshold", func(c *Config) {
|
|
||||||
c.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
c.Startup.SchedulingStormEventThreshold = 0
|
|
||||||
}},
|
|
||||||
{"bad_scheduling_storm_window", func(c *Config) {
|
|
||||||
c.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
c.Startup.SchedulingStormWindowSeconds = 0
|
|
||||||
}},
|
|
||||||
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
||||||
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
|
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
|
||||||
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
|
|
||||||
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
|
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
|
||||||
{"bad_ups_targets_item_empty", func(c *Config) {
|
{"bad_ups_targets_item_empty", func(c *Config) {
|
||||||
c.UPS.Enabled = true
|
c.UPS.Enabled = true
|
||||||
@ -145,13 +121,6 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
|
|||||||
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
||||||
t.Fatalf("expected startup defaults to be set")
|
t.Fatalf("expected startup defaults to be set")
|
||||||
}
|
}
|
||||||
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
|
|
||||||
t.Fatalf("expected post-start auto-heal defaults to be set")
|
|
||||||
}
|
|
||||||
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
|
|
||||||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
|
||||||
t.Fatalf("expected startup recovery scope slices to be initialized")
|
|
||||||
}
|
|
||||||
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
|
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
|
||||||
t.Fatalf("expected critical service endpoint timing defaults to be set")
|
t.Fatalf("expected critical service endpoint timing defaults to be set")
|
||||||
}
|
}
|
||||||
|
|||||||
@ -32,8 +32,6 @@ type Daemon struct {
|
|||||||
targets []Target
|
targets []Target
|
||||||
log *log.Logger
|
log *log.Logger
|
||||||
exporter *metrics.Exporter
|
exporter *metrics.Exporter
|
||||||
|
|
||||||
postStartAutoHealOverride func(context.Context) error
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var sshConfigCandidates = []string{
|
var sshConfigCandidates = []string{
|
||||||
@ -94,9 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
|
|
||||||
lastGood := map[string]time.Time{}
|
lastGood := map[string]time.Time{}
|
||||||
lastOnBattery := map[string]bool{}
|
lastOnBattery := map[string]bool{}
|
||||||
onBatterySince := map[string]time.Time{}
|
|
||||||
breachCount := map[string]int{}
|
breachCount := map[string]int{}
|
||||||
lastAutoHeal := time.Time{}
|
|
||||||
for _, t := range d.targets {
|
for _, t := range d.targets {
|
||||||
lastGood[t.Name] = time.Now()
|
lastGood[t.Name] = time.Now()
|
||||||
}
|
}
|
||||||
@ -111,16 +107,12 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
case <-t.C:
|
case <-t.C:
|
||||||
budget := d.orch.EstimatedEmergencyShutdownSeconds()
|
budget := d.orch.EstimatedEmergencyShutdownSeconds()
|
||||||
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
||||||
anyOnBattery := false
|
|
||||||
|
|
||||||
d.exporter.UpdateBudget(budget)
|
d.exporter.UpdateBudget(budget)
|
||||||
|
|
||||||
for _, target := range d.targets {
|
for _, target := range d.targets {
|
||||||
sample, err := target.Provider.Read(ctx)
|
sample, err := target.Provider.Read(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if lastOnBattery[target.Name] {
|
|
||||||
anyOnBattery = true
|
|
||||||
}
|
|
||||||
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
|
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
|
||||||
d.exporter.UpdateSample(metrics.Sample{
|
d.exporter.UpdateSample(metrics.Sample{
|
||||||
Name: target.Name,
|
Name: target.Name,
|
||||||
@ -139,45 +131,17 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
lastGood[target.Name] = time.Now()
|
lastGood[target.Name] = time.Now()
|
||||||
if sample.OnBattery {
|
|
||||||
anyOnBattery = true
|
|
||||||
}
|
|
||||||
wasOnBattery := lastOnBattery[target.Name]
|
|
||||||
if sample.OnBattery {
|
|
||||||
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
|
|
||||||
onBatterySince[target.Name] = time.Now()
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
onBatterySince[target.Name] = time.Time{}
|
|
||||||
}
|
|
||||||
lastOnBattery[target.Name] = sample.OnBattery
|
lastOnBattery[target.Name] = sample.OnBattery
|
||||||
|
|
||||||
onBatteryElapsed := 0
|
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
|
||||||
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
|
|
||||||
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
|
|
||||||
}
|
|
||||||
|
|
||||||
trigger := false
|
|
||||||
triggerReason := ""
|
|
||||||
switch {
|
|
||||||
case sample.LowBattery:
|
|
||||||
trigger = true
|
|
||||||
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
|
|
||||||
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
|
|
||||||
trigger = true
|
|
||||||
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
|
||||||
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
|
|
||||||
trigger = true
|
|
||||||
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
|
|
||||||
}
|
|
||||||
if trigger {
|
if trigger {
|
||||||
breachCount[target.Name]++
|
breachCount[target.Name]++
|
||||||
} else {
|
} else {
|
||||||
breachCount[target.Name] = 0
|
breachCount[target.Name] = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||||
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||||
|
|
||||||
d.exporter.UpdateSample(metrics.Sample{
|
d.exporter.UpdateSample(metrics.Sample{
|
||||||
Name: target.Name,
|
Name: target.Name,
|
||||||
@ -196,54 +160,14 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
})
|
})
|
||||||
|
|
||||||
if breachCount[target.Name] >= debounce {
|
if breachCount[target.Name] >= debounce {
|
||||||
return d.triggerShutdown(ctx, triggerReason)
|
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||||
|
return d.triggerShutdown(ctx, reason)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// maybeRunPostStartAutoHeal runs one orchestration or CLI step.
|
|
||||||
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
|
|
||||||
// Why: gives the long-running daemon a bounded path to repair post-start drift
|
|
||||||
// like a later Vault reseal or stale dead-node deletions without waiting for a
|
|
||||||
// fresh bootstrap run.
|
|
||||||
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
|
|
||||||
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
|
|
||||||
if interval <= 0 || anyOnBattery {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if d.orch == nil && d.postStartAutoHealOverride == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
now := time.Now()
|
|
||||||
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if lastRun != nil {
|
|
||||||
*lastRun = now
|
|
||||||
}
|
|
||||||
if err := d.runPostStartAutoHeal(ctx); err != nil {
|
|
||||||
d.log.Printf("warning: post-start auto-heal: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// runPostStartAutoHeal runs one orchestration or CLI step.
|
|
||||||
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
|
|
||||||
// Why: keeps the daemon loop readable while allowing unit tests to inject a
|
|
||||||
// deterministic repair hook without a live cluster.
|
|
||||||
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
|
|
||||||
if d.postStartAutoHealOverride != nil {
|
|
||||||
return d.postStartAutoHealOverride(ctx)
|
|
||||||
}
|
|
||||||
if d.orch == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return d.orch.RunPostStartAutoHeal(ctx)
|
|
||||||
}
|
|
||||||
|
|
||||||
// triggerShutdown runs one orchestration or CLI step.
|
// triggerShutdown runs one orchestration or CLI step.
|
||||||
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
|
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
|
|||||||
@ -165,50 +165,6 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace runs one orchestration or CLI step.
|
|
||||||
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
|
|
||||||
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
|
|
||||||
// the only path to a graceful shutdown during abrupt power loss.
|
|
||||||
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
|
|
||||||
stateDir := t.TempDir()
|
|
||||||
orch := newDaemonTestOrchestrator(t, stateDir)
|
|
||||||
d := &Daemon{
|
|
||||||
cfg: config.Config{
|
|
||||||
UPS: config.UPS{
|
|
||||||
Enabled: true,
|
|
||||||
PollSeconds: 1,
|
|
||||||
DebounceCount: 1,
|
|
||||||
RuntimeSafetyFactor: 1.0,
|
|
||||||
OnBatteryGraceSeconds: 1,
|
|
||||||
},
|
|
||||||
State: config.State{
|
|
||||||
IntentPath: filepath.Join(stateDir, "intent.json"),
|
|
||||||
},
|
|
||||||
Shutdown: config.Shutdown{
|
|
||||||
EmergencySkipDrain: true,
|
|
||||||
EmergencySkipEtcd: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
orch: orch,
|
|
||||||
targets: []Target{
|
|
||||||
{
|
|
||||||
Name: "Pyrphoros",
|
|
||||||
Target: "pyrphoros@localhost",
|
|
||||||
Provider: &daemonFakeProvider{
|
|
||||||
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
log: log.New(io.Discard, "", 0),
|
|
||||||
exporter: metrics.New(),
|
|
||||||
}
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
if err := d.Run(ctx); err != nil {
|
|
||||||
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
|
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
|
||||||
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
|
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
|
||||||
// Why: covers forward-shutdown SSH execution path.
|
// Why: covers forward-shutdown SSH execution path.
|
||||||
|
|||||||
@ -1,51 +0,0 @@
|
|||||||
package service
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestDaemonMaybeRunPostStartAutoHeal runs one orchestration or CLI step.
|
|
||||||
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
|
|
||||||
// Why: covers the daemon-side interval and on-battery guards for the new
|
|
||||||
// post-start repair loop.
|
|
||||||
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
|
|
||||||
calls := 0
|
|
||||||
d := &Daemon{
|
|
||||||
cfg: config.Config{
|
|
||||||
Startup: config.Startup{
|
|
||||||
PostStartAutoHealSeconds: 10,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
postStartAutoHealOverride: func(context.Context) error {
|
|
||||||
calls++
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
var last time.Time
|
|
||||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
|
||||||
if calls != 1 {
|
|
||||||
t.Fatalf("expected first auto-heal invocation, got %d", calls)
|
|
||||||
}
|
|
||||||
|
|
||||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
|
||||||
if calls != 1 {
|
|
||||||
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
|
|
||||||
}
|
|
||||||
|
|
||||||
last = time.Now().Add(-11 * time.Second)
|
|
||||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
|
|
||||||
if calls != 1 {
|
|
||||||
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
|
|
||||||
}
|
|
||||||
|
|
||||||
last = time.Now().Add(-11 * time.Second)
|
|
||||||
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
|
|
||||||
if calls != 2 {
|
|
||||||
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -22,23 +22,12 @@ type Intent struct {
|
|||||||
UpdatedAt time.Time `json:"updated_at"`
|
UpdatedAt time.Time `json:"updated_at"`
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var writeIntentImpl = writeIntentDefault
|
||||||
readIntentImpl = readIntentDefault
|
|
||||||
writeIntentImpl = writeIntentDefault
|
|
||||||
)
|
|
||||||
|
|
||||||
// ReadIntent runs one orchestration or CLI step.
|
// ReadIntent runs one orchestration or CLI step.
|
||||||
// Signature: ReadIntent(path string) (Intent, error).
|
// Signature: ReadIntent(path string) (Intent, error).
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
func ReadIntent(path string) (Intent, error) {
|
func ReadIntent(path string) (Intent, error) {
|
||||||
return readIntentImpl(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
// readIntentDefault runs one orchestration or CLI step.
|
|
||||||
// Signature: readIntentDefault(path string) (Intent, error).
|
|
||||||
// Why: keeps production read behavior available while tests can override intent
|
|
||||||
// reads deterministically without racing background file mutations.
|
|
||||||
func readIntentDefault(path string) (Intent, error) {
|
|
||||||
b, err := os.ReadFile(path)
|
b, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if os.IsNotExist(err) {
|
if os.IsNotExist(err) {
|
||||||
|
|||||||
@ -22,34 +22,6 @@ func TestHookWriteIntentDefault(path string, in Intent) error {
|
|||||||
return writeIntentDefault(path, in)
|
return writeIntentDefault(path, in)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookReadIntentDefault runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookReadIntentDefault(path string) (Intent, error).
|
|
||||||
// Why: lets top-level tests delegate to production ReadIntent behavior while
|
|
||||||
// selectively forcing deterministic read sequences for lifecycle branches.
|
|
||||||
func TestHookReadIntentDefault(path string) (Intent, error) {
|
|
||||||
return readIntentDefault(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSetReadIntentOverride runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()).
|
|
||||||
// Why: enables deterministic intent-read failure injection without sleeping
|
|
||||||
// goroutines that race slower CI agents.
|
|
||||||
func TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()) {
|
|
||||||
testHookOverrideMu.Lock()
|
|
||||||
prev := readIntentImpl
|
|
||||||
if fn == nil {
|
|
||||||
readIntentImpl = readIntentDefault
|
|
||||||
} else {
|
|
||||||
readIntentImpl = fn
|
|
||||||
}
|
|
||||||
testHookOverrideMu.Unlock()
|
|
||||||
return func() {
|
|
||||||
testHookOverrideMu.Lock()
|
|
||||||
readIntentImpl = prev
|
|
||||||
testHookOverrideMu.Unlock()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSetWriteIntentOverride runs one orchestration or CLI step.
|
// TestHookSetWriteIntentOverride runs one orchestration or CLI step.
|
||||||
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
|
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
|
||||||
// Why: enables deterministic intent-write failure injection from the top-level
|
// Why: enables deterministic intent-write failure injection from the top-level
|
||||||
|
|||||||
@ -1,116 +0,0 @@
|
|||||||
# Binary, config template, and systemd artifact helpers for the installer.
|
|
||||||
|
|
||||||
resolve_build_target() {
|
|
||||||
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
|
|
||||||
echo "./cmd/ananke"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
install_config_template() {
|
|
||||||
local template="$1"
|
|
||||||
local dest="$2"
|
|
||||||
local src legacy
|
|
||||||
local -a modern_candidates=()
|
|
||||||
local -a legacy_candidates=()
|
|
||||||
|
|
||||||
case "${template}" in
|
|
||||||
coordinator)
|
|
||||||
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
|
|
||||||
legacy_candidates=("configs/hecate.titan-db.yaml")
|
|
||||||
;;
|
|
||||||
peer)
|
|
||||||
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
|
|
||||||
legacy_candidates=("configs/hecate.tethys.yaml")
|
|
||||||
;;
|
|
||||||
example)
|
|
||||||
modern_candidates=("configs/ananke.example.yaml")
|
|
||||||
legacy_candidates=("configs/hecate.example.yaml")
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "[install] unknown config template key: ${template}" >&2
|
|
||||||
return 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
for src in "${modern_candidates[@]}"; do
|
|
||||||
if [[ -f "${src}" ]]; then
|
|
||||||
install -m 0640 "${src}" "${dest}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
for legacy in "${legacy_candidates[@]}"; do
|
|
||||||
if [[ -f "${legacy}" ]]; then
|
|
||||||
src="$(mktemp)"
|
|
||||||
legacy_path_rewrite "${legacy}" "${src}"
|
|
||||||
install -m 0640 "${src}" "${dest}"
|
|
||||||
rm -f "${src}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
install_systemd_units() {
|
|
||||||
local tmp
|
|
||||||
|
|
||||||
while IFS='|' read -r target_name modern_name legacy_name; do
|
|
||||||
local modern_src="deploy/systemd/${modern_name}"
|
|
||||||
local legacy_src="deploy/systemd/${legacy_name}"
|
|
||||||
local target="${SYSTEMD_DIR}/${target_name}"
|
|
||||||
|
|
||||||
if [[ -f "${modern_src}" ]]; then
|
|
||||||
install -m 0644 "${modern_src}" "${target}"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -f "${legacy_src}" ]]; then
|
|
||||||
tmp="$(mktemp)"
|
|
||||||
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
|
||||||
install -m 0644 "${tmp}" "${target}"
|
|
||||||
rm -f "${tmp}"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
|
|
||||||
return 1
|
|
||||||
done <<'EOF_UNITS'
|
|
||||||
ananke.service|ananke.service|hecate.service
|
|
||||||
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
|
|
||||||
ananke-update.service|ananke-update.service|hecate-update.service
|
|
||||||
ananke-update.timer|ananke-update.timer|hecate-update.timer
|
|
||||||
EOF_UNITS
|
|
||||||
}
|
|
||||||
|
|
||||||
install_self_update_script() {
|
|
||||||
local modern_src="scripts/ananke-self-update.sh"
|
|
||||||
local legacy_src="scripts/hecate-self-update.sh"
|
|
||||||
local target="${LIB_DIR}/ananke-self-update.sh"
|
|
||||||
local tmp
|
|
||||||
|
|
||||||
if [[ -f "${modern_src}" ]]; then
|
|
||||||
install -m 0755 "${modern_src}" "${target}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -f "${legacy_src}" ]]; then
|
|
||||||
tmp="$(mktemp)"
|
|
||||||
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
|
||||||
sed -Ei \
|
|
||||||
-e 's/HECATE_/ANANKE_/g' \
|
|
||||||
-e 's/hecate-self-update/ananke-self-update/g' \
|
|
||||||
-e 's#/opt/hecate#/opt/ananke#g' \
|
|
||||||
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
|
|
||||||
"${tmp}"
|
|
||||||
install -m 0755 "${tmp}" "${target}"
|
|
||||||
rm -f "${tmp}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[install] missing both modern and legacy self-update scripts." >&2
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
@ -1,334 +0,0 @@
|
|||||||
# Config migration helpers for the Ananke host installer.
|
|
||||||
|
|
||||||
read_ananke_role() {
|
|
||||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
|
||||||
echo "coordinator"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
local role
|
|
||||||
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
|
||||||
if [[ -z "${role}" ]]; then
|
|
||||||
role="coordinator"
|
|
||||||
fi
|
|
||||||
echo "${role}"
|
|
||||||
}
|
|
||||||
|
|
||||||
migration_yaml_lookup() {
|
|
||||||
local key="$1"
|
|
||||||
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
|
||||||
}
|
|
||||||
|
|
||||||
first_control_plane_name() {
|
|
||||||
awk '
|
|
||||||
/^control_planes:[[:space:]]*$/ {in_list=1; next}
|
|
||||||
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
|
|
||||||
in_list && /^[^[:space:]]/ {in_list=0}
|
|
||||||
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
|
||||||
}
|
|
||||||
|
|
||||||
lookup_node_host() {
|
|
||||||
local node="$1"
|
|
||||||
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
|
||||||
}
|
|
||||||
|
|
||||||
migrate_ananke_config() {
|
|
||||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
local changed=0
|
|
||||||
local role_hint
|
|
||||||
role_hint="$(read_ananke_role)"
|
|
||||||
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] migrated ssh_node_users titan-24 override to atlas"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei \
|
|
||||||
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
|
|
||||||
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
|
|
||||||
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
|
|
||||||
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
|
|
||||||
"${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] removed deprecated host-poweroff shutdown config keys"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup node inventory reachability gate defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added state.reports_dir default"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
local peer_host
|
|
||||||
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
|
||||||
if [[ -n "${peer_host}" ]]; then
|
|
||||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
|
|
||||||
changed=1
|
|
||||||
else
|
|
||||||
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added coordination.peer_hosts empty default"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
local default_restore_cp
|
|
||||||
default_restore_cp="$(first_control_plane_name)"
|
|
||||||
if [[ -z "${default_restore_cp}" ]]; then
|
|
||||||
default_restore_cp="titan-0a"
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup time sync + access reconciliation defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup time sync quorum defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup storage readiness defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup post-start probe + vault key fallback defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup.vault_unseal_key_file default"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] added startup break-glass fallback defaults"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
install_cluster_inventory_defaults "${role_hint}" && changed=1
|
|
||||||
|
|
||||||
if [[ "${changed}" -eq 1 ]]; then
|
|
||||||
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
install_cluster_inventory_defaults() {
|
|
||||||
local role="$1"
|
|
||||||
local changed=0
|
|
||||||
local inventory_block=""
|
|
||||||
local managed_block=""
|
|
||||||
local workers_block
|
|
||||||
workers_block='workers:
|
|
||||||
- titan-04
|
|
||||||
- titan-05
|
|
||||||
- titan-06
|
|
||||||
- titan-07
|
|
||||||
- titan-08
|
|
||||||
- titan-09
|
|
||||||
- titan-10
|
|
||||||
- titan-11
|
|
||||||
- titan-12
|
|
||||||
- titan-13
|
|
||||||
- titan-14
|
|
||||||
- titan-15
|
|
||||||
- titan-17
|
|
||||||
- titan-18
|
|
||||||
- titan-19
|
|
||||||
- titan-20
|
|
||||||
- titan-21
|
|
||||||
- titan-22
|
|
||||||
- titan-24'
|
|
||||||
|
|
||||||
if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
|
|
||||||
inventory_block='ssh_node_hosts:
|
|
||||||
titan-db: 192.168.22.10
|
|
||||||
titan-0a: 192.168.22.11
|
|
||||||
titan-0b: 192.168.22.12
|
|
||||||
titan-0c: 192.168.22.13
|
|
||||||
titan-04: 192.168.22.30
|
|
||||||
titan-05: 192.168.22.31
|
|
||||||
titan-06: 192.168.22.32
|
|
||||||
titan-07: 192.168.22.33
|
|
||||||
titan-08: 192.168.22.34
|
|
||||||
titan-09: 192.168.22.35
|
|
||||||
titan-10: 192.168.22.36
|
|
||||||
titan-11: 192.168.22.37
|
|
||||||
titan-12: 192.168.22.40
|
|
||||||
titan-13: 192.168.22.41
|
|
||||||
titan-14: 192.168.22.42
|
|
||||||
titan-15: 192.168.22.43
|
|
||||||
titan-17: 192.168.22.45
|
|
||||||
titan-18: 192.168.22.46
|
|
||||||
titan-19: 192.168.22.47
|
|
||||||
titan-20: 192.168.22.20
|
|
||||||
titan-21: 192.168.22.21
|
|
||||||
titan-22: 192.168.22.22
|
|
||||||
titan-24: 192.168.22.26'
|
|
||||||
managed_block='ssh_managed_nodes:
|
|
||||||
- titan-db
|
|
||||||
- titan-0a
|
|
||||||
- titan-0b
|
|
||||||
- titan-0c
|
|
||||||
- titan-04
|
|
||||||
- titan-05
|
|
||||||
- titan-06
|
|
||||||
- titan-07
|
|
||||||
- titan-08
|
|
||||||
- titan-09
|
|
||||||
- titan-10
|
|
||||||
- titan-11
|
|
||||||
- titan-12
|
|
||||||
- titan-13
|
|
||||||
- titan-14
|
|
||||||
- titan-15
|
|
||||||
- titan-17
|
|
||||||
- titan-18
|
|
||||||
- titan-19
|
|
||||||
- titan-20
|
|
||||||
- titan-21
|
|
||||||
- titan-22
|
|
||||||
- titan-24'
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -n "${inventory_block}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -n "${managed_block}" ]]; then
|
|
||||||
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "${role}" == "peer" ]]; then
|
|
||||||
install_peer_inventory_defaults && changed=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
[[ "${changed}" -eq 1 ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
install_peer_inventory_defaults() {
|
|
||||||
local changed=0
|
|
||||||
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
|
|
||||||
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
|
|
||||||
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
|
|
||||||
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
|
||||||
changed=1
|
|
||||||
fi
|
|
||||||
[[ "${changed}" -eq 1 ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
sanitize_migrated_ananke_config() {
|
|
||||||
local cfg="${CONF_DIR}/ananke.yaml"
|
|
||||||
[[ -f "${cfg}" ]] || return 0
|
|
||||||
|
|
||||||
local tmp changed=0
|
|
||||||
tmp="$(mktemp)"
|
|
||||||
|
|
||||||
# If a legacy migration bug appended root-level node entries after
|
|
||||||
# ssh_managed_nodes, drop those orphan entries until the next top-level key.
|
|
||||||
awk '
|
|
||||||
BEGIN {in_managed=0}
|
|
||||||
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
|
|
||||||
{
|
|
||||||
if (in_managed) {
|
|
||||||
if ($0 ~ /^ - /) {print; next}
|
|
||||||
if ($0 ~ /^- /) {next}
|
|
||||||
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
|
|
||||||
}
|
|
||||||
print
|
|
||||||
}
|
|
||||||
' "${cfg}" > "${tmp}"
|
|
||||||
|
|
||||||
if ! cmp -s "${cfg}" "${tmp}"; then
|
|
||||||
mv "${tmp}" "${cfg}"
|
|
||||||
changed=1
|
|
||||||
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
|
|
||||||
else
|
|
||||||
rm -f "${tmp}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
|
|
||||||
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
|
|
||||||
changed=1
|
|
||||||
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "${changed}" -eq 1 ]]; then
|
|
||||||
chmod 0640 "${cfg}" || true
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
@ -1,239 +0,0 @@
|
|||||||
# Host bootstrap helpers for the Ananke installer.
|
|
||||||
|
|
||||||
resolve_nut_ups_name() {
|
|
||||||
if [[ -n "${NUT_UPS_NAME}" ]]; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
|
|
||||||
local target=""
|
|
||||||
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
|
|
||||||
if [[ -n "${target}" ]]; then
|
|
||||||
NUT_UPS_NAME="${target%@localhost}"
|
|
||||||
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
NUT_UPS_NAME="pyrphoros"
|
|
||||||
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure_ananke_kubeconfig() {
|
|
||||||
local kubeconfig_path
|
|
||||||
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
|
|
||||||
if [[ -z "${kubeconfig_path}" ]]; then
|
|
||||||
kubeconfig_path="/etc/ananke/kubeconfig"
|
|
||||||
fi
|
|
||||||
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
|
|
||||||
|
|
||||||
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
|
|
||||||
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
|
|
||||||
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
|
|
||||||
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
|
|
||||||
cp_name="$(first_control_plane_name)"
|
|
||||||
if [[ -z "${cp_name}" ]]; then
|
|
||||||
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
cp_host="$(lookup_node_host "${cp_name}")"
|
|
||||||
if [[ -z "${cp_host}" ]]; then
|
|
||||||
cp_host="${cp_name}"
|
|
||||||
fi
|
|
||||||
ssh_user="$(migration_yaml_lookup "ssh_user")"
|
|
||||||
ssh_port="$(migration_yaml_lookup "ssh_port")"
|
|
||||||
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
|
|
||||||
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
|
|
||||||
if [[ -z "${ssh_port}" ]]; then
|
|
||||||
ssh_port="2277"
|
|
||||||
fi
|
|
||||||
|
|
||||||
local target
|
|
||||||
target="${cp_host}"
|
|
||||||
if [[ -n "${ssh_user}" ]]; then
|
|
||||||
target="${ssh_user}@${cp_host}"
|
|
||||||
fi
|
|
||||||
local ssh_args=(
|
|
||||||
-o BatchMode=yes
|
|
||||||
-o ConnectTimeout=8
|
|
||||||
-o StrictHostKeyChecking=accept-new
|
|
||||||
)
|
|
||||||
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
|
|
||||||
ssh_args+=(-F "${ssh_cfg}")
|
|
||||||
fi
|
|
||||||
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
|
|
||||||
ssh_args+=(-i "${ssh_key}")
|
|
||||||
fi
|
|
||||||
if [[ -n "${ssh_port}" ]]; then
|
|
||||||
ssh_args+=(-p "${ssh_port}")
|
|
||||||
fi
|
|
||||||
|
|
||||||
local remote_cfg
|
|
||||||
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
|
|
||||||
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
|
|
||||||
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
|
|
||||||
chmod 0600 "${kubeconfig_path}"
|
|
||||||
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
|
|
||||||
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure_ananke_ssh_identity() {
|
|
||||||
local key_path key_dir key_user key_comment
|
|
||||||
key_path="$(migration_yaml_lookup "ssh_identity_file")"
|
|
||||||
if [[ -z "${key_path}" ]]; then
|
|
||||||
key_path="/home/atlas/.ssh/id_ed25519"
|
|
||||||
fi
|
|
||||||
key_dir="$(dirname "${key_path}")"
|
|
||||||
key_comment="ananke-$(hostname)-forward"
|
|
||||||
|
|
||||||
key_user="root"
|
|
||||||
if [[ "${key_path}" == /home/*/* ]]; then
|
|
||||||
key_user="${key_path#/home/}"
|
|
||||||
key_user="${key_user%%/*}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! id "${key_user}" >/dev/null 2>&1; then
|
|
||||||
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
|
|
||||||
if [[ ! -s "${key_path}" ]]; then
|
|
||||||
echo "[install] generating missing SSH identity at ${key_path}"
|
|
||||||
if [[ "${key_user}" == "root" ]]; then
|
|
||||||
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
|
||||||
else
|
|
||||||
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
|
|
||||||
chmod 0600 "${key_path}" || true
|
|
||||||
chmod 0644 "${key_path}.pub" || true
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure_apt_packages() {
|
|
||||||
local missing=()
|
|
||||||
for pkg in "$@"; do
|
|
||||||
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
|
|
||||||
missing+=("${pkg}")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [[ ${#missing[@]} -eq 0 ]]; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
echo "[install] apt install: ${missing[*]}"
|
|
||||||
export DEBIAN_FRONTEND=noninteractive
|
|
||||||
apt-get update -y
|
|
||||||
apt-get install -y "${missing[@]}"
|
|
||||||
}
|
|
||||||
|
|
||||||
install_kubectl_if_missing() {
|
|
||||||
if command -v kubectl >/dev/null 2>&1; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
ensure_apt_packages kubernetes-client || true
|
|
||||||
if command -v kubectl >/dev/null 2>&1; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
echo "[install] installing kubectl via upstream binary"
|
|
||||||
local arch
|
|
||||||
arch="$(uname -m)"
|
|
||||||
case "${arch}" in
|
|
||||||
x86_64) arch="amd64" ;;
|
|
||||||
aarch64|arm64) arch="arm64" ;;
|
|
||||||
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
|
|
||||||
esac
|
|
||||||
local version
|
|
||||||
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
|
|
||||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
|
|
||||||
chmod 0755 /usr/local/bin/kubectl
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure_dependencies() {
|
|
||||||
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
|
|
||||||
echo "[install] skipping dependency installation"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
if ! command -v apt-get >/dev/null 2>&1; then
|
|
||||||
echo "This installer currently supports apt-based hosts only." >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
|
|
||||||
install_kubectl_if_missing
|
|
||||||
}
|
|
||||||
|
|
||||||
configure_nut() {
|
|
||||||
if [[ "${MANAGE_NUT}" != "1" ]]; then
|
|
||||||
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
|
|
||||||
install -d -m 0755 /etc/nut /etc/udev/rules.d
|
|
||||||
|
|
||||||
cat > /etc/nut/nut.conf <<EOF
|
|
||||||
MODE=standalone
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat > /etc/nut/ups.conf <<EOF
|
|
||||||
[${NUT_UPS_NAME}]
|
|
||||||
driver = usbhid-ups
|
|
||||||
port = auto
|
|
||||||
vendorid = ${NUT_VENDOR_ID}
|
|
||||||
productid = ${NUT_PRODUCT_ID}
|
|
||||||
pollinterval = 5
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat > /etc/nut/upsd.users <<EOF
|
|
||||||
[${NUT_MONITOR_USER}]
|
|
||||||
password = ${NUT_MONITOR_PASSWORD}
|
|
||||||
upsmon primary
|
|
||||||
EOF
|
|
||||||
chmod 0640 /etc/nut/upsd.users
|
|
||||||
if getent group nut >/dev/null 2>&1; then
|
|
||||||
chown root:nut /etc/nut/upsd.users
|
|
||||||
else
|
|
||||||
chown root:root /etc/nut/upsd.users
|
|
||||||
fi
|
|
||||||
|
|
||||||
cat > /etc/nut/upsmon.conf <<EOF
|
|
||||||
RUN_AS_USER nut
|
|
||||||
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
|
|
||||||
MINSUPPLIES 1
|
|
||||||
SHUTDOWNCMD "/sbin/shutdown -h +0"
|
|
||||||
POLLFREQ 5
|
|
||||||
POLLFREQALERT 5
|
|
||||||
HOSTSYNC 15
|
|
||||||
DEADTIME 15
|
|
||||||
POWERDOWNFLAG /etc/killpower
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
|
|
||||||
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
|
|
||||||
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
|
|
||||||
EOF
|
|
||||||
|
|
||||||
udevadm control --reload-rules || true
|
|
||||||
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
|
|
||||||
|
|
||||||
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
|
||||||
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
|
|
||||||
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
|
|
||||||
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
|
||||||
}
|
|
||||||
@ -1,98 +0,0 @@
|
|||||||
# Legacy Hecate migration helpers for the Ananke installer.
|
|
||||||
|
|
||||||
legacy_path_rewrite() {
|
|
||||||
local src="$1"
|
|
||||||
local dst="$2"
|
|
||||||
sed \
|
|
||||||
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
|
|
||||||
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
|
|
||||||
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
|
|
||||||
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
|
|
||||||
-e 's#/opt/hecate#/opt/ananke#g' \
|
|
||||||
-e 's#/etc/hecate#/etc/ananke#g' \
|
|
||||||
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
|
|
||||||
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
|
|
||||||
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
|
|
||||||
-e 's/hecate.yaml/ananke.yaml/g' \
|
|
||||||
-e 's/hecate.lock/ananke.lock/g' \
|
|
||||||
-e 's/hecate/ananke/g' \
|
|
||||||
-e 's/Hecate/Ananke/g' \
|
|
||||||
-e 's#hecate\.lock#ananke.lock#g' \
|
|
||||||
"${src}" > "${dst}"
|
|
||||||
}
|
|
||||||
|
|
||||||
migrate_legacy_hecate_install() {
|
|
||||||
local legacy_conf_dir="/etc/hecate"
|
|
||||||
local legacy_state_dir="/var/lib/hecate"
|
|
||||||
local legacy_systemd_dir="/etc/systemd/system"
|
|
||||||
|
|
||||||
install -d -m 0750 "${CONF_DIR}"
|
|
||||||
install -d -m 0750 "${STATE_DIR}"
|
|
||||||
|
|
||||||
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
|
|
||||||
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
|
|
||||||
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
|
|
||||||
chmod 0640 "${CONF_DIR}/ananke.yaml"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
|
|
||||||
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
|
|
||||||
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
|
|
||||||
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
|
|
||||||
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
|
|
||||||
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
|
|
||||||
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
|
|
||||||
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
|
|
||||||
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
|
|
||||||
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
|
|
||||||
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -d "${legacy_systemd_dir}" ]]; then
|
|
||||||
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
|
|
||||||
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
retire_legacy_hecate_install() {
|
|
||||||
local ts backup_dir
|
|
||||||
ts="$(date +%Y%m%d%H%M%S)"
|
|
||||||
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
|
|
||||||
|
|
||||||
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
|
|
||||||
systemctl stop hecate-update.service >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
|
|
||||||
install -d -m 0750 "${backup_dir}"
|
|
||||||
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
|
|
||||||
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
|
|
||||||
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
|
|
||||||
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
|
|
||||||
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
|
|
||||||
echo "[install] backed up legacy hecate assets to ${backup_dir}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
rm -f \
|
|
||||||
/etc/systemd/system/hecate.service \
|
|
||||||
/etc/systemd/system/hecate-bootstrap.service \
|
|
||||||
/etc/systemd/system/hecate-update.service \
|
|
||||||
/etc/systemd/system/hecate-update.timer
|
|
||||||
rm -f /usr/local/bin/hecate
|
|
||||||
rm -rf /usr/local/lib/hecate
|
|
||||||
rm -rf /opt/hecate
|
|
||||||
rm -rf /etc/hecate
|
|
||||||
rm -rf /var/lib/hecate
|
|
||||||
}
|
|
||||||
@ -41,10 +41,829 @@ while [[ $# -gt 0 ]]; do
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
source "${REPO_DIR}/scripts/install-config-migration.sh"
|
resolve_nut_ups_name() {
|
||||||
source "${REPO_DIR}/scripts/install-host-bootstrap.sh"
|
if [[ -n "${NUT_UPS_NAME}" ]]; then
|
||||||
source "${REPO_DIR}/scripts/install-legacy-migration.sh"
|
return 0
|
||||||
source "${REPO_DIR}/scripts/install-artifacts.sh"
|
fi
|
||||||
|
|
||||||
|
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||||
|
local target=""
|
||||||
|
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
|
||||||
|
if [[ -n "${target}" ]]; then
|
||||||
|
NUT_UPS_NAME="${target%@localhost}"
|
||||||
|
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
NUT_UPS_NAME="pyrphoros"
|
||||||
|
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
||||||
|
}
|
||||||
|
|
||||||
|
read_ananke_role() {
|
||||||
|
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||||
|
echo "coordinator"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
local role
|
||||||
|
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
||||||
|
if [[ -z "${role}" ]]; then
|
||||||
|
role="coordinator"
|
||||||
|
fi
|
||||||
|
echo "${role}"
|
||||||
|
}
|
||||||
|
|
||||||
|
migration_yaml_lookup() {
|
||||||
|
local key="$1"
|
||||||
|
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
first_control_plane_name() {
|
||||||
|
awk '
|
||||||
|
/^control_planes:[[:space:]]*$/ {in_list=1; next}
|
||||||
|
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
|
||||||
|
in_list && /^[^[:space:]]/ {in_list=0}
|
||||||
|
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
lookup_node_host() {
|
||||||
|
local node="$1"
|
||||||
|
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_ananke_kubeconfig() {
|
||||||
|
local kubeconfig_path
|
||||||
|
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
|
||||||
|
if [[ -z "${kubeconfig_path}" ]]; then
|
||||||
|
kubeconfig_path="/etc/ananke/kubeconfig"
|
||||||
|
fi
|
||||||
|
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
|
||||||
|
|
||||||
|
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
|
||||||
|
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
|
||||||
|
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
|
||||||
|
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
|
||||||
|
cp_name="$(first_control_plane_name)"
|
||||||
|
if [[ -z "${cp_name}" ]]; then
|
||||||
|
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
cp_host="$(lookup_node_host "${cp_name}")"
|
||||||
|
if [[ -z "${cp_host}" ]]; then
|
||||||
|
cp_host="${cp_name}"
|
||||||
|
fi
|
||||||
|
ssh_user="$(migration_yaml_lookup "ssh_user")"
|
||||||
|
ssh_port="$(migration_yaml_lookup "ssh_port")"
|
||||||
|
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
|
||||||
|
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
|
||||||
|
if [[ -z "${ssh_port}" ]]; then
|
||||||
|
ssh_port="2277"
|
||||||
|
fi
|
||||||
|
|
||||||
|
local target
|
||||||
|
target="${cp_host}"
|
||||||
|
if [[ -n "${ssh_user}" ]]; then
|
||||||
|
target="${ssh_user}@${cp_host}"
|
||||||
|
fi
|
||||||
|
local ssh_args=(
|
||||||
|
-o BatchMode=yes
|
||||||
|
-o ConnectTimeout=8
|
||||||
|
-o StrictHostKeyChecking=accept-new
|
||||||
|
)
|
||||||
|
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
|
||||||
|
ssh_args+=(-F "${ssh_cfg}")
|
||||||
|
fi
|
||||||
|
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
|
||||||
|
ssh_args+=(-i "${ssh_key}")
|
||||||
|
fi
|
||||||
|
if [[ -n "${ssh_port}" ]]; then
|
||||||
|
ssh_args+=(-p "${ssh_port}")
|
||||||
|
fi
|
||||||
|
|
||||||
|
local remote_cfg
|
||||||
|
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
|
||||||
|
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
|
||||||
|
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
|
||||||
|
chmod 0600 "${kubeconfig_path}"
|
||||||
|
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
|
||||||
|
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_ananke_ssh_identity() {
|
||||||
|
local key_path key_dir key_user key_comment
|
||||||
|
key_path="$(migration_yaml_lookup "ssh_identity_file")"
|
||||||
|
if [[ -z "${key_path}" ]]; then
|
||||||
|
key_path="/home/atlas/.ssh/id_ed25519"
|
||||||
|
fi
|
||||||
|
key_dir="$(dirname "${key_path}")"
|
||||||
|
key_comment="ananke-$(hostname)-forward"
|
||||||
|
|
||||||
|
key_user="root"
|
||||||
|
if [[ "${key_path}" == /home/*/* ]]; then
|
||||||
|
key_user="${key_path#/home/}"
|
||||||
|
key_user="${key_user%%/*}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! id "${key_user}" >/dev/null 2>&1; then
|
||||||
|
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
|
||||||
|
if [[ ! -s "${key_path}" ]]; then
|
||||||
|
echo "[install] generating missing SSH identity at ${key_path}"
|
||||||
|
if [[ "${key_user}" == "root" ]]; then
|
||||||
|
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
||||||
|
else
|
||||||
|
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
|
||||||
|
chmod 0600 "${key_path}" || true
|
||||||
|
chmod 0644 "${key_path}.pub" || true
|
||||||
|
}
|
||||||
|
|
||||||
|
migrate_ananke_config() {
|
||||||
|
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
local changed=0
|
||||||
|
local role_hint
|
||||||
|
role_hint="$(read_ananke_role)"
|
||||||
|
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] migrated ssh_node_users titan-24 override to atlas"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei \
|
||||||
|
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
|
||||||
|
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
|
||||||
|
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
|
||||||
|
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
|
||||||
|
"${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] removed deprecated host-poweroff shutdown config keys"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup node inventory reachability gate defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added state.reports_dir default"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
local peer_host
|
||||||
|
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
|
||||||
|
if [[ -n "${peer_host}" ]]; then
|
||||||
|
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
|
||||||
|
changed=1
|
||||||
|
else
|
||||||
|
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added coordination.peer_hosts empty default"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
local default_restore_cp
|
||||||
|
default_restore_cp="$(first_control_plane_name)"
|
||||||
|
if [[ -z "${default_restore_cp}" ]]; then
|
||||||
|
default_restore_cp="titan-0a"
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup time sync + access reconciliation defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup time sync quorum defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup storage readiness defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup post-start probe + vault key fallback defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup.vault_unseal_key_file default"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] added startup break-glass fallback defaults"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
local role
|
||||||
|
role="$(read_ananke_role)"
|
||||||
|
local inventory_block
|
||||||
|
local managed_block
|
||||||
|
local workers_block
|
||||||
|
workers_block='workers:
|
||||||
|
- titan-04
|
||||||
|
- titan-05
|
||||||
|
- titan-06
|
||||||
|
- titan-07
|
||||||
|
- titan-08
|
||||||
|
- titan-09
|
||||||
|
- titan-10
|
||||||
|
- titan-11
|
||||||
|
- titan-12
|
||||||
|
- titan-13
|
||||||
|
- titan-14
|
||||||
|
- titan-15
|
||||||
|
- titan-17
|
||||||
|
- titan-18
|
||||||
|
- titan-19
|
||||||
|
- titan-20
|
||||||
|
- titan-21
|
||||||
|
- titan-22
|
||||||
|
- titan-24'
|
||||||
|
if [[ "${role}" == "coordinator" ]]; then
|
||||||
|
inventory_block='ssh_node_hosts:
|
||||||
|
titan-db: 192.168.22.10
|
||||||
|
titan-0a: 192.168.22.11
|
||||||
|
titan-0b: 192.168.22.12
|
||||||
|
titan-0c: 192.168.22.13
|
||||||
|
titan-04: 192.168.22.30
|
||||||
|
titan-05: 192.168.22.31
|
||||||
|
titan-06: 192.168.22.32
|
||||||
|
titan-07: 192.168.22.33
|
||||||
|
titan-08: 192.168.22.34
|
||||||
|
titan-09: 192.168.22.35
|
||||||
|
titan-10: 192.168.22.36
|
||||||
|
titan-11: 192.168.22.37
|
||||||
|
titan-12: 192.168.22.40
|
||||||
|
titan-13: 192.168.22.41
|
||||||
|
titan-14: 192.168.22.42
|
||||||
|
titan-15: 192.168.22.43
|
||||||
|
titan-17: 192.168.22.45
|
||||||
|
titan-18: 192.168.22.46
|
||||||
|
titan-19: 192.168.22.47
|
||||||
|
titan-20: 192.168.22.20
|
||||||
|
titan-21: 192.168.22.21
|
||||||
|
titan-22: 192.168.22.22
|
||||||
|
titan-24: 192.168.22.26'
|
||||||
|
managed_block='ssh_managed_nodes:
|
||||||
|
- titan-db
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
|
- titan-04
|
||||||
|
- titan-05
|
||||||
|
- titan-06
|
||||||
|
- titan-07
|
||||||
|
- titan-08
|
||||||
|
- titan-09
|
||||||
|
- titan-10
|
||||||
|
- titan-11
|
||||||
|
- titan-12
|
||||||
|
- titan-13
|
||||||
|
- titan-14
|
||||||
|
- titan-15
|
||||||
|
- titan-17
|
||||||
|
- titan-18
|
||||||
|
- titan-19
|
||||||
|
- titan-20
|
||||||
|
- titan-21
|
||||||
|
- titan-22
|
||||||
|
- titan-24'
|
||||||
|
elif [[ "${role}" == "peer" ]]; then
|
||||||
|
inventory_block='ssh_node_hosts:
|
||||||
|
titan-db: 192.168.22.10
|
||||||
|
titan-0a: 192.168.22.11
|
||||||
|
titan-0b: 192.168.22.12
|
||||||
|
titan-0c: 192.168.22.13
|
||||||
|
titan-04: 192.168.22.30
|
||||||
|
titan-05: 192.168.22.31
|
||||||
|
titan-06: 192.168.22.32
|
||||||
|
titan-07: 192.168.22.33
|
||||||
|
titan-08: 192.168.22.34
|
||||||
|
titan-09: 192.168.22.35
|
||||||
|
titan-10: 192.168.22.36
|
||||||
|
titan-11: 192.168.22.37
|
||||||
|
titan-12: 192.168.22.40
|
||||||
|
titan-13: 192.168.22.41
|
||||||
|
titan-14: 192.168.22.42
|
||||||
|
titan-15: 192.168.22.43
|
||||||
|
titan-17: 192.168.22.45
|
||||||
|
titan-18: 192.168.22.46
|
||||||
|
titan-19: 192.168.22.47
|
||||||
|
titan-20: 192.168.22.20
|
||||||
|
titan-21: 192.168.22.21
|
||||||
|
titan-22: 192.168.22.22
|
||||||
|
titan-24: 192.168.22.26'
|
||||||
|
managed_block='ssh_managed_nodes:
|
||||||
|
- titan-db
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
|
- titan-04
|
||||||
|
- titan-05
|
||||||
|
- titan-06
|
||||||
|
- titan-07
|
||||||
|
- titan-08
|
||||||
|
- titan-09
|
||||||
|
- titan-10
|
||||||
|
- titan-11
|
||||||
|
- titan-12
|
||||||
|
- titan-13
|
||||||
|
- titan-14
|
||||||
|
- titan-15
|
||||||
|
- titan-17
|
||||||
|
- titan-18
|
||||||
|
- titan-19
|
||||||
|
- titan-20
|
||||||
|
- titan-21
|
||||||
|
- titan-22
|
||||||
|
- titan-24'
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "${inventory_block}" ]]; then
|
||||||
|
if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "${managed_block}" ]]; then
|
||||||
|
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${role}" == "peer" ]]; then
|
||||||
|
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
|
||||||
|
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
|
||||||
|
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
|
||||||
|
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${changed}" -eq 1 ]]; then
|
||||||
|
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
sanitize_migrated_ananke_config() {
|
||||||
|
local cfg="${CONF_DIR}/ananke.yaml"
|
||||||
|
[[ -f "${cfg}" ]] || return 0
|
||||||
|
|
||||||
|
local tmp changed=0
|
||||||
|
tmp="$(mktemp)"
|
||||||
|
|
||||||
|
# Legacy migration bug guard:
|
||||||
|
# If root-level "- node" entries were accidentally appended after ssh_managed_nodes,
|
||||||
|
# drop those orphan entries until the next top-level key.
|
||||||
|
awk '
|
||||||
|
BEGIN {in_managed=0}
|
||||||
|
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
|
||||||
|
{
|
||||||
|
if (in_managed) {
|
||||||
|
if ($0 ~ /^ - /) {print; next}
|
||||||
|
if ($0 ~ /^- /) {next}
|
||||||
|
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
|
||||||
|
}
|
||||||
|
print
|
||||||
|
}
|
||||||
|
' "${cfg}" > "${tmp}"
|
||||||
|
|
||||||
|
if ! cmp -s "${cfg}" "${tmp}"; then
|
||||||
|
mv "${tmp}" "${cfg}"
|
||||||
|
changed=1
|
||||||
|
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
|
||||||
|
else
|
||||||
|
rm -f "${tmp}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
|
||||||
|
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
|
||||||
|
changed=1
|
||||||
|
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${changed}" -eq 1 ]]; then
|
||||||
|
chmod 0640 "${cfg}" || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_apt_packages() {
|
||||||
|
local missing=()
|
||||||
|
for pkg in "$@"; do
|
||||||
|
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
|
||||||
|
missing+=("${pkg}")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [[ ${#missing[@]} -eq 0 ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
echo "[install] apt install: ${missing[*]}"
|
||||||
|
export DEBIAN_FRONTEND=noninteractive
|
||||||
|
apt-get update -y
|
||||||
|
apt-get install -y "${missing[@]}"
|
||||||
|
}
|
||||||
|
|
||||||
|
install_kubectl_if_missing() {
|
||||||
|
if command -v kubectl >/dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
ensure_apt_packages kubernetes-client || true
|
||||||
|
if command -v kubectl >/dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
echo "[install] installing kubectl via upstream binary"
|
||||||
|
local arch
|
||||||
|
arch="$(uname -m)"
|
||||||
|
case "${arch}" in
|
||||||
|
x86_64) arch="amd64" ;;
|
||||||
|
aarch64|arm64) arch="arm64" ;;
|
||||||
|
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
|
||||||
|
esac
|
||||||
|
local version
|
||||||
|
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
|
||||||
|
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
|
||||||
|
chmod 0755 /usr/local/bin/kubectl
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_dependencies() {
|
||||||
|
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
|
||||||
|
echo "[install] skipping dependency installation"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
if ! command -v apt-get >/dev/null 2>&1; then
|
||||||
|
echo "This installer currently supports apt-based hosts only." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
|
||||||
|
install_kubectl_if_missing
|
||||||
|
}
|
||||||
|
|
||||||
|
legacy_path_rewrite() {
|
||||||
|
local src="$1"
|
||||||
|
local dst="$2"
|
||||||
|
sed \
|
||||||
|
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
|
||||||
|
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
|
||||||
|
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
|
||||||
|
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
|
||||||
|
-e 's#/opt/hecate#/opt/ananke#g' \
|
||||||
|
-e 's#/etc/hecate#/etc/ananke#g' \
|
||||||
|
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
|
||||||
|
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
|
||||||
|
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
|
||||||
|
-e 's/hecate.yaml/ananke.yaml/g' \
|
||||||
|
-e 's/hecate.lock/ananke.lock/g' \
|
||||||
|
-e 's/hecate/ananke/g' \
|
||||||
|
-e 's/Hecate/Ananke/g' \
|
||||||
|
-e 's#hecate\.lock#ananke.lock#g' \
|
||||||
|
"${src}" > "${dst}"
|
||||||
|
}
|
||||||
|
|
||||||
|
migrate_legacy_hecate_install() {
|
||||||
|
local legacy_conf_dir="/etc/hecate"
|
||||||
|
local legacy_state_dir="/var/lib/hecate"
|
||||||
|
local legacy_systemd_dir="/etc/systemd/system"
|
||||||
|
|
||||||
|
install -d -m 0750 "${CONF_DIR}"
|
||||||
|
install -d -m 0750 "${STATE_DIR}"
|
||||||
|
|
||||||
|
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
|
||||||
|
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
|
||||||
|
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
|
||||||
|
chmod 0640 "${CONF_DIR}/ananke.yaml"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
|
||||||
|
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
|
||||||
|
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
|
||||||
|
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
|
||||||
|
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
|
||||||
|
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
|
||||||
|
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
|
||||||
|
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
|
||||||
|
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
|
||||||
|
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
|
||||||
|
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -d "${legacy_systemd_dir}" ]]; then
|
||||||
|
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
|
||||||
|
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
retire_legacy_hecate_install() {
|
||||||
|
local ts backup_dir
|
||||||
|
ts="$(date +%Y%m%d%H%M%S)"
|
||||||
|
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
|
||||||
|
|
||||||
|
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
|
||||||
|
systemctl stop hecate-update.service >/dev/null 2>&1 || true
|
||||||
|
|
||||||
|
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
|
||||||
|
install -d -m 0750 "${backup_dir}"
|
||||||
|
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
|
||||||
|
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
|
||||||
|
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
|
||||||
|
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
|
||||||
|
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
|
||||||
|
echo "[install] backed up legacy hecate assets to ${backup_dir}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f \
|
||||||
|
/etc/systemd/system/hecate.service \
|
||||||
|
/etc/systemd/system/hecate-bootstrap.service \
|
||||||
|
/etc/systemd/system/hecate-update.service \
|
||||||
|
/etc/systemd/system/hecate-update.timer
|
||||||
|
rm -f /usr/local/bin/hecate
|
||||||
|
rm -rf /usr/local/lib/hecate
|
||||||
|
rm -rf /opt/hecate
|
||||||
|
rm -rf /etc/hecate
|
||||||
|
rm -rf /var/lib/hecate
|
||||||
|
}
|
||||||
|
|
||||||
|
resolve_build_target() {
|
||||||
|
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
|
||||||
|
echo "./cmd/ananke"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
install_config_template() {
|
||||||
|
local template="$1"
|
||||||
|
local dest="$2"
|
||||||
|
local src legacy
|
||||||
|
local -a modern_candidates=()
|
||||||
|
local -a legacy_candidates=()
|
||||||
|
|
||||||
|
case "${template}" in
|
||||||
|
coordinator)
|
||||||
|
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
|
||||||
|
legacy_candidates=("configs/hecate.titan-db.yaml")
|
||||||
|
;;
|
||||||
|
peer)
|
||||||
|
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
|
||||||
|
legacy_candidates=("configs/hecate.tethys.yaml")
|
||||||
|
;;
|
||||||
|
example)
|
||||||
|
modern_candidates=("configs/ananke.example.yaml")
|
||||||
|
legacy_candidates=("configs/hecate.example.yaml")
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "[install] unknown config template key: ${template}" >&2
|
||||||
|
return 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
for src in "${modern_candidates[@]}"; do
|
||||||
|
if [[ -f "${src}" ]]; then
|
||||||
|
install -m 0640 "${src}" "${dest}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
for legacy in "${legacy_candidates[@]}"; do
|
||||||
|
if [[ -f "${legacy}" ]]; then
|
||||||
|
src="$(mktemp)"
|
||||||
|
legacy_path_rewrite "${legacy}" "${src}"
|
||||||
|
install -m 0640 "${src}" "${dest}"
|
||||||
|
rm -f "${src}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
install_systemd_units() {
|
||||||
|
local source_map
|
||||||
|
local tmp
|
||||||
|
|
||||||
|
while IFS='|' read -r target_name modern_name legacy_name; do
|
||||||
|
local modern_src="deploy/systemd/${modern_name}"
|
||||||
|
local legacy_src="deploy/systemd/${legacy_name}"
|
||||||
|
local target="${SYSTEMD_DIR}/${target_name}"
|
||||||
|
|
||||||
|
if [[ -f "${modern_src}" ]]; then
|
||||||
|
install -m 0644 "${modern_src}" "${target}"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -f "${legacy_src}" ]]; then
|
||||||
|
tmp="$(mktemp)"
|
||||||
|
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
||||||
|
install -m 0644 "${tmp}" "${target}"
|
||||||
|
rm -f "${tmp}"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
|
||||||
|
return 1
|
||||||
|
done <<'EOF_UNITS'
|
||||||
|
ananke.service|ananke.service|hecate.service
|
||||||
|
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
|
||||||
|
ananke-update.service|ananke-update.service|hecate-update.service
|
||||||
|
ananke-update.timer|ananke-update.timer|hecate-update.timer
|
||||||
|
EOF_UNITS
|
||||||
|
}
|
||||||
|
|
||||||
|
install_self_update_script() {
|
||||||
|
local modern_src="scripts/ananke-self-update.sh"
|
||||||
|
local legacy_src="scripts/hecate-self-update.sh"
|
||||||
|
local target="${LIB_DIR}/ananke-self-update.sh"
|
||||||
|
local tmp
|
||||||
|
|
||||||
|
if [[ -f "${modern_src}" ]]; then
|
||||||
|
install -m 0755 "${modern_src}" "${target}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -f "${legacy_src}" ]]; then
|
||||||
|
tmp="$(mktemp)"
|
||||||
|
legacy_path_rewrite "${legacy_src}" "${tmp}"
|
||||||
|
sed -Ei \
|
||||||
|
-e 's/HECATE_/ANANKE_/g' \
|
||||||
|
-e 's/hecate-self-update/ananke-self-update/g' \
|
||||||
|
-e 's#/opt/hecate#/opt/ananke#g' \
|
||||||
|
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
|
||||||
|
"${tmp}"
|
||||||
|
install -m 0755 "${tmp}" "${target}"
|
||||||
|
rm -f "${tmp}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[install] missing both modern and legacy self-update scripts." >&2
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
configure_nut() {
|
||||||
|
if [[ "${MANAGE_NUT}" != "1" ]]; then
|
||||||
|
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
|
||||||
|
install -d -m 0755 /etc/nut /etc/udev/rules.d
|
||||||
|
|
||||||
|
cat > /etc/nut/nut.conf <<EOF
|
||||||
|
MODE=standalone
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > /etc/nut/ups.conf <<EOF
|
||||||
|
[${NUT_UPS_NAME}]
|
||||||
|
driver = usbhid-ups
|
||||||
|
port = auto
|
||||||
|
vendorid = ${NUT_VENDOR_ID}
|
||||||
|
productid = ${NUT_PRODUCT_ID}
|
||||||
|
pollinterval = 5
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > /etc/nut/upsd.users <<EOF
|
||||||
|
[${NUT_MONITOR_USER}]
|
||||||
|
password = ${NUT_MONITOR_PASSWORD}
|
||||||
|
upsmon primary
|
||||||
|
EOF
|
||||||
|
chmod 0640 /etc/nut/upsd.users
|
||||||
|
if getent group nut >/dev/null 2>&1; then
|
||||||
|
chown root:nut /etc/nut/upsd.users
|
||||||
|
else
|
||||||
|
chown root:root /etc/nut/upsd.users
|
||||||
|
fi
|
||||||
|
|
||||||
|
cat > /etc/nut/upsmon.conf <<EOF
|
||||||
|
RUN_AS_USER nut
|
||||||
|
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
|
||||||
|
MINSUPPLIES 1
|
||||||
|
SHUTDOWNCMD "/sbin/shutdown -h +0"
|
||||||
|
POLLFREQ 5
|
||||||
|
POLLFREQALERT 5
|
||||||
|
HOSTSYNC 15
|
||||||
|
DEADTIME 15
|
||||||
|
POWERDOWNFLAG /etc/killpower
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
|
||||||
|
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
|
||||||
|
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
|
||||||
|
EOF
|
||||||
|
|
||||||
|
udevadm control --reload-rules || true
|
||||||
|
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
|
||||||
|
|
||||||
|
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
||||||
|
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
|
||||||
|
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
|
||||||
|
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
|
||||||
|
}
|
||||||
|
|
||||||
ensure_dependencies
|
ensure_dependencies
|
||||||
migrate_legacy_hecate_install
|
migrate_legacy_hecate_install
|
||||||
|
|||||||
@ -6,28 +6,9 @@ cd "${REPO_DIR}"
|
|||||||
export PATH="$(go env GOPATH)/bin:${PATH}"
|
export PATH="$(go env GOPATH)/bin:${PATH}"
|
||||||
STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
|
STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
|
||||||
|
|
||||||
run_with_retry() {
|
|
||||||
local attempts="$1"
|
|
||||||
shift
|
|
||||||
local try=1
|
|
||||||
local delay=3
|
|
||||||
local rc=0
|
|
||||||
while true; do
|
|
||||||
"$@" && return 0
|
|
||||||
rc=$?
|
|
||||||
if [[ "${try}" -ge "${attempts}" ]]; then
|
|
||||||
return "${rc}"
|
|
||||||
fi
|
|
||||||
echo "[lint] retry ${try}/${attempts} after rc=${rc}: $*" >&2
|
|
||||||
sleep "${delay}"
|
|
||||||
delay=$((delay * 2))
|
|
||||||
try=$((try + 1))
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
|
if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
|
||||||
echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
|
echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
|
||||||
run_with_retry 4 go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
|
go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[lint] go vet"
|
echo "[lint] go vet"
|
||||||
|
|||||||
@ -77,17 +77,6 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
|
|||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
def _series_exists(pushgateway_url: str, metric: str, labels: dict[str, str], timeout_seconds: float) -> bool:
|
|
||||||
"""Return whether Pushgateway already has a series for this build."""
|
|
||||||
text = _read_http(f"{pushgateway_url.rstrip('/')}/metrics", timeout_seconds)
|
|
||||||
for line in text.splitlines():
|
|
||||||
if not line.startswith(metric + "{"):
|
|
||||||
continue
|
|
||||||
if all(f'{key}="{value}"' in line for key, value in labels.items()):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _build_payload(
|
def _build_payload(
|
||||||
suite: str,
|
suite: str,
|
||||||
trigger: str,
|
trigger: str,
|
||||||
@ -100,25 +89,9 @@ def _build_payload(
|
|||||||
tests_skipped: int,
|
tests_skipped: int,
|
||||||
test_cases: list[tuple[str, str]],
|
test_cases: list[tuple[str, str]],
|
||||||
coverage_percent: float,
|
coverage_percent: float,
|
||||||
source_files_total: int,
|
|
||||||
source_lines_over_500: int,
|
source_lines_over_500: int,
|
||||||
branch: str,
|
|
||||||
build_number: str,
|
|
||||||
jenkins_job: str,
|
|
||||||
checks: dict[str, str],
|
checks: dict[str, str],
|
||||||
) -> str:
|
) -> str:
|
||||||
build_labels = {
|
|
||||||
"suite": suite,
|
|
||||||
"branch": branch,
|
|
||||||
"build_number": build_number or "unknown",
|
|
||||||
"jenkins_job": jenkins_job,
|
|
||||||
}
|
|
||||||
test_case_base_labels = {
|
|
||||||
"suite": suite,
|
|
||||||
"branch": branch,
|
|
||||||
"build_number": build_number or "unknown",
|
|
||||||
"jenkins_job": jenkins_job,
|
|
||||||
}
|
|
||||||
lines = [
|
lines = [
|
||||||
"# TYPE platform_quality_gate_runs_total counter",
|
"# TYPE platform_quality_gate_runs_total counter",
|
||||||
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
|
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
|
||||||
@ -132,29 +105,20 @@ def _build_payload(
|
|||||||
f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
|
f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
|
||||||
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
|
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
|
||||||
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
|
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
|
||||||
"# TYPE platform_quality_gate_source_files_total gauge",
|
|
||||||
f'platform_quality_gate_source_files_total{{suite="{suite}"}} {source_files_total}',
|
|
||||||
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
|
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
|
||||||
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
|
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
|
||||||
"# TYPE platform_quality_gate_build_info gauge",
|
"# TYPE platform_quality_gate_test_case_result gauge",
|
||||||
f"platform_quality_gate_build_info{_label_str(build_labels)} 1",
|
|
||||||
"# TYPE ananke_quality_gate_checks_total gauge",
|
"# TYPE ananke_quality_gate_checks_total gauge",
|
||||||
"# TYPE ananke_quality_gate_publish_info gauge",
|
"# TYPE ananke_quality_gate_publish_info gauge",
|
||||||
f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
|
f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
|
||||||
]
|
]
|
||||||
lines.extend(
|
lines.extend(
|
||||||
f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
|
f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
|
||||||
for check_name, check_status in checks.items()
|
|
||||||
)
|
|
||||||
lines.append("# TYPE platform_quality_gate_test_case_result gauge")
|
|
||||||
if test_cases:
|
|
||||||
lines.extend(
|
|
||||||
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
|
|
||||||
for test_name, test_status in test_cases
|
for test_name, test_status in test_cases
|
||||||
)
|
)
|
||||||
else:
|
lines.extend(
|
||||||
lines.append(
|
f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
|
||||||
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
|
for check_name, check_status in checks.items()
|
||||||
)
|
)
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|
||||||
@ -172,7 +136,8 @@ def _read_coverage_percent(path: str) -> float:
|
|||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
def _iter_source_files(repo_root: Path):
|
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
|
||||||
|
count = 0
|
||||||
for rel_root in SOURCE_SCAN_ROOTS:
|
for rel_root in SOURCE_SCAN_ROOTS:
|
||||||
base = repo_root / rel_root
|
base = repo_root / rel_root
|
||||||
if not base.exists():
|
if not base.exists():
|
||||||
@ -182,37 +147,12 @@ def _iter_source_files(repo_root: Path):
|
|||||||
continue
|
continue
|
||||||
if path.suffix not in SOURCE_EXTENSIONS:
|
if path.suffix not in SOURCE_EXTENSIONS:
|
||||||
continue
|
continue
|
||||||
if path.name.endswith("_test.go") or path.name.endswith(".test.py"):
|
|
||||||
continue
|
|
||||||
yield path
|
|
||||||
|
|
||||||
|
|
||||||
def _count_source_files(repo_root: Path) -> int:
|
|
||||||
return sum(1 for _ in _iter_source_files(repo_root))
|
|
||||||
|
|
||||||
|
|
||||||
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
|
|
||||||
count = 0
|
|
||||||
for path in _iter_source_files(repo_root):
|
|
||||||
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
|
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
|
||||||
if lines > max_lines:
|
if lines > max_lines:
|
||||||
count += 1
|
count += 1
|
||||||
return count
|
return count
|
||||||
|
|
||||||
|
|
||||||
def _unit_tests_failed(output_path: Path, coverage_percent: float) -> bool:
|
|
||||||
if coverage_percent <= 0 or not output_path.exists():
|
|
||||||
return True
|
|
||||||
text = output_path.read_text(encoding="utf-8", errors="ignore")
|
|
||||||
start_marker = "[quality] unit tests + workspace coverage profile"
|
|
||||||
end_marker = "[quality] hygiene: doc contracts"
|
|
||||||
if start_marker in text:
|
|
||||||
text = text.split(start_marker, 1)[1]
|
|
||||||
if end_marker in text:
|
|
||||||
text = text.split(end_marker, 1)[0]
|
|
||||||
return bool(re.search(r"^(--- FAIL:|FAIL\\b)", text, flags=re.M))
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
|
def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
|
return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
|
||||||
@ -226,37 +166,14 @@ def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
|
|||||||
|
|
||||||
|
|
||||||
def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
|
def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
|
||||||
"""Parse per-test status records from go test output text."""
|
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
return []
|
return []
|
||||||
text = output_path.read_text(encoding="utf-8", errors="ignore")
|
text = output_path.read_text(encoding="utf-8", errors="ignore")
|
||||||
cases: list[tuple[str, str]] = []
|
cases: list[tuple[str, str]] = []
|
||||||
patterns = {
|
for match in re.finditer(r"^---\s+(PASS|FAIL|SKIP):\s+(\S+)", text, flags=re.M):
|
||||||
"passed": re.compile(r"^--- PASS: ([^\s(]+)", flags=re.M),
|
raw_status, test_name = match.groups()
|
||||||
"failed": re.compile(r"^--- FAIL: ([^\s(]+)", flags=re.M),
|
status = {"PASS": "passed", "FAIL": "failed", "SKIP": "skipped"}.get(raw_status, "error")
|
||||||
"skipped": re.compile(r"^--- SKIP: ([^\s(]+)", flags=re.M),
|
cases.append((test_name.strip(), status))
|
||||||
}
|
|
||||||
for status, pattern in patterns.items():
|
|
||||||
for test_name in pattern.findall(text):
|
|
||||||
cleaned = str(test_name).strip()
|
|
||||||
if cleaned:
|
|
||||||
cases.append((cleaned, status))
|
|
||||||
if cases:
|
|
||||||
return cases
|
|
||||||
|
|
||||||
# Fallback for non-verbose `go test` output where individual test names are absent.
|
|
||||||
package_cases: list[tuple[str, str]] = []
|
|
||||||
for package_name in re.findall(r"^ok\s+([^\s]+)", text, flags=re.M):
|
|
||||||
cleaned = str(package_name).strip()
|
|
||||||
if cleaned:
|
|
||||||
package_cases.append((f"package::{cleaned}", "passed"))
|
|
||||||
for package_name in re.findall(r"^FAIL\s+([^\s]+)", text, flags=re.M):
|
|
||||||
cleaned = str(package_name).strip()
|
|
||||||
if cleaned:
|
|
||||||
package_cases.append((f"package::{cleaned}", "failed"))
|
|
||||||
if package_cases:
|
|
||||||
deduped = list(dict.fromkeys(package_cases))
|
|
||||||
return deduped
|
|
||||||
return cases
|
return cases
|
||||||
|
|
||||||
|
|
||||||
@ -307,23 +224,17 @@ def _sonarqube_check_status(build_dir: Path) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _supply_chain_check_status(build_dir: Path) -> str:
|
def _supply_chain_check_status(build_dir: Path) -> str:
|
||||||
required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
|
|
||||||
report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
|
report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
|
||||||
if not report:
|
if not report:
|
||||||
return "failed" if required else "not_applicable"
|
return "not_applicable"
|
||||||
compliant = report.get("compliant")
|
compliant = report.get("compliant")
|
||||||
if isinstance(compliant, bool):
|
if isinstance(compliant, bool):
|
||||||
return "ok" if compliant else "failed"
|
return "ok" if compliant else "failed"
|
||||||
status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
|
status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
|
||||||
for value in status_candidates:
|
for value in status_candidates:
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
normalized = value.strip().lower()
|
return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
|
||||||
if normalized in QUALITY_SUCCESS_STATES:
|
return "failed"
|
||||||
return "ok"
|
|
||||||
if normalized in {"n/a", "na", "not_applicable", "not-applicable", "skipped", "skip"}:
|
|
||||||
return "failed" if required else "not_applicable"
|
|
||||||
return "failed" if required else "not_applicable"
|
|
||||||
return "failed" if required else "not_applicable"
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args(argv: list[str]) -> argparse.Namespace:
|
def parse_args(argv: list[str]) -> argparse.Namespace:
|
||||||
@ -367,19 +278,10 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
args = parse_args(argv or sys.argv[1:])
|
args = parse_args(argv or sys.argv[1:])
|
||||||
repo_root = Path(__file__).resolve().parents[1]
|
repo_root = Path(__file__).resolve().parents[1]
|
||||||
build_dir = repo_root / "build"
|
build_dir = repo_root / "build"
|
||||||
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
|
|
||||||
current_ok = 1 if gate_rc == 0 else 0
|
|
||||||
current_failed = 0 if gate_rc == 0 else 1
|
|
||||||
|
|
||||||
branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
|
|
||||||
if branch.startswith("origin/"):
|
|
||||||
branch = branch[len("origin/") :]
|
|
||||||
build_number = os.getenv("BUILD_NUMBER", "")
|
|
||||||
jenkins_job = os.getenv("JOB_NAME", "ananke")
|
|
||||||
remote_ok = 0
|
remote_ok = 0
|
||||||
remote_failed = 0
|
remote_failed = 0
|
||||||
remote_error = ""
|
remote_error = ""
|
||||||
already_recorded = False
|
|
||||||
try:
|
try:
|
||||||
remote_ok = int(
|
remote_ok = int(
|
||||||
_fetch_existing_counter(
|
_fetch_existing_counter(
|
||||||
@ -397,39 +299,21 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
args.timeout_seconds,
|
args.timeout_seconds,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
already_recorded = bool(build_number) and _series_exists(
|
|
||||||
args.pushgateway_url,
|
|
||||||
"platform_quality_gate_build_info",
|
|
||||||
{
|
|
||||||
"job": args.job_name,
|
|
||||||
"suite": args.suite,
|
|
||||||
"branch": branch or "unknown",
|
|
||||||
"build_number": build_number or "unknown",
|
|
||||||
"jenkins_job": jenkins_job,
|
|
||||||
},
|
|
||||||
args.timeout_seconds,
|
|
||||||
)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
remote_error = str(exc)
|
remote_error = str(exc)
|
||||||
|
|
||||||
resolved_ok = remote_ok
|
resolved_ok = max(args.local_ok, remote_ok)
|
||||||
resolved_failed = remote_failed
|
resolved_failed = max(args.local_failed, remote_failed)
|
||||||
if remote_error:
|
|
||||||
resolved_ok = args.local_ok
|
|
||||||
resolved_failed = args.local_failed
|
|
||||||
elif not already_recorded:
|
|
||||||
resolved_ok += current_ok
|
|
||||||
resolved_failed += current_failed
|
|
||||||
coverage_percent = _read_coverage_percent(args.coverage_percent_file)
|
coverage_percent = _read_coverage_percent(args.coverage_percent_file)
|
||||||
source_files_total = _count_source_files(repo_root)
|
|
||||||
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
|
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
|
||||||
quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
|
test_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
|
||||||
tests = _parse_go_test_counts(quality_output)
|
tests = _parse_go_test_counts(test_output)
|
||||||
test_cases = _parse_go_test_cases(quality_output)
|
test_cases = _parse_go_test_cases(test_output)
|
||||||
|
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
|
||||||
docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
|
docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
|
||||||
unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
|
gate_failed = gate_rc != 0
|
||||||
checks = {
|
checks = {
|
||||||
"tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
|
"tests": "failed" if gate_failed or tests["failed"] > 0 else "ok",
|
||||||
"coverage": "ok" if coverage_percent >= 95.0 else "failed",
|
"coverage": "ok" if coverage_percent >= 95.0 else "failed",
|
||||||
"loc": "ok" if source_lines_over_500 == 0 else "failed",
|
"loc": "ok" if source_lines_over_500 == 0 else "failed",
|
||||||
"docs_naming": docs_status,
|
"docs_naming": docs_status,
|
||||||
@ -448,11 +332,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
tests_skipped=tests["skipped"],
|
tests_skipped=tests["skipped"],
|
||||||
test_cases=test_cases,
|
test_cases=test_cases,
|
||||||
coverage_percent=coverage_percent,
|
coverage_percent=coverage_percent,
|
||||||
source_files_total=source_files_total,
|
|
||||||
source_lines_over_500=source_lines_over_500,
|
source_lines_over_500=source_lines_over_500,
|
||||||
branch=branch,
|
|
||||||
build_number=build_number,
|
|
||||||
jenkins_job=jenkins_job,
|
|
||||||
checks=checks,
|
checks=checks,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -465,8 +345,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
|
|
||||||
summary = (
|
summary = (
|
||||||
f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
|
f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
|
||||||
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_files_total={source_files_total} "
|
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_lines_over_500={source_lines_over_500}"
|
||||||
f"source_lines_over_500={source_lines_over_500}"
|
|
||||||
)
|
)
|
||||||
if remote_error:
|
if remote_error:
|
||||||
summary += f" remote_read_error={remote_error}"
|
summary += f" remote_read_error={remote_error}"
|
||||||
|
|||||||
@ -3,11 +3,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import http.server
|
import http.server
|
||||||
from pathlib import Path
|
|
||||||
import socketserver
|
import socketserver
|
||||||
import tempfile
|
|
||||||
import threading
|
import threading
|
||||||
from unittest import mock
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import publish_quality_metrics as publisher
|
import publish_quality_metrics as publisher
|
||||||
@ -61,19 +58,7 @@ class PublishQualityMetricsTest(unittest.TestCase):
|
|||||||
self.server.server_close()
|
self.server.server_close()
|
||||||
self.thread.join(timeout=5)
|
self.thread.join(timeout=5)
|
||||||
|
|
||||||
def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
|
def test_publish_uses_remote_high_water_mark(self) -> None:
|
||||||
tmp_dir = tempfile.TemporaryDirectory()
|
|
||||||
self.addCleanup(tmp_dir.cleanup)
|
|
||||||
rc_path = Path(tmp_dir.name) / "quality-gate.rc"
|
|
||||||
rc_path.write_text(f"{status}\n", encoding="utf-8")
|
|
||||||
return {
|
|
||||||
"ANANKE_QUALITY_EXIT_CODE_PATH": str(rc_path),
|
|
||||||
"ANANKE_QUALITY_COVERAGE_PERCENT_FILE": str(Path(tmp_dir.name) / "coverage.txt"),
|
|
||||||
"ANANKE_QUALITY_OUTPUT_FILE": str(Path(tmp_dir.name) / "quality-gate.out"),
|
|
||||||
"ANANKE_QUALITY_DOCS_STATUS_PATH": str(Path(tmp_dir.name) / "docs-naming.status"),
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_publish_adds_current_run_to_remote_counters(self) -> None:
|
|
||||||
_GatewayHandler.metrics_text = "\n".join(
|
_GatewayHandler.metrics_text = "\n".join(
|
||||||
[
|
[
|
||||||
'# TYPE platform_quality_gate_runs_total counter',
|
'# TYPE platform_quality_gate_runs_total counter',
|
||||||
@ -82,7 +67,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
|
|
||||||
exit_code = publisher.main(
|
exit_code = publisher.main(
|
||||||
[
|
[
|
||||||
"--pushgateway-url",
|
"--pushgateway-url",
|
||||||
@ -104,57 +88,16 @@ class PublishQualityMetricsTest(unittest.TestCase):
|
|||||||
self.assertEqual(len(_GatewayHandler.posts), 1)
|
self.assertEqual(len(_GatewayHandler.posts), 1)
|
||||||
path, body = _GatewayHandler.posts[0]
|
path, body = _GatewayHandler.posts[0]
|
||||||
self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
|
self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
|
||||||
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 8', body)
|
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
|
||||||
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
|
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 2', body)
|
||||||
self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
|
self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
|
||||||
self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
|
self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
|
||||||
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
|
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
|
||||||
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
|
|
||||||
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
|
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
|
||||||
|
|
||||||
def test_publish_does_not_double_count_same_build(self) -> None:
|
|
||||||
_GatewayHandler.metrics_text = "\n".join(
|
|
||||||
[
|
|
||||||
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="ok"} 7',
|
|
||||||
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="failed"} 1',
|
|
||||||
'platform_quality_gate_build_info{job="platform-quality-ci",suite="ananke",branch="main",build_number="78",jenkins_job="ananke"} 1',
|
|
||||||
]
|
|
||||||
)
|
|
||||||
with mock.patch.dict(
|
|
||||||
"os.environ",
|
|
||||||
{
|
|
||||||
**self._env_for_gate_status(0),
|
|
||||||
"BRANCH_NAME": "main",
|
|
||||||
"BUILD_NUMBER": "78",
|
|
||||||
"JOB_NAME": "ananke",
|
|
||||||
},
|
|
||||||
):
|
|
||||||
exit_code = publisher.main(
|
|
||||||
[
|
|
||||||
"--pushgateway-url",
|
|
||||||
self.base_url,
|
|
||||||
"--job-name",
|
|
||||||
"platform-quality-ci",
|
|
||||||
"--suite",
|
|
||||||
"ananke",
|
|
||||||
"--trigger",
|
|
||||||
"host",
|
|
||||||
"--local-ok",
|
|
||||||
"1",
|
|
||||||
"--local-failed",
|
|
||||||
"0",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(exit_code, 0)
|
|
||||||
_, body = _GatewayHandler.posts[0]
|
|
||||||
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
|
|
||||||
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
|
|
||||||
|
|
||||||
def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
|
def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
|
||||||
_GatewayHandler.fail_metrics_read = True
|
_GatewayHandler.fail_metrics_read = True
|
||||||
|
|
||||||
with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
|
|
||||||
exit_code = publisher.main(
|
exit_code = publisher.main(
|
||||||
[
|
[
|
||||||
"--pushgateway-url",
|
"--pushgateway-url",
|
||||||
@ -176,7 +119,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
|
|||||||
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
|
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
|
||||||
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
|
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
|
||||||
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
|
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
|
||||||
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
|
|
||||||
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
|
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -158,9 +158,15 @@ mkdir -p "${BUILD_DIR}"
|
|||||||
rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
|
rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
|
||||||
printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"
|
printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"
|
||||||
|
|
||||||
echo "[quality] dependency download"
|
echo "[quality] unit tests + workspace coverage profile"
|
||||||
export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
|
export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
|
||||||
run_with_retry 4 go mod download
|
run_with_retry 4 go mod download
|
||||||
|
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
|
||||||
|
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
|
||||||
|
if [[ -z "${coverage_percent}" ]]; then
|
||||||
|
coverage_percent="0"
|
||||||
|
fi
|
||||||
|
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
|
||||||
|
|
||||||
echo "[quality] hygiene: doc contracts"
|
echo "[quality] hygiene: doc contracts"
|
||||||
cd testing
|
cd testing
|
||||||
@ -183,14 +189,6 @@ echo "[quality] lint"
|
|||||||
echo "[quality] installer template contracts"
|
echo "[quality] installer template contracts"
|
||||||
./scripts/verify_install_templates.sh
|
./scripts/verify_install_templates.sh
|
||||||
|
|
||||||
echo "[quality] unit tests + workspace coverage profile"
|
|
||||||
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
|
|
||||||
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
|
|
||||||
if [[ -z "${coverage_percent}" ]]; then
|
|
||||||
coverage_percent="0"
|
|
||||||
fi
|
|
||||||
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
|
|
||||||
|
|
||||||
echo "[quality] per-file coverage gate (95%)"
|
echo "[quality] per-file coverage gate (95%)"
|
||||||
cd testing
|
cd testing
|
||||||
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
|
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
|
||||||
|
|||||||
@ -17,12 +17,6 @@ import (
|
|||||||
const maxGoFileLOC = 500
|
const maxGoFileLOC = 500
|
||||||
|
|
||||||
var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
|
var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
|
||||||
var genericFileNameTokens = map[string]struct{}{
|
|
||||||
"chunk": {},
|
|
||||||
"part": {},
|
|
||||||
"piece": {},
|
|
||||||
"split": {},
|
|
||||||
}
|
|
||||||
|
|
||||||
func repoRoot(tb testing.TB) string {
|
func repoRoot(tb testing.TB) string {
|
||||||
tb.Helper()
|
tb.Helper()
|
||||||
@ -67,16 +61,13 @@ func collectGoFiles(tb testing.TB, roots ...string) []string {
|
|||||||
func TestHygieneContracts(t *testing.T) {
|
func TestHygieneContracts(t *testing.T) {
|
||||||
root := repoRoot(t)
|
root := repoRoot(t)
|
||||||
files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
|
files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
|
||||||
namingFiles := append([]string{}, files...)
|
|
||||||
namingFiles = append(namingFiles, collectGoFiles(t, filepath.Join(root, "testing"))...)
|
|
||||||
sort.Strings(files)
|
sort.Strings(files)
|
||||||
sort.Strings(namingFiles)
|
|
||||||
|
|
||||||
t.Run("doc_contract", func(t *testing.T) {
|
t.Run("doc_contract", func(t *testing.T) {
|
||||||
checkDocContracts(t, files)
|
checkDocContracts(t, files)
|
||||||
})
|
})
|
||||||
t.Run("naming_contract", func(t *testing.T) {
|
t.Run("naming_contract", func(t *testing.T) {
|
||||||
checkNamingContracts(t, namingFiles)
|
checkNamingContracts(t, files)
|
||||||
})
|
})
|
||||||
t.Run("loc_limit", func(t *testing.T) {
|
t.Run("loc_limit", func(t *testing.T) {
|
||||||
checkFileLOCLimits(t, files)
|
checkFileLOCLimits(t, files)
|
||||||
@ -130,17 +121,7 @@ func checkNamingContracts(t *testing.T, files []string) {
|
|||||||
if !goFileNamePattern.MatchString(base) {
|
if !goFileNamePattern.MatchString(base) {
|
||||||
t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
|
t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
|
||||||
}
|
}
|
||||||
for _, token := range filenameTokens(base) {
|
|
||||||
if _, ok := genericFileNameTokens[token]; ok {
|
|
||||||
t.Errorf("%s: filename %q uses generic split-file token %q", file, base, token)
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func filenameTokens(name string) []string {
|
|
||||||
trimmed := strings.TrimSuffix(strings.TrimSuffix(name, ".go"), "_test")
|
|
||||||
return strings.Split(trimmed, "_")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// checkFileLOCLimits runs one orchestration or CLI step.
|
// checkFileLOCLimits runs one orchestration or CLI step.
|
||||||
|
|||||||
@ -13,8 +13,6 @@ cmd/ananke/power_safety_test.go
|
|||||||
cmd/ananke/test_helpers_test.go
|
cmd/ananke/test_helpers_test.go
|
||||||
internal/cluster/orchestrator_inventory_test.go
|
internal/cluster/orchestrator_inventory_test.go
|
||||||
internal/cluster/orchestrator_report_test.go
|
internal/cluster/orchestrator_report_test.go
|
||||||
internal/cluster/orchestrator_autorepair_test.go
|
|
||||||
internal/cluster/orchestrator_autorepair_cleanup_test.go
|
|
||||||
internal/cluster/orchestrator_test.go
|
internal/cluster/orchestrator_test.go
|
||||||
internal/cluster/orchestrator_unit_additional_test.go
|
internal/cluster/orchestrator_unit_additional_test.go
|
||||||
internal/cluster/orchestrator_vault_test.go
|
internal/cluster/orchestrator_vault_test.go
|
||||||
@ -23,7 +21,6 @@ internal/config/load_additional_test.go
|
|||||||
internal/config/validate_matrix_test.go
|
internal/config/validate_matrix_test.go
|
||||||
internal/service/daemon_additional_test.go
|
internal/service/daemon_additional_test.go
|
||||||
internal/service/daemon_coverage_closeout_test.go
|
internal/service/daemon_coverage_closeout_test.go
|
||||||
internal/service/daemon_poststart_autorepair_test.go
|
|
||||||
internal/service/daemon_quality_branches_test.go
|
internal/service/daemon_quality_branches_test.go
|
||||||
internal/service/daemon_test.go
|
internal/service/daemon_test.go
|
||||||
internal/sshutil/repair_test.go
|
internal/sshutil/repair_test.go
|
||||||
|
|||||||
@ -363,3 +363,4 @@ func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -79,29 +79,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
|
|
||||||
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
|
|
||||||
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
|
|
||||||
]}`, nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
|
|
||||||
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
|
|
||||||
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
|
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
||||||
@ -168,42 +145,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
|
||||||
cfg.Startup.StuckPodGraceSeconds = 1
|
|
||||||
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
|
|
||||||
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
|
|
||||||
]}`, nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
|
|
||||||
if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
|
|
||||||
t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
|
|
||||||
}
|
|
||||||
failures, err := orch.TestHookStartupFailurePods(context.Background())
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("startup failure pod query: %v", err)
|
|
||||||
}
|
|
||||||
if len(failures) != 0 {
|
|
||||||
t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
|
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
|||||||
@ -19,11 +19,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// newHookOrchestratorWithRunnerMode runs one orchestration or CLI step.
|
// newHookOrchestratorAdvanced runs one orchestration or CLI step.
|
||||||
// Signature: newHookOrchestratorWithRunnerMode(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
|
// Signature: newHookOrchestratorAdvanced(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
|
||||||
// Why: these scenarios needs dry-run and non-dry-run variants while keeping
|
// Why: this part10 matrix needs dry-run and non-dry-run variants while keeping
|
||||||
// command dispatch deterministic from the top-level testing module.
|
// command dispatch deterministic from the top-level testing module.
|
||||||
func newHookOrchestratorWithRunnerMode(
|
func newHookOrchestratorAdvanced(
|
||||||
t *testing.T,
|
t *testing.T,
|
||||||
cfg config.Config,
|
cfg config.Config,
|
||||||
dryRun bool,
|
dryRun bool,
|
||||||
@ -49,11 +49,11 @@ func newHookOrchestratorWithRunnerMode(
|
|||||||
return orch, recorder
|
return orch, recorder
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookVaultLifecycleBranchMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart10LowFileClosure runs one orchestration or CLI step.
|
||||||
// Signature: TestHookVaultLifecycleBranchMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart10LowFileClosure(t *testing.T).
|
||||||
// Why: closes remaining branch gaps on low-coverage orchestrator files using
|
// Why: closes remaining branch gaps on low-coverage orchestrator files using
|
||||||
// targeted hook-level scenarios instead of brittle full-drill reruns.
|
// targeted hook-level scenarios instead of brittle full-drill reruns.
|
||||||
func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
func TestHookGapMatrixPart10LowFileClosure(t *testing.T) {
|
||||||
t.Run("critical-vault-low-branches", func(t *testing.T) {
|
t.Run("critical-vault-low-branches", func(t *testing.T) {
|
||||||
t.Run("vault-sealed-parse-error", func(t *testing.T) {
|
t.Run("vault-sealed-parse-error", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
@ -64,7 +64,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
|
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
|
||||||
t.Fatalf("expected vault status parse error branch, got %v", err)
|
t.Fatalf("expected vault status parse error branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -81,7 +81,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
|
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
|
||||||
t.Fatalf("expected empty decoded unseal key branch, got %v", err)
|
t.Fatalf("expected empty decoded unseal key branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -90,7 +90,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
|
t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.VaultUnsealKeyFile = t.TempDir()
|
cfg.Startup.VaultUnsealKeyFile = t.TempDir()
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
|
||||||
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
|
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
|
||||||
t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
|
t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
|
||||||
}
|
}
|
||||||
@ -105,7 +105,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchNoValue, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runNoValue, runNoValue)
|
orchNoValue, _ := newHookOrchestratorAdvanced(t, cfg, false, runNoValue, runNoValue)
|
||||||
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
|
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
|
||||||
if err != nil || ready {
|
if err != nil || ready {
|
||||||
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
|
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
|
||||||
@ -124,7 +124,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orchEnsureErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runEnsureErr, runEnsureErr)
|
orchEnsureErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runEnsureErr, runEnsureErr)
|
||||||
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
|
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
|
||||||
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
|
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -139,7 +139,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchPhase, _ := newHookOrchestratorWithRunnerMode(t, cfgPhase, false, runPhase, runPhase)
|
orchPhase, _ := newHookOrchestratorAdvanced(t, cfgPhase, false, runPhase, runPhase)
|
||||||
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
|
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
|
||||||
t.Fatalf("expected pod phase guard branch, got %v", err)
|
t.Fatalf("expected pod phase guard branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -170,7 +170,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return runFollowup(ctx, timeout, name, args...)
|
return runFollowup(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchFollowup, _ := newHookOrchestratorWithRunnerMode(t, cfgFollowup, false, runFollowup, runSensitive)
|
orchFollowup, _ := newHookOrchestratorAdvanced(t, cfgFollowup, false, runFollowup, runSensitive)
|
||||||
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
|
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
|
||||||
t.Fatalf("expected follow-up sealed status error branch, got %v", err)
|
t.Fatalf("expected follow-up sealed status error branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -204,7 +204,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
err := orch.TestHookDrainWorkers(context.Background(), workers)
|
err := orch.TestHookDrainWorkers(context.Background(), workers)
|
||||||
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
|
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
|
||||||
t.Fatalf("expected drain aggregation branch, got %v", err)
|
t.Fatalf("expected drain aggregation branch, got %v", err)
|
||||||
@ -217,7 +217,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
cfg.SSHManagedNodes = []string{"titan-db"}
|
cfg.SSHManagedNodes = []string{"titan-db"}
|
||||||
rec := &commandRecorder{}
|
rec := &commandRecorder{}
|
||||||
base := lifecycleDispatcher(rec)
|
base := lifecycleDispatcher(rec)
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
|
||||||
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
|
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
|
||||||
if !rec.contains("atlas@titan-db echo ok") {
|
if !rec.contains("atlas@titan-db echo ok") {
|
||||||
t.Fatalf("expected managed ssh execution branch")
|
t.Fatalf("expected managed ssh execution branch")
|
||||||
@ -233,7 +233,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
|
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
|
||||||
t.Fatalf("expected empty snapshot-list branch, got %v", err)
|
t.Fatalf("expected empty snapshot-list branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -250,7 +250,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchWorkers, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runWorkers, runWorkers)
|
orchWorkers, _ := newHookOrchestratorAdvanced(t, cfg, false, runWorkers, runWorkers)
|
||||||
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
|
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
|
||||||
if err != nil || len(workers) == 0 {
|
if err != nil || len(workers) == 0 {
|
||||||
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
|
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
|
||||||
@ -273,7 +273,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orchWrite, _ := newHookOrchestratorWithRunnerMode(t, cfgWrite, false, runWrite, runWrite)
|
orchWrite, _ := newHookOrchestratorAdvanced(t, cfgWrite, false, runWrite, runWrite)
|
||||||
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
|
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
|
||||||
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
|
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -294,7 +294,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orchReady, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runReady, runReady)
|
orchReady, _ := newHookOrchestratorAdvanced(t, cfg, false, runReady, runReady)
|
||||||
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
|
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
|
||||||
if err != nil || ready || !strings.Contains(detail, "ready=false") {
|
if err != nil || ready || !strings.Contains(detail, "ready=false") {
|
||||||
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
|
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
|
||||||
@ -319,7 +319,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
cancel()
|
cancel()
|
||||||
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
|
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
|
||||||
@ -336,7 +336,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
rec := &commandRecorder{}
|
rec := &commandRecorder{}
|
||||||
base := lifecycleDispatcher(rec)
|
base := lifecycleDispatcher(rec)
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
|
||||||
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
||||||
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
|
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
|
||||||
}
|
}
|
||||||
@ -347,7 +347,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
|
|
||||||
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
|
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
|
||||||
cfgDry := lifecycleConfig(t)
|
cfgDry := lifecycleConfig(t)
|
||||||
orchDry, _ := newHookOrchestratorWithRunnerMode(t, cfgDry, true, nil, nil)
|
orchDry, _ := newHookOrchestratorAdvanced(t, cfgDry, true, nil, nil)
|
||||||
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
|
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
|
||||||
t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
|
t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
|
||||||
}
|
}
|
||||||
@ -365,7 +365,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchFail, _ := newHookOrchestratorWithRunnerMode(t, cfgFail, false, run, run)
|
orchFail, _ := newHookOrchestratorAdvanced(t, cfgFail, false, run, run)
|
||||||
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
|
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
|
||||||
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
|
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -373,7 +373,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
|
|
||||||
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
|
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
orchEmpty, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
|
orchEmpty, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
|
||||||
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
|
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
|
||||||
if err != nil || len(namespaces) != 0 {
|
if err != nil || len(namespaces) != 0 {
|
||||||
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
|
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
|
||||||
@ -386,7 +386,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runErr, runErr)
|
orchErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runErr, runErr)
|
||||||
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
|
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
|
||||||
t.Fatalf("expected ingress query error branch, got %v", err)
|
t.Fatalf("expected ingress query error branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -412,7 +412,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
URL: "http://" + listener.Addr().String() + "/health",
|
URL: "http://" + listener.Addr().String() + "/health",
|
||||||
AcceptedStatuses: []int{200},
|
AcceptedStatuses: []int{200},
|
||||||
}}
|
}}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
|
||||||
ready, detail := orch.TestHookServiceChecklistReady(context.Background())
|
ready, detail := orch.TestHookServiceChecklistReady(context.Background())
|
||||||
if ready || !strings.Contains(detail, "http://") {
|
if ready || !strings.Contains(detail, "http://") {
|
||||||
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
|
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
|
||||||
@ -435,7 +435,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
cancel()
|
cancel()
|
||||||
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
|
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
|
||||||
@ -456,7 +456,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
cancel()
|
cancel()
|
||||||
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
|
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
|
||||||
@ -478,7 +478,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
|
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
|
||||||
t.Fatalf("expected resume flux warning-only branch, got %v", err)
|
t.Fatalf("expected resume flux warning-only branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -505,7 +505,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
cancel()
|
cancel()
|
||||||
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
|
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
|
||||||
@ -532,14 +532,14 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
|
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
|
||||||
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
|
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
|
||||||
t.Fatalf("expected workload convergence default-branch success, got %v", err)
|
t.Fatalf("expected workload convergence default-branch success, got %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
cfgIgnore := lifecycleConfig(t)
|
cfgIgnore := lifecycleConfig(t)
|
||||||
cfgIgnore.Startup.AutoRecycleStuckPods = false
|
cfgIgnore.Startup.AutoRecycleStuckPods = false
|
||||||
orchIgnoreDry, _ := newHookOrchestratorWithRunnerMode(t, cfgIgnore, true, run, run)
|
orchIgnoreDry, _ := newHookOrchestratorAdvanced(t, cfgIgnore, true, run, run)
|
||||||
now := time.Now().UTC().Add(-time.Hour)
|
now := time.Now().UTC().Add(-time.Hour)
|
||||||
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
|
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
|
||||||
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
|
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
|
||||||
@ -551,7 +551,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
}
|
}
|
||||||
orchHealErr, _ := newHookOrchestratorWithRunnerMode(t, lifecycleConfig(t), false, runHealErr, runHealErr)
|
orchHealErr, _ := newHookOrchestratorAdvanced(t, lifecycleConfig(t), false, runHealErr, runHealErr)
|
||||||
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
|
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
|
||||||
t.Fatalf("expected critical workload heal query-error branch, got %v", err)
|
t.Fatalf("expected critical workload heal query-error branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -20,7 +20,7 @@ import (
|
|||||||
|
|
||||||
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
|
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
|
||||||
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
|
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
|
||||||
// Why: lifecycle cleanup scenarios need direct control over runner dry-run and kubeconfig branches.
|
// Why: part11 needs direct control over runner dry-run and kubeconfig branches.
|
||||||
func newLifecycleMatrixOrchestrator(
|
func newLifecycleMatrixOrchestrator(
|
||||||
t *testing.T,
|
t *testing.T,
|
||||||
cfg config.Config,
|
cfg config.Config,
|
||||||
@ -49,11 +49,11 @@ func newLifecycleMatrixOrchestrator(
|
|||||||
return orch
|
return orch
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookLifecycleCleanupRemainingClosure runs one orchestration or CLI step.
|
// TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
|
||||||
// Signature: TestHookLifecycleCleanupRemainingClosure(t *testing.T).
|
// Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
|
||||||
// Why: closes final branch gaps for lifecycle + remaining near-threshold
|
// Why: closes final branch gaps for lifecycle + remaining near-threshold
|
||||||
// orchestrator files so per-file coverage reaches the enforced 95% target.
|
// orchestrator files so per-file coverage reaches the enforced 95% target.
|
||||||
func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
|
func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
|
||||||
t.Run("critical-vault-final-closures", func(t *testing.T) {
|
t.Run("critical-vault-final-closures", func(t *testing.T) {
|
||||||
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
|
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
|
|||||||
switch {
|
switch {
|
||||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||||
apiVersionCalls++
|
apiVersionCalls++
|
||||||
if apiVersionCalls <= 2 {
|
if apiVersionCalls == 1 {
|
||||||
return "", errors.New("api down")
|
return "", errors.New("api down")
|
||||||
}
|
}
|
||||||
return "v1.31.0", nil
|
return "v1.31.0", nil
|
||||||
@ -17,11 +17,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookTimesyncAndStabilityMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
|
||||||
// Signature: TestHookTimesyncAndStabilityMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
|
||||||
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
|
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
|
||||||
// branches from the top-level testing module.
|
// branches from the top-level testing module.
|
||||||
func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
|
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
|
||||||
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
|
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
line string
|
line string
|
||||||
@ -162,11 +162,11 @@ func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookFluxScalingReportMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
|
||||||
// Signature: TestHookFluxScalingReportMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
|
||||||
// Why: targets low branch density in flux-health, scaling snapshot handling,
|
// Why: targets low branch density in flux-health, scaling snapshot handling,
|
||||||
// and report sanitization helpers.
|
// and report sanitization helpers.
|
||||||
func TestHookFluxScalingReportMatrix(t *testing.T) {
|
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
|
||||||
t.Run("flux-helper-matrix", func(t *testing.T) {
|
t.Run("flux-helper-matrix", func(t *testing.T) {
|
||||||
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
|
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
|
||||||
t.Fatalf("expected immutable matcher true for uppercase+job variant")
|
t.Fatalf("expected immutable matcher true for uppercase+job variant")
|
||||||
@ -241,11 +241,11 @@ func TestHookFluxScalingReportMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookVaultAndCoordinationMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
|
||||||
// Signature: TestHookVaultAndCoordinationMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
|
||||||
// Why: raises branch coverage on vault/key and coordination helpers without
|
// Why: raises branch coverage on vault/key and coordination helpers without
|
||||||
// requiring package-local tests.
|
// requiring package-local tests.
|
||||||
func TestHookVaultAndCoordinationMatrix(t *testing.T) {
|
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
|
||||||
t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
|
t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.VaultUnsealKeyFile = ""
|
cfg.Startup.VaultUnsealKeyFile = ""
|
||||||
@ -296,11 +296,11 @@ func TestHookVaultAndCoordinationMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookWorkloadIgnoreMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
|
||||||
// Signature: TestHookWorkloadIgnoreMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
|
||||||
// Why: expands low branch coverage in workload ignore helpers and startup-failure
|
// Why: expands low branch coverage in workload ignore helpers and startup-failure
|
||||||
// pod classification.
|
// pod classification.
|
||||||
func TestHookWorkloadIgnoreMatrix(t *testing.T) {
|
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
|
||||||
t.Run("ignored-node-helper-matrix", func(t *testing.T) {
|
t.Run("ignored-node-helper-matrix", func(t *testing.T) {
|
||||||
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
|
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
|
||||||
t.Fatalf("expected selector-host ignored match")
|
t.Fatalf("expected selector-host ignored match")
|
||||||
@ -11,11 +11,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/config"
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookConvergenceAndStabilityMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart3ConvergenceAndStability runs one orchestration or CLI step.
|
||||||
// Signature: TestHookConvergenceAndStabilityMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
|
||||||
// Why: raises coverage for startup convergence orchestration and stability gates
|
// Why: raises coverage for startup convergence orchestration and stability gates
|
||||||
// that determine whether startup is considered truly complete.
|
// that determine whether startup is considered truly complete.
|
||||||
func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
|
func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
|
||||||
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
|
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
|
||||||
cfgIngress := lifecycleConfig(t)
|
cfgIngress := lifecycleConfig(t)
|
||||||
cfgIngress.Startup.RequireIngressChecklist = true
|
cfgIngress.Startup.RequireIngressChecklist = true
|
||||||
@ -108,11 +108,11 @@ func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookLifecycleRestoreShutdownMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart3LifecycleRestoreShutdown runs one orchestration or CLI step.
|
||||||
// Signature: TestHookLifecycleRestoreShutdownMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
|
||||||
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in
|
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in
|
||||||
// failure-focused drill tests.
|
// failure-focused drill tests.
|
||||||
func TestHookLifecycleRestoreShutdownMatrix(t *testing.T) {
|
func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
|
||||||
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
|
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
|
||||||
cfgDry := lifecycleConfig(t)
|
cfgDry := lifecycleConfig(t)
|
||||||
dry := newDryRunHookOrchestrator(t, cfgDry, nil)
|
dry := newDryRunHookOrchestrator(t, cfgDry, nil)
|
||||||
@ -19,11 +19,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookCoordinationAndReachabilityMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
|
||||||
// Signature: TestHookCoordinationAndReachabilityMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
|
||||||
// Why: closes remaining coordination/reachability low branches with deterministic
|
// Why: closes remaining coordination/reachability low branches with deterministic
|
||||||
// command responses and short timeouts.
|
// command responses and short timeouts.
|
||||||
func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
|
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
|
||||||
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
|
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Coordination.PeerHosts = []string{"titan-24"}
|
cfg.Coordination.PeerHosts = []string{"titan-24"}
|
||||||
@ -136,11 +136,11 @@ func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookIngressServiceAndPostStartMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
|
||||||
// Signature: TestHookIngressServiceAndPostStartMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
|
||||||
// Why: drives ingress/service checklist and post-start branches that were still
|
// Why: drives ingress/service checklist and post-start branches that were still
|
||||||
// under-covered after drill-focused matrix tests.
|
// under-covered after drill-focused matrix tests.
|
||||||
func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
|
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
|
||||||
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
|
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
||||||
@ -233,11 +233,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookReportScalingStorageDrainMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
|
||||||
// Signature: TestHookReportScalingStorageDrainMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
|
||||||
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
|
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
|
||||||
// are difficult to hit from happy-path lifecycle drills.
|
// are difficult to hit from happy-path lifecycle drills.
|
||||||
func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
|
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
|
||||||
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
|
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
|
reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
|
||||||
@ -339,11 +339,11 @@ func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookTimesyncLifecycleAndAccessMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
|
||||||
// Signature: TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
|
||||||
// Why: closes remaining timing/access/lifecycle branches that still sat below
|
// Why: closes remaining timing/access/lifecycle branches that still sat below
|
||||||
// target after the earlier matrices.
|
// target after the earlier matrices.
|
||||||
func TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T) {
|
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
|
||||||
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
|
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.TimeSyncMode = "quorum"
|
cfg.Startup.TimeSyncMode = "quorum"
|
||||||
@ -20,11 +20,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookEndpointHealingCoverageClosure runs one orchestration or CLI step.
|
// TestHookGapMatrixPart5CoverageClosure runs one orchestration or CLI step.
|
||||||
// Signature: TestHookEndpointHealingCoverageClosure(t *testing.T).
|
// Signature: TestHookGapMatrixPart5CoverageClosure(t *testing.T).
|
||||||
// Why: closes branch gaps that still remained after drill-style tests by driving
|
// Why: closes branch gaps that still remained after drill-style tests by driving
|
||||||
// low-coverage orchestrator internals through the exported top-level hook surface.
|
// low-coverage orchestrator internals through the exported top-level hook surface.
|
||||||
func TestHookEndpointHealingCoverageClosure(t *testing.T) {
|
func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
|
||||||
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
|
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
|
||||||
t.Run("empty-namespace-service-noop", func(t *testing.T) {
|
t.Run("empty-namespace-service-noop", func(t *testing.T) {
|
||||||
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
|
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
|
||||||
@ -491,10 +491,10 @@ func httpStatusHandler(code int, body string) func(http.ResponseWriter, *http.Re
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookIngressHostMappingRegression runs one orchestration or CLI step.
|
// TestHookGapMatrixPart5IngressHostMappingRegression runs one orchestration or CLI step.
|
||||||
// Signature: TestHookIngressHostMappingRegression(t *testing.T).
|
// Signature: TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T).
|
||||||
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
|
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
|
||||||
func TestHookIngressHostMappingRegression(t *testing.T) {
|
func TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
|
||||||
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},
|
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},
|
||||||
@ -16,11 +16,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookVaultPostStartBranchMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart6CoverageClosure runs one orchestration or CLI step.
|
||||||
// Signature: TestHookVaultPostStartBranchMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart6CoverageClosure(t *testing.T).
|
||||||
// Why: targets the remaining low branch paths after endpoint-healing coverage so per-file coverage
|
// Why: targets the remaining low branch paths after part5 so per-file coverage
|
||||||
// can move toward the strict 95% quality gate.
|
// can move toward the strict 95% quality gate.
|
||||||
func TestHookVaultPostStartBranchMatrix(t *testing.T) {
|
func TestHookGapMatrixPart6CoverageClosure(t *testing.T) {
|
||||||
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
|
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
|
||||||
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
|
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
@ -14,11 +14,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookWorkloadStorageAccessMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart7CoverageClosure runs one orchestration or CLI step.
|
||||||
// Signature: TestHookWorkloadStorageAccessMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
|
||||||
// Why: closes additional low-coverage branches in convergence, storage, access,
|
// Why: closes additional low-coverage branches in convergence, storage, access,
|
||||||
// flux, lifecycle, and sensitive command wrappers.
|
// flux, lifecycle, and sensitive command wrappers.
|
||||||
func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
|
func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
|
||||||
t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
|
t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
||||||
@ -165,32 +165,6 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
|
|||||||
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
|
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.RequireNodeSSHAuth = true
|
|
||||||
cfg.Startup.NodeSSHAuthWaitSeconds = 1
|
|
||||||
cfg.Startup.NodeSSHAuthPollSeconds = 1
|
|
||||||
cfg.Startup.NodeInventoryReachWaitSeconds = 1
|
|
||||||
cfg.Startup.NodeInventoryReachPollSeconds = 1
|
|
||||||
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
|
|
||||||
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
|
|
||||||
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
|
|
||||||
return "", errors.New("no route to host")
|
|
||||||
}
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
|
|
||||||
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
|
|
||||||
}
|
|
||||||
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
|
|
||||||
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
|
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
|
||||||
@ -19,11 +19,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookAccessVaultLifecycleMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
|
||||||
// Signature: TestHookAccessVaultLifecycleMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
|
||||||
// Why: closes additional low-coverage branches in access, vault, lifecycle,
|
// Why: closes additional low-coverage branches in access, vault, lifecycle,
|
||||||
// ingress/service stability, and timesync/inventory orchestration paths.
|
// ingress/service stability, and timesync/inventory orchestration paths.
|
||||||
func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
|
func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
|
||||||
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
|
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.RequireNodeSSHAuth = true
|
cfg.Startup.RequireNodeSSHAuth = true
|
||||||
@ -331,11 +331,11 @@ func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHookLifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
|
// TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
|
||||||
// Signature: TestHookLifecycleStartupAutoRestoreBranch(t *testing.T).
|
// Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
|
||||||
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise
|
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise
|
||||||
// hard to exercise in deterministic top-level tests.
|
// hard to exercise in deterministic top-level tests.
|
||||||
func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
|
func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
||||||
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
||||||
@ -384,7 +384,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||||
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lifecycle-auto-restore"})
|
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("expected startup auto-restore path success, got %v", err)
|
t.Fatalf("expected startup auto-restore path success, got %v", err)
|
||||||
}
|
}
|
||||||
@ -394,7 +394,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
|
|||||||
|
|
||||||
cfgBadMode := lifecycleConfig(t)
|
cfgBadMode := lifecycleConfig(t)
|
||||||
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
|
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
|
||||||
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lifecycle", Mode: "unknown-mode"})
|
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
|
||||||
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
|
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
|
||||||
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
|
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
|
||||||
}
|
}
|
||||||
@ -16,11 +16,11 @@ import (
|
|||||||
"scm.bstein.dev/bstein/ananke/internal/state"
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TestHookAccessCoordinationEndpointsMatrix runs one orchestration or CLI step.
|
// TestHookGapMatrixPart9AccessCoordinationEndpoints runs one orchestration or CLI step.
|
||||||
// Signature: TestHookAccessCoordinationEndpointsMatrix(t *testing.T).
|
// Signature: TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T).
|
||||||
// Why: closes uncovered statement ranges in access/fluxsource, coordination,
|
// Why: closes uncovered statement ranges in access/fluxsource, coordination,
|
||||||
// and critical-endpoint orchestration helpers.
|
// and critical-endpoint orchestration helpers.
|
||||||
func TestHookAccessCoordinationEndpointsMatrix(t *testing.T) {
|
func TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T) {
|
||||||
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
|
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Shutdown.SSHParallelism = 0
|
cfg.Shutdown.SSHParallelism = 0
|
||||||
@ -53,48 +53,6 @@ func TestHookIngressServiceMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
|
||||||
"titan-09": {
|
|
||||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
|
|
||||||
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
|
|
||||||
}
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
|
||||||
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
|
||||||
"titan-09": {
|
|
||||||
"ananke.bstein.dev/harbor-bootstrap": "true",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
|
|
||||||
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
|
|
||||||
}
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
|
||||||
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
|
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
|
||||||
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
w.WriteHeader(http.StatusOK)
|
w.WriteHeader(http.StatusOK)
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"net"
|
"net"
|
||||||
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
@ -124,25 +125,20 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
|
|||||||
|
|
||||||
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
|
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
|
||||||
cfg := lifecycleFastConfig(t)
|
cfg := lifecycleFastConfig(t)
|
||||||
cfg.Startup.RequireNodeInventoryReach = false
|
cfg.Startup.ShutdownCooldownSeconds = 1
|
||||||
cfg.Startup.ShutdownCooldownSeconds = 5
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
||||||
reads := 0
|
|
||||||
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
|
|
||||||
if path != cfg.State.IntentPath {
|
|
||||||
return state.TestHookReadIntentDefault(path)
|
|
||||||
}
|
|
||||||
reads++
|
|
||||||
if reads == 1 {
|
|
||||||
return state.Intent{
|
|
||||||
State: state.IntentShutdownComplete,
|
State: state.IntentShutdownComplete,
|
||||||
Reason: "recent",
|
Reason: "recent",
|
||||||
Source: "test",
|
Source: "test",
|
||||||
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
|
UpdatedAt: time.Now().UTC(),
|
||||||
}, nil
|
}); err != nil {
|
||||||
|
t.Fatalf("seed cooldown intent: %v", err)
|
||||||
}
|
}
|
||||||
return state.Intent{}, errors.New("forced reread failure")
|
go func(intentPath string) {
|
||||||
})
|
time.Sleep(150 * time.Millisecond)
|
||||||
t.Cleanup(restoreRead)
|
_ = os.Remove(intentPath)
|
||||||
|
_ = os.Mkdir(intentPath, 0o755)
|
||||||
|
}(cfg.State.IntentPath)
|
||||||
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||||||
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
|
||||||
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
|
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
|
||||||
@ -152,30 +148,24 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
|
|||||||
|
|
||||||
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
|
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
|
||||||
cfg := lifecycleFastConfig(t)
|
cfg := lifecycleFastConfig(t)
|
||||||
cfg.Startup.RequireNodeInventoryReach = false
|
cfg.Startup.ShutdownCooldownSeconds = 1
|
||||||
cfg.Startup.ShutdownCooldownSeconds = 5
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
||||||
reads := 0
|
|
||||||
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
|
|
||||||
if path != cfg.State.IntentPath {
|
|
||||||
return state.TestHookReadIntentDefault(path)
|
|
||||||
}
|
|
||||||
reads++
|
|
||||||
if reads == 1 {
|
|
||||||
return state.Intent{
|
|
||||||
State: state.IntentShutdownComplete,
|
State: state.IntentShutdownComplete,
|
||||||
Reason: "recent",
|
Reason: "recent",
|
||||||
Source: "test",
|
Source: "test",
|
||||||
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
|
UpdatedAt: time.Now().UTC(),
|
||||||
}, nil
|
}); err != nil {
|
||||||
|
t.Fatalf("seed cooldown intent: %v", err)
|
||||||
}
|
}
|
||||||
return state.Intent{
|
go func(intentPath string) {
|
||||||
|
time.Sleep(150 * time.Millisecond)
|
||||||
|
_ = state.WriteIntent(intentPath, state.Intent{
|
||||||
State: state.IntentShuttingDown,
|
State: state.IntentShuttingDown,
|
||||||
Reason: "peer-shutdown",
|
Reason: "peer-shutdown",
|
||||||
Source: "test",
|
Source: "test",
|
||||||
UpdatedAt: time.Now().UTC(),
|
UpdatedAt: time.Now().UTC(),
|
||||||
}, nil
|
|
||||||
})
|
})
|
||||||
t.Cleanup(restoreRead)
|
}(cfg.State.IntentPath)
|
||||||
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||||||
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
|
||||||
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
|
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
|
||||||
|
|||||||
@ -1,432 +0,0 @@
|
|||||||
package orchestrator
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestHookSchedulingStormHelpers runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormHelpers(t *testing.T).
|
|
||||||
// Why: keeps scheduling-storm helper coverage in the split top-level testing module
|
|
||||||
// required by the repo hygiene contract.
|
|
||||||
func TestHookSchedulingStormHelpers(t *testing.T) {
|
|
||||||
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "ollama-rs", "Deployment", "ollama"); !ok || got != "ai/deployment/ollama" {
|
|
||||||
t.Fatalf("unexpected deployment owner resolution: got=%q ok=%v", got, ok)
|
|
||||||
}
|
|
||||||
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("storage", "StatefulSet", "nextcloud", "", ""); !ok || got != "storage/statefulset/nextcloud" {
|
|
||||||
t.Fatalf("unexpected statefulset owner resolution: got=%q ok=%v", got, ok)
|
|
||||||
}
|
|
||||||
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "missing", "", ""); ok || got != "" {
|
|
||||||
t.Fatalf("expected missing replicaset owner lookup to fail, got=%q ok=%v", got, ok)
|
|
||||||
}
|
|
||||||
|
|
||||||
if got := cluster.TestHookEventObservationCount(3, 9); got != 9 {
|
|
||||||
t.Fatalf("expected series count to win, got %d", got)
|
|
||||||
}
|
|
||||||
if got := cluster.TestHookEventObservationCount(0, 0); got != 1 {
|
|
||||||
t.Fatalf("expected zero-count normalization to 1, got %d", got)
|
|
||||||
}
|
|
||||||
|
|
||||||
now := time.Now().UTC().Round(time.Second)
|
|
||||||
if got := cluster.TestHookEventLastObservedAt(now, now.Add(-time.Minute), now.Add(-2*time.Minute), now.Add(-3*time.Minute)); !got.Equal(now) {
|
|
||||||
t.Fatalf("expected series timestamp priority, got %s", got)
|
|
||||||
}
|
|
||||||
if got := cluster.TestHookEventLastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)); !got.Equal(now) {
|
|
||||||
t.Fatalf("expected lastTimestamp fallback, got %s", got)
|
|
||||||
}
|
|
||||||
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, now, now.Add(-time.Minute)); !got.Equal(now) {
|
|
||||||
t.Fatalf("expected eventTime fallback, got %s", got)
|
|
||||||
}
|
|
||||||
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, time.Time{}, now); !got.Equal(now) {
|
|
||||||
t.Fatalf("expected creationTimestamp fallback, got %s", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSchedulingStormQuarantine runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormQuarantine(t *testing.T).
|
|
||||||
// Why: verifies that only non-core workloads generating real scheduling storms
|
|
||||||
// are auto-quarantined, which prevents event/Kine churn from spiking control-plane CPU.
|
|
||||||
func TestHookSchedulingStormQuarantine(t *testing.T) {
|
|
||||||
now := time.Now().UTC().Format(time.RFC3339)
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
cfg.Startup.SchedulingStormEventThreshold = 30
|
|
||||||
cfg.Startup.SchedulingStormWindowSeconds = 180
|
|
||||||
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault"}
|
|
||||||
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
|
|
||||||
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
|
|
||||||
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
|
|
||||||
scaledOllama := false
|
|
||||||
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}},
|
|
||||||
{"metadata":{"namespace":"vault","name":"vault-0","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{},"status":{"phase":"Pending"}},
|
|
||||||
{"metadata":{"namespace":"ignored-ns","name":"skip-pod","ownerReferences":[{"kind":"ReplicaSet","name":"skip-rs"}]},"spec":{},"status":{"phase":"Pending"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"ignore-me-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me-rs"}]},"spec":{},"status":{"phase":"Pending"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"ignored-node-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignored-node-rs"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Pending"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"running-pod","ownerReferences":[{"kind":"ReplicaSet","name":"running-rs"}]},"spec":{},"status":{"phase":"Running"}}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}},
|
|
||||||
{"metadata":{"namespace":"ignored-ns","name":"skip-rs","ownerReferences":[{"kind":"Deployment","name":"skip"}]}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"ignore-me-rs","ownerReferences":[{"kind":"Deployment","name":"ignore-me"}]}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"ignored-node-rs","ownerReferences":[{"kind":"Deployment","name":"ignored-node"}]}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"running-rs","ownerReferences":[{"kind":"Deployment","name":"running"}]}}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
|
|
||||||
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"vault","name":"vault-0"},"type":"Warning","reason":"FailedScheduling","count":45},
|
|
||||||
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ignored-ns","name":"skip-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
|
|
||||||
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignore-me-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
|
|
||||||
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignored-node-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
|
|
||||||
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"running-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
|
|
||||||
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"stale-pod"},"type":"Warning","reason":"FailedScheduling","count":99}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}},
|
|
||||||
{"kind":"StatefulSet","metadata":{"namespace":"vault","name":"vault"},"spec":{"replicas":1}},
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1}},
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1}},
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignored-node"},"spec":{"replicas":1}},
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"running"},"spec":{"replicas":1}}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
|
|
||||||
scaledOllama = true
|
|
||||||
return "", nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
orch.TestHookBeginStartupReport("scheduling-storm")
|
|
||||||
defer orch.TestHookFinalizeStartupReport(nil)
|
|
||||||
|
|
||||||
if err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background()); err != nil {
|
|
||||||
t.Fatalf("quarantine scheduling storm workloads: %v", err)
|
|
||||||
}
|
|
||||||
if !scaledOllama {
|
|
||||||
t.Fatalf("expected ollama deployment to be scaled to zero")
|
|
||||||
}
|
|
||||||
progress := readStartupProgress(t, orch)
|
|
||||||
if !strings.Contains(progress, "ollama") {
|
|
||||||
t.Fatalf("expected startup progress to mention ollama quarantine, payload=%s", progress)
|
|
||||||
}
|
|
||||||
if strings.Contains(progress, "vault") || strings.Contains(progress, "ignore-me") || strings.Contains(progress, "ignored-node") {
|
|
||||||
t.Fatalf("expected only the non-core eligible workload to be quarantined, payload=%s", progress)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSchedulingStormTriggerGuards runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormTriggerGuards(t *testing.T).
|
|
||||||
// Why: covers dry-run/disabled/rate-limit guards so the scheduler-storm auto-heal
|
|
||||||
// only activates when the cluster is actually suffering this exact failure mode.
|
|
||||||
func TestHookSchedulingStormTriggerGuards(t *testing.T) {
|
|
||||||
cfgDisabled := lifecycleConfig(t)
|
|
||||||
orchDisabled, _ := newHookOrchestrator(t, cfgDisabled, nil, nil)
|
|
||||||
lastAttempt := time.Time{}
|
|
||||||
orchDisabled.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
|
|
||||||
if !lastAttempt.IsZero() {
|
|
||||||
t.Fatalf("expected disabled scheduling-storm trigger to be skipped")
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgDry := lifecycleConfig(t)
|
|
||||||
cfgDry.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
|
|
||||||
orchDry.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
|
|
||||||
if !lastAttempt.IsZero() {
|
|
||||||
t.Fatalf("expected dry-run scheduling-storm trigger to be skipped")
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgRate := lifecycleConfig(t)
|
|
||||||
cfgRate.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
cfgRate.Startup.SchedulingStormEventThreshold = 5
|
|
||||||
cfgRate.Startup.SchedulingStormWindowSeconds = 60
|
|
||||||
recorder := &commandRecorder{}
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
recorder.record(name, args)
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(recorder)(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orchRate, _ := newHookOrchestrator(t, cfgRate, run, run)
|
|
||||||
lastAttempt = time.Now()
|
|
||||||
orchRate.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
|
|
||||||
if recorder.contains("get pods -A -o json") {
|
|
||||||
t.Fatalf("expected rate-limited scheduling-storm trigger to skip kubectl scans")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSchedulingStormTriggerAndNoOpBranches runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T).
|
|
||||||
// Why: raises scheduling-storm branch coverage on the success/no-op paths so the
|
|
||||||
// auto-heal only acts on genuine event storms and stays quiet otherwise.
|
|
||||||
func TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
cfg.Startup.SchedulingStormEventThreshold = 0
|
|
||||||
cfg.Startup.SchedulingStormWindowSeconds = 0
|
|
||||||
|
|
||||||
scanRan := false
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
scanRan = true
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"namespace":"","name":"missing"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"no-owner"},"spec":{},"status":{"phase":"Pending"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"done","ownerReferences":[{"kind":"ReplicaSet","name":"done-rs"}]},"spec":{},"status":{"phase":"Running"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"zero-replicas","ownerReferences":[{"kind":"ReplicaSet","name":"zero-rs"}]},"spec":{},"status":{"phase":"Pending"}}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"namespace":"","name":"bad-rs"}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"done-rs","ownerReferences":[{"kind":"","name":"ignored"}]}},
|
|
||||||
{"metadata":{"namespace":"monitoring","name":"zero-rs","ownerReferences":[{"kind":"Deployment","name":"zero"}]}}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"normal"},"type":"Normal","reason":"FailedScheduling","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"wrong-reason"},"type":"Warning","reason":"SomeOtherReason","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Service","namespace":"monitoring","name":"wrong-kind"},"type":"Warning","reason":"FailedScheduling","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"old"},"type":"Warning","reason":"FailedScheduling","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"low-count"},"type":"Warning","reason":"FailedScheduling","count":1},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"missing-pod"},"type":"Warning","reason":"FailedScheduling","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"done"},"type":"Warning","reason":"FailedScheduling","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"no-owner"},"type":"Warning","reason":"FailedScheduling","count":99},
|
|
||||||
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"zero-replicas"},"type":"Warning","reason":"FailedScheduling","count":99}
|
|
||||||
]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
||||||
return `{"items":[
|
|
||||||
{"kind":"","metadata":{"namespace":"monitoring","name":"blank-kind"}},
|
|
||||||
{"kind":"Job","metadata":{"namespace":"monitoring","name":"unsupported"}},
|
|
||||||
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"zero"},"spec":{"replicas":0}}
|
|
||||||
]}`, nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
orch.TestHookBeginStartupReport("scheduling-storm-noop")
|
|
||||||
defer orch.TestHookFinalizeStartupReport(nil)
|
|
||||||
|
|
||||||
lastAttempt := time.Time{}
|
|
||||||
orch.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
|
|
||||||
if lastAttempt.IsZero() {
|
|
||||||
t.Fatalf("expected successful scheduling-storm trigger to update lastAttempt")
|
|
||||||
}
|
|
||||||
if !scanRan {
|
|
||||||
t.Fatalf("expected scheduling-storm scan to execute")
|
|
||||||
}
|
|
||||||
progress := readStartupProgress(t, orch)
|
|
||||||
if strings.Contains(progress, "quarantined scheduling storm workload") {
|
|
||||||
t.Fatalf("expected no-op scheduling-storm scan to avoid auto-heal output, payload=%s", progress)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSchedulingStormErrorMatrix runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormErrorMatrix(t *testing.T).
|
|
||||||
// Why: covers malformed/error response branches in the scheduling-storm scan so
|
|
||||||
// Ananke can surface precise diagnostics when the API itself is part of the problem.
|
|
||||||
func TestHookSchedulingStormErrorMatrix(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
name string
|
|
||||||
run func(context.Context, time.Duration, string, ...string) (string, error)
|
|
||||||
wantErr string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "pods-query-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, _ ...string) (string, error) {
|
|
||||||
if name == "kubectl" {
|
|
||||||
return "", errors.New("pods boom")
|
|
||||||
}
|
|
||||||
return "", nil
|
|
||||||
},
|
|
||||||
wantErr: "query pods for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "pods-decode-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get pods -A -o json") {
|
|
||||||
return "{", nil
|
|
||||||
}
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
},
|
|
||||||
wantErr: "decode pods for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "replicasets-query-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return "", errors.New("replicasets boom")
|
|
||||||
default:
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
},
|
|
||||||
wantErr: "query replicasets for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "replicasets-decode-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return "{", nil
|
|
||||||
default:
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
}
|
|
||||||
},
|
|
||||||
wantErr: "decode replicasets for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "events-query-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return "", errors.New("events boom")
|
|
||||||
default:
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
},
|
|
||||||
wantErr: "query events for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "events-decode-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return "{", nil
|
|
||||||
default:
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
}
|
|
||||||
},
|
|
||||||
wantErr: "decode events for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "workloads-query-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
||||||
return "", errors.New("workloads boom")
|
|
||||||
default:
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
},
|
|
||||||
wantErr: "query workloads for scheduling storm scan",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "workloads-decode-error",
|
|
||||||
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return `{"items":[]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
||||||
return "{", nil
|
|
||||||
default:
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
},
|
|
||||||
wantErr: "decode workloads for scheduling storm scan",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tc := range cases {
|
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, tc.run, tc.run)
|
|
||||||
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
|
|
||||||
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
|
||||||
t.Fatalf("expected error containing %q, got %v", tc.wantErr, err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookSchedulingStormScaleError runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookSchedulingStormScaleError(t *testing.T).
|
|
||||||
// Why: covers the final error path where Ananke detects a real storm but cannot
|
|
||||||
// scale the offending workload down.
|
|
||||||
func TestHookSchedulingStormScaleError(t *testing.T) {
|
|
||||||
now := time.Now().UTC().Format(time.RFC3339)
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
cfg.Startup.AutoQuarantineSchedulingStorms = true
|
|
||||||
cfg.Startup.SchedulingStormEventThreshold = 5
|
|
||||||
cfg.Startup.SchedulingStormWindowSeconds = 60
|
|
||||||
|
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
|
||||||
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}}]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
|
|
||||||
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}}]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
|
|
||||||
return `{"items":[{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45}]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
||||||
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}}]}`, nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
|
|
||||||
return "", errors.New("scale denied")
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
||||||
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
|
|
||||||
if err == nil || !strings.Contains(err.Error(), "scale scheduling storm workload ai/deployment/ollama to 0") {
|
|
||||||
t.Fatalf("expected scale error, got %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,222 +0,0 @@
|
|||||||
package orchestrator
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"os"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
|
||||||
)
|
|
||||||
|
|
||||||
// readStartupProgress runs one orchestration or CLI step.
|
|
||||||
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
|
|
||||||
// Why: startup helper tests need to inspect progress artifacts without reaching
|
|
||||||
// into internal package state from the top-level testing module.
|
|
||||||
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
|
|
||||||
t.Helper()
|
|
||||||
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("read startup progress: %v", err)
|
|
||||||
}
|
|
||||||
return string(payload)
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
|
|
||||||
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
|
|
||||||
// Why: keeps startup-scope and startup-Vault helper branches covered from the
|
|
||||||
// split top-level testing module required by the repo hygiene contract.
|
|
||||||
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
|
|
||||||
t.Run("startup-scope-helpers", func(t *testing.T) {
|
|
||||||
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
|
|
||||||
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
|
|
||||||
t.Fatalf("expected passthrough node list, got %v", got)
|
|
||||||
}
|
|
||||||
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
|
|
||||||
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
|
|
||||||
t.Fatalf("unexpected filtered node list: %v", got)
|
|
||||||
}
|
|
||||||
|
|
||||||
if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
|
|
||||||
t.Fatalf("expected trimmed node membership match")
|
|
||||||
}
|
|
||||||
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
|
|
||||||
t.Fatalf("expected blank node probe to be ignored")
|
|
||||||
}
|
|
||||||
|
|
||||||
cfg := lifecycleConfig(t)
|
|
||||||
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
||||||
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
|
|
||||||
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgScoped := lifecycleConfig(t)
|
|
||||||
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
|
|
||||||
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
|
|
||||||
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
|
|
||||||
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
|
|
||||||
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
|
|
||||||
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
|
|
||||||
t.Fatalf("expected control plane to remain strict")
|
|
||||||
}
|
|
||||||
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
|
|
||||||
t.Fatalf("expected inventory-scoped node to remain strict")
|
|
||||||
}
|
|
||||||
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
|
|
||||||
t.Fatalf("expected ssh-scoped node to remain strict")
|
|
||||||
}
|
|
||||||
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
|
|
||||||
t.Fatalf("expected non-core worker to stop being strict")
|
|
||||||
}
|
|
||||||
|
|
||||||
flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
|
|
||||||
if _, ok := flux["flux-system/core"]; !ok {
|
|
||||||
t.Fatalf("expected core flux kustomization in required set: %v", flux)
|
|
||||||
}
|
|
||||||
if _, ok := flux["flux-system/gitea"]; !ok {
|
|
||||||
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
|
|
||||||
}
|
|
||||||
|
|
||||||
namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
|
|
||||||
if _, ok := namespaces["vault"]; !ok {
|
|
||||||
t.Fatalf("expected vault namespace in required set: %v", namespaces)
|
|
||||||
}
|
|
||||||
if _, ok := namespaces["monitoring"]; !ok {
|
|
||||||
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("startup-vault-helpers", func(t *testing.T) {
|
|
||||||
t.Run("early-vault-unseal-paths", func(t *testing.T) {
|
|
||||||
cfgAPI := lifecycleConfig(t)
|
|
||||||
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
|
|
||||||
return "", errors.New("api down")
|
|
||||||
}
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
|
|
||||||
orchAPI.TestHookBeginStartupReport("startup-vault")
|
|
||||||
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
|
||||||
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
|
|
||||||
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgErr := lifecycleConfig(t)
|
|
||||||
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
||||||
return "v1.31.0", nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
|
||||||
return "", errors.New("phase probe failed")
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
|
|
||||||
orchErr.TestHookBeginStartupReport("startup-vault")
|
|
||||||
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
|
||||||
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
|
|
||||||
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgDeferred := lifecycleConfig(t)
|
|
||||||
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
||||||
return "v1.31.0", nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
|
||||||
return "Pending", nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
|
|
||||||
orchDeferred.TestHookBeginStartupReport("startup-vault")
|
|
||||||
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
|
||||||
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
|
|
||||||
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgSuccess := lifecycleConfig(t)
|
|
||||||
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
||||||
return "v1.31.0", nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
|
||||||
return "Running", nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
|
|
||||||
return `{"sealed":false,"initialized":true}`, nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
|
|
||||||
orchSuccess.TestHookBeginStartupReport("startup-vault")
|
|
||||||
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
|
||||||
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
|
|
||||||
t.Fatalf("expected successful early vault check, payload=%s", payload)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("startup-vault-gate-paths", func(t *testing.T) {
|
|
||||||
cfgErr := lifecycleConfig(t)
|
|
||||||
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
|
||||||
return "", errors.New("phase probe failed")
|
|
||||||
}
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
|
|
||||||
orchErr.TestHookBeginStartupReport("startup-vault")
|
|
||||||
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
|
|
||||||
t.Fatalf("expected startup vault gate error, got %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgDeferred := lifecycleConfig(t)
|
|
||||||
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
|
||||||
return "Pending", nil
|
|
||||||
}
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
|
|
||||||
orchDeferred.TestHookBeginStartupReport("startup-vault")
|
|
||||||
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
|
|
||||||
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
|
|
||||||
}
|
|
||||||
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
|
|
||||||
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
|
|
||||||
}
|
|
||||||
|
|
||||||
cfgSuccess := lifecycleConfig(t)
|
|
||||||
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
||||||
command := name + " " + strings.Join(args, " ")
|
|
||||||
switch {
|
|
||||||
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
|
||||||
return "Running", nil
|
|
||||||
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
|
|
||||||
return `{"sealed":false,"initialized":true}`, nil
|
|
||||||
default:
|
|
||||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
|
|
||||||
orchSuccess.TestHookBeginStartupReport("startup-vault")
|
|
||||||
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
|
|
||||||
t.Fatalf("expected successful startup vault gate, got %v", err)
|
|
||||||
}
|
|
||||||
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
|
|
||||||
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@ -24,36 +24,12 @@ func TestStateTestHookOverrideSetters(t *testing.T) {
|
|||||||
}
|
}
|
||||||
restoreWriteNil()
|
restoreWriteNil()
|
||||||
|
|
||||||
restoreReadNil := state.TestHookSetReadIntentOverride(nil)
|
|
||||||
readAfterNil, err := state.ReadIntent(intentPath)
|
|
||||||
if err != nil || readAfterNil.State != state.IntentNormal {
|
|
||||||
t.Fatalf("expected default read intent path after nil override, got %v / %v", readAfterNil, err)
|
|
||||||
}
|
|
||||||
restoreReadNil()
|
|
||||||
|
|
||||||
readOverrideCalled := false
|
|
||||||
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
|
|
||||||
readOverrideCalled = true
|
|
||||||
return state.Intent{}, errors.New("forced read override")
|
|
||||||
})
|
|
||||||
_, err = state.ReadIntent(intentPath)
|
|
||||||
if err == nil || !strings.Contains(err.Error(), "forced read override") {
|
|
||||||
t.Fatalf("expected forced read override error, got %v", err)
|
|
||||||
}
|
|
||||||
if !readOverrideCalled {
|
|
||||||
t.Fatalf("expected read override to be invoked")
|
|
||||||
}
|
|
||||||
restoreRead()
|
|
||||||
if _, err := state.TestHookReadIntentDefault(intentPath); err != nil {
|
|
||||||
t.Fatalf("expected explicit default read helper to succeed, got %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
writeOverrideCalled := false
|
writeOverrideCalled := false
|
||||||
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
||||||
writeOverrideCalled = true
|
writeOverrideCalled = true
|
||||||
return errors.New("forced write override")
|
return errors.New("forced write override")
|
||||||
})
|
})
|
||||||
err = state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
|
err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
|
||||||
if err == nil || !strings.Contains(err.Error(), "forced write override") {
|
if err == nil || !strings.Contains(err.Error(), "forced write override") {
|
||||||
t.Fatalf("expected forced write override error, got %v", err)
|
t.Fatalf("expected forced write override error, got %v", err)
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user