ananke: refactor orchestrator, enforce quality gates, and harden startup checks

2026-04-09 01:38:06 -03:00 · 2026-04-09 01:38:06 -03:00 · c2c79e5821
commit c2c79e5821
parent baead1426e
51 changed files with 3677 additions and 176 deletions
--- a/19
+++ b/19
@ -1,4 +1,4 @@
-.PHONY: build test fmt tidy install drill-list drill-run
+.PHONY: build test test-all quality-gate hygiene lint coverage-report coverage-gate fmt tidy install drill-list drill-run
 build:
 	go build -o dist/ananke ./cmd/ananke
@ -6,6 +6,23 @@ build:
 test:
 	go test ./...
 test-all: test hygiene lint coverage-report
 quality-gate:
 	./scripts/quality_gate.sh
 hygiene:
 	cd testing && go test ./hygiene
 lint:
 	./scripts/lint.sh
 coverage-report:
 	cd testing && go test ./coverage -run TestPerFileCoverageReport -count=1 -v
 coverage-gate:
 	cd testing && ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
 fmt:
 	gofmt -w ./cmd ./internal
--- a/configs/ananke.example.yaml
+++ b/configs/ananke.example.yaml
@ -48,6 +48,9 @@ startup:
  api_poll_seconds: 2
  shutdown_cooldown_seconds: 45
  minimum_battery_percent: 20
  require_node_inventory_reachability: true
  node_inventory_reachability_wait_seconds: 300
  node_inventory_reachability_poll_seconds: 5
  required_node_labels:
    titan-09:
      ananke.bstein.dev/harbor-bootstrap: "true"
@ -78,6 +81,15 @@ startup:
  service_checklist_wait_seconds: 420
  service_checklist_poll_seconds: 5
  service_checklist_stability_seconds: 120
  service_checklist_auth:
    mode: keycloak_robotuser
    keycloak_base_url: https://sso.bstein.dev
    realm: atlas
    robot_username: robotuser
    admin_secret_namespace: sso
    admin_secret_name: keycloak-admin
    admin_secret_username_key: username
    admin_secret_password_key: password
  service_checklist:
    - name: gitea-api
      url: https://scm.bstein.dev/api/healthz
@ -99,10 +111,20 @@ startup:
      accepted_statuses: [401]
      body_contains: unauthorized
      timeout_seconds: 12
-    - name: longhorn-auth
+    - name: longhorn-api-user-session
-      url: https://longhorn.bstein.dev/
+      url: https://longhorn.bstein.dev/v1
-      accepted_statuses: [200, 302]
+      accepted_statuses: [200]
      require_robot_auth: true
      follow_redirects: true
      final_url_contains: /v1
      final_url_not_contains: /oauth2/sign_in
      body_contains: '"id":"v1"'
      timeout_seconds: 12
  require_critical_service_endpoints: true
  critical_service_endpoint_wait_seconds: 420
  critical_service_endpoint_poll_seconds: 5
  critical_service_endpoints:
    - monitoring/victoria-metrics-single-server
  require_ingress_checklist: true
  ingress_checklist_wait_seconds: 420
  ingress_checklist_poll_seconds: 5
@ -139,10 +161,6 @@ shutdown:
  drain_parallelism: 6
  scale_parallelism: 8
  ssh_parallelism: 8
  poweroff_enabled: false
  poweroff_delay_seconds: 25
  poweroff_local_host: false
  extra_poweroff_hosts: []
 ups:
  enabled: true
  provider: nut
@ -170,6 +188,7 @@ metrics:
  path: /metrics
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
  run_history_path: /var/lib/ananke/runs.json
  lock_path: /var/lib/ananke/ananke.lock
  intent_path: /var/lib/ananke/intent.json
--- a/configs/ananke.tethys.yaml
+++ b/configs/ananke.tethys.yaml
@ -114,6 +114,9 @@ startup:
  api_poll_seconds: 2
  shutdown_cooldown_seconds: 45
  minimum_battery_percent: 20
  require_node_inventory_reachability: true
  node_inventory_reachability_wait_seconds: 300
  node_inventory_reachability_poll_seconds: 5
  required_node_labels:
    titan-09:
      ananke.bstein.dev/harbor-bootstrap: "true"
@ -144,6 +147,15 @@ startup:
  service_checklist_wait_seconds: 420
  service_checklist_poll_seconds: 5
  service_checklist_stability_seconds: 120
  service_checklist_auth:
    mode: keycloak_robotuser
    keycloak_base_url: https://sso.bstein.dev
    realm: atlas
    robot_username: robotuser
    admin_secret_namespace: sso
    admin_secret_name: keycloak-admin
    admin_secret_username_key: username
    admin_secret_password_key: password
  service_checklist:
    - name: gitea-api
      url: https://scm.bstein.dev/api/healthz
@ -165,10 +177,20 @@ startup:
      accepted_statuses: [401]
      body_contains: unauthorized
      timeout_seconds: 12
-    - name: longhorn-auth
+    - name: longhorn-api-user-session
-      url: https://longhorn.bstein.dev/
+      url: https://longhorn.bstein.dev/v1
-      accepted_statuses: [200, 302]
+      accepted_statuses: [200]
      require_robot_auth: true
      follow_redirects: true
      final_url_contains: /v1
      final_url_not_contains: /oauth2/sign_in
      body_contains: '"id":"v1"'
      timeout_seconds: 12
  require_critical_service_endpoints: true
  critical_service_endpoint_wait_seconds: 420
  critical_service_endpoint_poll_seconds: 5
  critical_service_endpoints:
    - monitoring/victoria-metrics-single-server
  require_ingress_checklist: true
  ingress_checklist_wait_seconds: 420
  ingress_checklist_poll_seconds: 5
@ -205,10 +227,6 @@ shutdown:
  drain_parallelism: 6
  scale_parallelism: 8
  ssh_parallelism: 8
  poweroff_enabled: false
  poweroff_delay_seconds: 25
  poweroff_local_host: false
  extra_poweroff_hosts: []
 ups:
  enabled: true
  provider: nut
@ -236,6 +254,7 @@ metrics:
  path: /metrics
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
  run_history_path: /var/lib/ananke/runs.json
  lock_path: /var/lib/ananke/ananke.lock
  intent_path: /var/lib/ananke/intent.json
--- a/configs/ananke.titan-db.yaml
+++ b/configs/ananke.titan-db.yaml
@ -114,6 +114,9 @@ startup:
  api_poll_seconds: 2
  shutdown_cooldown_seconds: 45
  minimum_battery_percent: 20
  require_node_inventory_reachability: true
  node_inventory_reachability_wait_seconds: 300
  node_inventory_reachability_poll_seconds: 5
  required_node_labels:
    titan-09:
      ananke.bstein.dev/harbor-bootstrap: "true"
@ -144,6 +147,15 @@ startup:
  service_checklist_wait_seconds: 420
  service_checklist_poll_seconds: 5
  service_checklist_stability_seconds: 120
  service_checklist_auth:
    mode: keycloak_robotuser
    keycloak_base_url: https://sso.bstein.dev
    realm: atlas
    robot_username: robotuser
    admin_secret_namespace: sso
    admin_secret_name: keycloak-admin
    admin_secret_username_key: username
    admin_secret_password_key: password
  service_checklist:
    - name: gitea-api
      url: https://scm.bstein.dev/api/healthz
@ -165,10 +177,20 @@ startup:
      accepted_statuses: [401]
      body_contains: unauthorized
      timeout_seconds: 12
-    - name: longhorn-auth
+    - name: longhorn-api-user-session
-      url: https://longhorn.bstein.dev/
+      url: https://longhorn.bstein.dev/v1
-      accepted_statuses: [200, 302]
+      accepted_statuses: [200]
      require_robot_auth: true
      follow_redirects: true
      final_url_contains: /v1
      final_url_not_contains: /oauth2/sign_in
      body_contains: '"id":"v1"'
      timeout_seconds: 12
  require_critical_service_endpoints: true
  critical_service_endpoint_wait_seconds: 420
  critical_service_endpoint_poll_seconds: 5
  critical_service_endpoints:
    - monitoring/victoria-metrics-single-server
  require_ingress_checklist: true
  ingress_checklist_wait_seconds: 420
  ingress_checklist_poll_seconds: 5
@ -205,10 +227,6 @@ shutdown:
  drain_parallelism: 6
  scale_parallelism: 8
  ssh_parallelism: 8
  poweroff_enabled: false
  poweroff_delay_seconds: 25
  poweroff_local_host: false
  extra_poweroff_hosts: []
 ups:
  enabled: true
  provider: nut
@ -236,6 +254,7 @@ metrics:
  path: /metrics
 state:
  dir: /var/lib/ananke
  reports_dir: /var/lib/ananke/reports
  run_history_path: /var/lib/ananke/runs.json
  lock_path: /var/lib/ananke/ananke.lock
  intent_path: /var/lib/ananke/intent.json
--- a/internal/cluster/orchestrator_service_auth.go
+++ b/internal/cluster/orchestrator_service_auth.go
@ -0,0 +1,286 @@
 package cluster
 import (
 	"context"
 	"crypto/tls"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"net/http/cookiejar"
 	neturl "net/url"
 	"strings"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 )
 // keycloakTokenResponse is the subset of the token endpoint JSON reply that
 // keycloakAdminToken decodes; only the access_token field is read.
 type keycloakTokenResponse struct {
 	AccessToken string `json:"access_token"`
 }
 // keycloakUser is one element of the user-lookup JSON array decoded by
 // keycloakRobotUserID; only the user id is read.
 type keycloakUser struct {
 	ID string `json:"id"`
 }
 // keycloakImpersonationResponse is the subset of the impersonation JSON reply
 // decoded by keycloakImpersonationRedirect; only the redirect URL is read.
 type keycloakImpersonationResponse struct {
 	Redirect string `json:"redirect"`
 }
 // kubernetesSecret is the subset of `kubectl get secret -o json` output decoded
 // by kubernetesSecretValue. Data maps each secret key to its value, which
 // kubernetesSecretValue base64-decodes before use.
 type kubernetesSecret struct {
 	Data map[string]string `json:"data"`
 }
 // checklistAuthHTTPClient builds a cookie-jar HTTP client and bootstraps a robotuser session on it.
 // Signature: (o *Orchestrator) checklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error).
 // Why: startup checklist checks that require real user behavior need an
 // authenticated robotuser browser-like session before probing service pages.
 //
 // timeout becomes the client's per-request Timeout; insecureSkipTLS disables
 // certificate verification on the client's dedicated transport. An error is
 // returned when the cookie jar cannot be created or when
 // authenticateRobotChecklistSession fails. On that failure the transport's
 // idle connections are closed before returning, because the client is
 // discarded and nothing else could release them.
 func (o *Orchestrator) checklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error) {
 	jar, err := cookiejar.New(nil)
 	if err != nil {
 		return nil, fmt.Errorf("create cookie jar: %w", err)
 	}
 	// A zero-value Transport keeps idle connections open indefinitely. Every
 	// call creates its own transport, so bound how long pooled connections live.
 	transport := &http.Transport{IdleConnTimeout: 90 * time.Second}
 	if insecureSkipTLS {
 		transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
 	}
 	client := &http.Client{
 		Timeout:   timeout,
 		Transport: transport,
 		Jar:       jar,
 	}
 	if err := o.authenticateRobotChecklistSession(ctx, client); err != nil {
 		// The client never reaches the caller; release its pooled connections now.
 		transport.CloseIdleConnections()
 		return nil, err
 	}
 	return client, nil
 }
 // authenticateRobotChecklistSession logs the given client in as the configured robot user via Keycloak impersonation.
 // Signature: (o *Orchestrator) authenticateRobotChecklistSession(ctx context.Context, client *http.Client) error.
 // Why: authenticated checklist probes must reflect what a human sees after
 // Keycloak login, not only pre-auth redirects.
 //
 // Steps: validate the configured auth mode, read admin credentials, obtain an
 // admin token, resolve the robot user id, request impersonation, then GET the
 // returned redirect URL (or the realm account page when none is returned) with
 // the same client. An error from any step is returned to the caller.
 // NOTE(review): the status of the final GET is not inspected; only transport
 // errors fail this step.
 func (o *Orchestrator) authenticateRobotChecklistSession(ctx context.Context, client *http.Client) error {
 	auth := o.cfg.Startup.ServiceChecklistAuth
 	switch mode := strings.TrimSpace(auth.Mode); mode {
 	case "keycloak_robotuser":
 		// Supported mode; continue with the impersonation flow below.
 	case "", "none":
 		return fmt.Errorf("startup checklist auth mode is disabled")
 	default:
 		return fmt.Errorf("unsupported startup checklist auth mode %q", mode)
 	}
 	adminUser, adminPassword, err := o.keycloakAdminCredentials(ctx, auth)
 	if err != nil {
 		return err
 	}
 	adminToken, err := o.keycloakAdminToken(ctx, client, auth, adminUser, adminPassword)
 	if err != nil {
 		return err
 	}
 	robotUserID, err := o.keycloakRobotUserID(ctx, client, auth, adminToken)
 	if err != nil {
 		return err
 	}
 	redirectURL, err := o.keycloakImpersonationRedirect(ctx, client, auth, adminToken, robotUserID)
 	if err != nil {
 		return err
 	}
 	if strings.TrimSpace(redirectURL) == "" {
 		// Escape the realm so characters such as '/', '?' or '#' cannot change
 		// which path the fallback request targets.
 		realm := neturl.PathEscape(strings.TrimSpace(auth.Realm))
 		redirectURL = keycloakBaseURL(auth) + "/realms/" + realm + "/account/"
 	}
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, redirectURL, nil)
 	if err != nil {
 		return fmt.Errorf("build robot redirect request: %w", err)
 	}
 	req.Header.Set("User-Agent", "ananke/startup-checklist")
 	resp, err := client.Do(req)
 	if err != nil {
 		return fmt.Errorf("initialize robot session redirect: %w", err)
 	}
 	defer resp.Body.Close()
 	// The page content is irrelevant here; read at most 1 KiB and discard it.
 	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1024))
 	return nil
 }
 // keycloakAdminCredentials reads the Keycloak admin username and password from the configured Kubernetes secret.
 // Signature: (o *Orchestrator) keycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error).
 // Why: robotuser impersonation uses a cluster-managed admin secret so startup
 // checks do not rely on interactive credentials.
 //
 // Returns (username, password, nil) on success. Each value is fetched with its
 // own kubernetesSecretValue call; the first failure is wrapped with the secret's
 // namespace/name and returned with empty strings.
 func (o *Orchestrator) keycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error) {
 	secretNamespace := strings.TrimSpace(auth.AdminSecretNamespace)
 	secretName := strings.TrimSpace(auth.AdminSecretName)
 	adminUser, err := o.kubernetesSecretValue(ctx, secretNamespace, secretName, strings.TrimSpace(auth.AdminSecretUsernameKey))
 	if err != nil {
 		return "", "", fmt.Errorf("read keycloak admin username from secret %s/%s: %w", secretNamespace, secretName, err)
 	}
 	adminPassword, err := o.kubernetesSecretValue(ctx, secretNamespace, secretName, strings.TrimSpace(auth.AdminSecretPasswordKey))
 	if err != nil {
 		return "", "", fmt.Errorf("read keycloak admin password from secret %s/%s: %w", secretNamespace, secretName, err)
 	}
 	return adminUser, adminPassword, nil
 }
 // kubernetesSecretValue fetches one key of a Kubernetes secret and returns its decoded, trimmed value.
 // Signature: (o *Orchestrator) kubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error).
 // Why: checklist auth depends on secret-backed credentials and should decode
 // them directly from Kubernetes rather than shelling out to external tools.
 //
 // The secret is read as JSON through o.kubectl with a 25 second timeout. An
 // error is returned when the kubectl call fails, the JSON cannot be decoded,
 // the key is absent, the value is not valid base64, or the decoded value is
 // empty after trimming whitespace.
 func (o *Orchestrator) kubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error) {
 	raw, err := o.kubectl(ctx, 25*time.Second, "-n", namespace, "get", "secret", name, "-o", "json")
 	if err != nil {
 		return "", fmt.Errorf("kubectl get secret: %w", err)
 	}
 	secret := kubernetesSecret{}
 	if err := json.Unmarshal([]byte(raw), &secret); err != nil {
 		return "", fmt.Errorf("decode secret json: %w", err)
 	}
 	encoded, found := secret.Data[key]
 	if !found {
 		return "", fmt.Errorf("key %q not present in secret", key)
 	}
 	plain, err := base64.StdEncoding.DecodeString(strings.TrimSpace(encoded))
 	if err != nil {
 		return "", fmt.Errorf("decode base64 secret value: %w", err)
 	}
 	if value := strings.TrimSpace(string(plain)); value != "" {
 		return value, nil
 	}
 	return "", fmt.Errorf("decoded value is empty")
 }
 // keycloakAdminToken requests an admin access token from the master realm using the password grant.
 // Signature: (o *Orchestrator) keycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error).
 // Why: admin API access is needed to impersonate robotuser for deterministic
 // user-journey checks across OIDC-gated services.
 //
 // The request is a form POST with client_id "admin-cli". At most 64 KiB of the
 // response is read. A non-2xx status is reported with a compacted body summary;
 // a body read failure on a 2xx response is reported as its own error instead
 // of surfacing later as a JSON decode failure. An empty access_token is an error.
 func (o *Orchestrator) keycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error) {
 	form := neturl.Values{}
 	form.Set("grant_type", "password")
 	form.Set("client_id", "admin-cli")
 	form.Set("username", adminUser)
 	form.Set("password", adminPassword)
 	tokenURL := keycloakBaseURL(auth) + "/realms/master/protocol/openid-connect/token"
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, tokenURL, strings.NewReader(form.Encode()))
 	if err != nil {
 		return "", fmt.Errorf("build admin token request: %w", err)
 	}
 	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
 	req.Header.Set("User-Agent", "ananke/startup-checklist")
 	resp, err := client.Do(req)
 	if err != nil {
 		return "", fmt.Errorf("request admin token: %w", err)
 	}
 	defer resp.Body.Close()
 	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
 	if resp.StatusCode/100 != 2 {
 		// The status failure wins; whatever part of the body was read is only a hint.
 		return "", fmt.Errorf("admin token request failed status=%d body=%q", resp.StatusCode, compactHTTPBody(body))
 	}
 	if readErr != nil {
 		return "", fmt.Errorf("read admin token response: %w", readErr)
 	}
 	var payload keycloakTokenResponse
 	if err := json.Unmarshal(body, &payload); err != nil {
 		return "", fmt.Errorf("decode admin token response: %w", err)
 	}
 	token := strings.TrimSpace(payload.AccessToken)
 	if token == "" {
 		return "", fmt.Errorf("admin token response missing access_token")
 	}
 	return token, nil
 }
 // keycloakRobotUserID looks up the configured robot username in the realm and returns its user id.
 // Signature: (o *Orchestrator) keycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error).
 // Why: impersonation requires the concrete user id and should fail fast when
 // robotuser is missing from the realm.
 //
 // The lookup is an exact-match username query authorized with adminToken. At
 // most 64 KiB of the response is read. A non-2xx status, a body read failure,
 // undecodable JSON, an empty result list, or a first result with a blank id
 // all produce an error.
 func (o *Orchestrator) keycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error) {
 	base := keycloakBaseURL(auth)
 	realm := strings.TrimSpace(auth.Realm)
 	username := strings.TrimSpace(auth.RobotUsername)
 	query := neturl.Values{}
 	query.Set("username", username)
 	query.Set("exact", "true")
 	// Escape the realm so characters such as '/', '?' or '#' cannot alter the
 	// request path or leak into the query string.
 	usersURL := base + "/admin/realms/" + neturl.PathEscape(realm) + "/users?" + query.Encode()
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, usersURL, nil)
 	if err != nil {
 		return "", fmt.Errorf("build robot user lookup request: %w", err)
 	}
 	req.Header.Set("Authorization", "Bearer "+adminToken)
 	req.Header.Set("User-Agent", "ananke/startup-checklist")
 	resp, err := client.Do(req)
 	if err != nil {
 		return "", fmt.Errorf("lookup robot user: %w", err)
 	}
 	defer resp.Body.Close()
 	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
 	if resp.StatusCode/100 != 2 {
 		// The status failure wins; whatever part of the body was read is only a hint.
 		return "", fmt.Errorf("robot user lookup failed status=%d body=%q", resp.StatusCode, compactHTTPBody(body))
 	}
 	if readErr != nil {
 		return "", fmt.Errorf("read robot user lookup response: %w", readErr)
 	}
 	var users []keycloakUser
 	if err := json.Unmarshal(body, &users); err != nil {
 		return "", fmt.Errorf("decode robot user lookup response: %w", err)
 	}
 	if len(users) == 0 || strings.TrimSpace(users[0].ID) == "" {
 		return "", fmt.Errorf("robot user %q not found in realm %q", username, realm)
 	}
 	return strings.TrimSpace(users[0].ID), nil
 }
 // keycloakImpersonationRedirect asks Keycloak to impersonate the robot user and returns the redirect URL from the reply.
 // Signature: (o *Orchestrator) keycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error).
 // Why: opening a real impersonated browser session guarantees checks evaluate
 // post-login app behavior instead of only auth-gateway redirects.
 //
 // The request is a body-less POST authorized with adminToken. At most 64 KiB
 // of the response is read. A non-2xx status, a body read failure, or
 // undecodable JSON produce an error. The returned redirect is trimmed and may
 // be empty when the reply carries none.
 func (o *Orchestrator) keycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error) {
 	base := keycloakBaseURL(auth)
 	// Escape both dynamic path segments so characters such as '/', '?' or '#'
 	// cannot change which admin endpoint is addressed.
 	realm := neturl.PathEscape(strings.TrimSpace(auth.Realm))
 	userID := neturl.PathEscape(strings.TrimSpace(robotUserID))
 	impersonateURL := base + "/admin/realms/" + realm + "/users/" + userID + "/impersonation"
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, impersonateURL, http.NoBody)
 	if err != nil {
 		return "", fmt.Errorf("build robot impersonation request: %w", err)
 	}
 	req.Header.Set("Authorization", "Bearer "+adminToken)
 	req.Header.Set("User-Agent", "ananke/startup-checklist")
 	resp, err := client.Do(req)
 	if err != nil {
 		return "", fmt.Errorf("request robot impersonation: %w", err)
 	}
 	defer resp.Body.Close()
 	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
 	if resp.StatusCode/100 != 2 {
 		// The status failure wins; whatever part of the body was read is only a hint.
 		return "", fmt.Errorf("robot impersonation failed status=%d body=%q", resp.StatusCode, compactHTTPBody(body))
 	}
 	if readErr != nil {
 		return "", fmt.Errorf("read robot impersonation response: %w", readErr)
 	}
 	var payload keycloakImpersonationResponse
 	if err := json.Unmarshal(body, &payload); err != nil {
 		return "", fmt.Errorf("decode robot impersonation response: %w", err)
 	}
 	return strings.TrimSpace(payload.Redirect), nil
 }
 // keycloakBaseURL returns the configured Keycloak base URL without surrounding whitespace or trailing slashes.
 // Signature: keycloakBaseURL(auth config.ServiceChecklistAuthSettings) string.
 // Why: centralizing URL normalization keeps auth request construction stable.
 func keycloakBaseURL(auth config.ServiceChecklistAuthSettings) string {
 	base := strings.TrimSpace(auth.KeycloakBaseURL)
 	return strings.TrimRight(base, "/")
 }
 // compactHTTPBody collapses every run of whitespace in raw into a single space and drops leading/trailing whitespace.
 // Signature: compactHTTPBody(raw []byte) string.
 // Why: checklist auth errors should include a readable body summary without
 // leaking multi-line payload noise into orchestrator logs.
 //
 // Returns "" when raw is empty or contains only whitespace.
 func compactHTTPBody(raw []byte) string {
 	words := strings.Fields(string(raw))
 	if len(words) == 0 {
 		return ""
 	}
 	return strings.Join(words, " ")
 }
--- a/internal/cluster/orchestrator_service_stability.go
+++ b/internal/cluster/orchestrator_service_stability.go
@ -184,6 +184,16 @@ func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.Servi
 		return false, fmt.Sprintf("location header contained forbidden marker %q", locationNotContains)
 	}
 	finalURLContains := strings.TrimSpace(check.FinalURLContains)
 	if finalURLContains != "" && !checklistContains(result.FinalURL, finalURLContains) {
 		return false, fmt.Sprintf("final url missing expected marker %q", finalURLContains)
 	}
 	finalURLNotContains := strings.TrimSpace(check.FinalURLNotContains)
 	if finalURLNotContains != "" && checklistContains(result.FinalURL, finalURLNotContains) {
 		return false, fmt.Sprintf("final url contained forbidden marker %q", finalURLNotContains)
 	}
 	bodyContains := strings.TrimSpace(check.BodyContains)
 	if bodyContains != "" && !checklistContains(result.Body, bodyContains) {
 		return false, fmt.Sprintf("response missing expected marker %q", bodyContains)
@ -201,6 +211,7 @@ type checklistHTTPProbeResult struct {
 	Status   int
 	Body     string
 	Location string
 	FinalURL string
 }
 // httpChecklistProbeResult runs one orchestration or CLI step.
@ -209,13 +220,14 @@ type checklistHTTPProbeResult struct {
 // addition to status/body so startup can validate real user-facing behavior.
 func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) {
 	result := checklistHTTPProbeResult{}
-	status, body, location, err := o.httpChecklistProbeWithLocation(ctx, check)
+	status, body, location, finalURL, err := o.httpChecklistProbeWithLocation(ctx, check)
 	if err != nil {
 		return result, err
 	}
 	result.Status = status
 	result.Body = body
 	result.Location = location
 	result.FinalURL = finalURL
 	return result, nil
 }
@ -223,50 +235,66 @@ func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check confi
 // Signature: (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) {
-	status, body, _, err := o.httpChecklistProbeWithLocation(ctx, check)
+	status, body, _, _, err := o.httpChecklistProbeWithLocation(ctx, check)
 	return status, body, err
 }
 // httpChecklistProbeWithLocation runs one orchestration or CLI step.
-// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error).
+// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error).
 // Why: redirects and auth gates require location-header assertions to prevent
 // startup false-positives on partially healthy protected services.
-func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) {
+func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error) {
 	timeout := time.Duration(check.TimeoutSeconds) * time.Second
 	if timeout <= 0 {
 		timeout = 12 * time.Second
 	}
 	followRedirects := check.FollowRedirects || check.RequireRobotAuth
 	var client *http.Client
 	if check.RequireRobotAuth {
 		authClient, authErr := o.checklistAuthHTTPClient(ctx, timeout, check.InsecureSkipTLS)
 		if authErr != nil {
 			return 0, "", "", "", fmt.Errorf("initialize robotuser checklist session: %w", authErr)
 		}
 		client = authClient
 	} else {
 		transport := &http.Transport{}
 		if check.InsecureSkipTLS {
 			transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
 		}
-	client := &http.Client{
+		client = &http.Client{
 			Timeout:   timeout,
 			Transport: transport,
-		CheckRedirect: func(_ *http.Request, _ []*http.Request) error {
+		}
 	}
 	if !followRedirects {
 		client.CheckRedirect = func(_ *http.Request, _ []*http.Request) error {
 			return http.ErrUseLastResponse
-		},
+		}
 	}
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil)
 	if err != nil {
-		return 0, "", "", fmt.Errorf("build request: %w", err)
+		return 0, "", "", "", fmt.Errorf("build request: %w", err)
 	}
 	req.Header.Set("User-Agent", "ananke/startup-checklist")
 	resp, err := client.Do(req)
 	if err != nil {
-		return 0, "", "", fmt.Errorf("request failed: %w", err)
+		return 0, "", "", "", fmt.Errorf("request failed: %w", err)
 	}
 	defer resp.Body.Close()
 	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
 	if readErr != nil {
-		return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr)
+		return resp.StatusCode, "", "", "", fmt.Errorf("read response body: %w", readErr)
 	}
-	return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil
+	finalURL := strings.TrimSpace(req.URL.String())
 	if resp.Request != nil && resp.Request.URL != nil {
 		finalURL = strings.TrimSpace(resp.Request.URL.String())
 	}
 	return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), finalURL, nil
 }
 // checklistContains runs one orchestration or CLI step.
--- a/internal/cluster/orchestrator_test.go
+++ b/internal/cluster/orchestrator_test.go
@ -329,6 +329,80 @@ func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
 	}
 }
 // TestServiceCheckReadyRequiresFinalURLContains runs one orchestration or CLI step.
 // Signature: TestServiceCheckReadyRequiresFinalURLContains(t *testing.T).
 // Why: authenticated user-journey checks depend on final URL assertions after
 // redirects complete, not only on initial response status.
 func TestServiceCheckReadyRequiresFinalURLContains(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if r.URL.Path == "/" {
 			http.Redirect(w, r, "/app/home", http.StatusFound)
 			return
 		}
 		if r.URL.Path == "/app/home" {
 			w.WriteHeader(http.StatusOK)
 			_, _ = w.Write([]byte("OpenSearch Dashboards"))
 			return
 		}
 		w.WriteHeader(http.StatusNotFound)
 	}))
 	defer srv.Close()
 	orch := &Orchestrator{
 		log: log.New(os.Stdout, "", 0),
 	}
 	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
 		Name:             "logging-ui-user-session",
 		URL:              srv.URL,
 		AcceptedStatuses: []int{200},
 		FollowRedirects:  true,
 		FinalURLContains: "/app/home",
 		BodyContains:     "OpenSearch Dashboards",
 		TimeoutSeconds:   5,
 	})
 	if !ok {
 		t.Fatalf("expected final-url-aware service check to pass, detail=%s", detail)
 	}
 }
 // TestServiceCheckReadyRejectsForbiddenFinalURLMarker runs one orchestration or CLI step.
 // Signature: TestServiceCheckReadyRejectsForbiddenFinalURLMarker(t *testing.T).
 // Why: user-session checks should fail when final URL indicates auth/login loop
 // instead of the expected post-login app route.
 func TestServiceCheckReadyRejectsForbiddenFinalURLMarker(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if r.URL.Path == "/" {
 			http.Redirect(w, r, "/oauth2/sign_in", http.StatusFound)
 			return
 		}
 		if r.URL.Path == "/oauth2/sign_in" {
 			w.WriteHeader(http.StatusOK)
 			_, _ = w.Write([]byte("sign in"))
 			return
 		}
 		w.WriteHeader(http.StatusNotFound)
 	}))
 	defer srv.Close()
 	orch := &Orchestrator{
 		log: log.New(os.Stdout, "", 0),
 	}
 	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
 		Name:                "logging-ui-user-session",
 		URL:                 srv.URL,
 		AcceptedStatuses:    []int{200},
 		FollowRedirects:     true,
 		FinalURLNotContains: "/oauth2/sign_in",
 		TimeoutSeconds:      5,
 	})
 	if ok {
 		t.Fatalf("expected forbidden final-url marker check to fail")
 	}
 	if !strings.Contains(detail, "final url contained forbidden marker") {
 		t.Fatalf("expected final-url forbidden marker detail, got %q", detail)
 	}
 }
 // TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step.
 // Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -385,59 +459,3 @@ func TestChecklistFailureHostUnknown(t *testing.T) {
 		t.Fatalf("expected empty host for unknown check, got %q", got)
 	}
 }
 // TestStuckVaultInitReasonDetectsHungInit verifies a long-running vault init container is flagged.
 // Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
 // Why: a Pending pod annotated for Vault agent injection whose vault-agent-init
 // container has been running for 10 minutes must yield "VaultInitStuck" when
 // the threshold passed to stuckVaultInitReason is 3 minutes.
 func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
 	pod.Metadata.Annotations = map[string]string{
 		"vault.hashicorp.com/agent-inject": "true",
 	}
 	pod.Status.InitContainerStatuses = []podContainerStatus{
 		{
 			Name: "vault-agent-init",
 			State: podContainerState{
 				Running: &podContainerRunningState{
 					// Started well before the 3 minute threshold used below.
 					StartedAt: time.Now().Add(-10 * time.Minute),
 				},
 			},
 		},
 	}
 	reason := stuckVaultInitReason(pod, 3*time.Minute)
 	if reason != "VaultInitStuck" {
 		t.Fatalf("expected VaultInitStuck reason, got %q", reason)
 	}
 }
 // TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods verifies no reason is reported for fresh inits or pods without injection.
 // Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
 // Why: stuckVaultInitReason must return "" both when the init container has
 // run for less than the threshold and when the agent-inject annotation is
 // "false", even if the init container has been running for a long time.
 func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
 	pod.Metadata.Annotations = map[string]string{
 		"vault.hashicorp.com/agent-inject": "true",
 	}
 	pod.Status.InitContainerStatuses = []podContainerStatus{
 		{
 			Name: "vault-agent-init",
 			State: podContainerState{
 				Running: &podContainerRunningState{
 					// Case 1: started 30 seconds ago, below the 3 minute threshold.
 					StartedAt: time.Now().Add(-30 * time.Second),
 				},
 			},
 		},
 	}
 	if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
 		t.Fatalf("expected no reason for fresh init, got %q", reason)
 	}
 	// Case 2: same pod, now past the threshold but with injection turned off.
 	pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
 	pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
 	if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
 		t.Fatalf("expected no reason for non-vault pod, got %q", reason)
 	}
 }
--- a/internal/cluster/orchestrator_vault_test.go
+++ b/internal/cluster/orchestrator_vault_test.go
@ -0,0 +1,62 @@
 package cluster
 import (
 	"testing"
 	"time"
 )
 // TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
 // Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
 	pod.Metadata.Annotations = map[string]string{
 		"vault.hashicorp.com/agent-inject": "true",
 	}
 	pod.Status.InitContainerStatuses = []podContainerStatus{
 		{
 			Name: "vault-agent-init",
 			State: podContainerState{
 				Running: &podContainerRunningState{
 					StartedAt: time.Now().Add(-10 * time.Minute),
 				},
 			},
 		},
 	}
 	reason := stuckVaultInitReason(pod, 3*time.Minute)
 	if reason != "VaultInitStuck" {
 		t.Fatalf("expected VaultInitStuck reason, got %q", reason)
 	}
 }
 // TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
 // Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
 	pod.Metadata.Annotations = map[string]string{
 		"vault.hashicorp.com/agent-inject": "true",
 	}
 	pod.Status.InitContainerStatuses = []podContainerStatus{
 		{
 			Name: "vault-agent-init",
 			State: podContainerState{
 				Running: &podContainerRunningState{
 					StartedAt: time.Now().Add(-30 * time.Second),
 				},
 			},
 		},
 	}
 	if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
 		t.Fatalf("expected no reason for fresh init, got %q", reason)
 	}
 	pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
 	pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
 	if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
 		t.Fatalf("expected no reason for non-vault pod, got %q", reason)
 	}
 }
--- a/internal/cluster/testing_hooks_auth.go
+++ b/internal/cluster/testing_hooks_auth.go
@ -0,0 +1,79 @@
 package cluster
 import (
 	"context"
 	"net/http"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 )
 // TestHookChecklistAuthHTTPClient forwards its arguments to the unexported checklistAuthHTTPClient method and returns that method's results unchanged.
 // Signature: (o *Orchestrator) TestHookChecklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error).
 // Why: exposes checklist auth client/session bootstrap internals to top-level tests.
 func (o *Orchestrator) TestHookChecklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error) {
 	return o.checklistAuthHTTPClient(ctx, timeout, insecureSkipTLS)
 }
 // TestHookAuthenticateRobotChecklistSession forwards ctx and client to the unexported authenticateRobotChecklistSession method and returns its error unchanged.
 // Signature: (o *Orchestrator) TestHookAuthenticateRobotChecklistSession(ctx context.Context, client *http.Client) error.
 // Why: exposes robotuser auth session internals to top-level tests.
 func (o *Orchestrator) TestHookAuthenticateRobotChecklistSession(ctx context.Context, client *http.Client) error {
 	return o.authenticateRobotChecklistSession(ctx, client)
 }
 // TestHookKubernetesSecretValue forwards namespace, name, and key to the unexported kubernetesSecretValue method and returns its results unchanged.
 // Signature: (o *Orchestrator) TestHookKubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error).
 // Why: exposes Kubernetes secret decode internals to top-level tests.
 func (o *Orchestrator) TestHookKubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error) {
 	return o.kubernetesSecretValue(ctx, namespace, name, key)
 }
 // TestHookKeycloakAdminCredentials forwards auth to the unexported keycloakAdminCredentials method and returns its two strings and error unchanged.
 // Signature: (o *Orchestrator) TestHookKeycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error).
 // Why: exposes secret-backed credential resolution internals to top-level tests.
 func (o *Orchestrator) TestHookKeycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error) {
 	return o.keycloakAdminCredentials(ctx, auth)
 }
 // TestHookKeycloakAdminToken forwards the client, auth settings, and admin credentials to the unexported keycloakAdminToken method and returns its results unchanged.
 // Signature: (o *Orchestrator) TestHookKeycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error).
 // Why: exposes Keycloak admin token acquisition internals to top-level tests.
 func (o *Orchestrator) TestHookKeycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error) {
 	return o.keycloakAdminToken(ctx, client, auth, adminUser, adminPassword)
 }
 // TestHookKeycloakRobotUserID forwards the client, auth settings, and admin token to the unexported keycloakRobotUserID method and returns its results unchanged.
 // Signature: (o *Orchestrator) TestHookKeycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error).
 // Why: exposes Keycloak robot-user lookup internals to top-level tests.
 func (o *Orchestrator) TestHookKeycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error) {
 	return o.keycloakRobotUserID(ctx, client, auth, adminToken)
 }
 // TestHookKeycloakImpersonationRedirect forwards the client, auth settings, admin token, and robot user ID to the unexported keycloakImpersonationRedirect method and returns its results unchanged.
 // Signature: (o *Orchestrator) TestHookKeycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error).
 // Why: exposes Keycloak impersonation internals to top-level tests.
 func (o *Orchestrator) TestHookKeycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error) {
 	return o.keycloakImpersonationRedirect(ctx, client, auth, adminToken, robotUserID)
 }
 // TestHookHTTPChecklistProbeWithLocation forwards check to the unexported httpChecklistProbeWithLocation method and returns its five results unchanged.
 // Signature: (o *Orchestrator) TestHookHTTPChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error).
 // Why: exposes redirect-aware checklist probe internals to top-level tests.
 func (o *Orchestrator) TestHookHTTPChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error) {
 	return o.httpChecklistProbeWithLocation(ctx, check)
 }
 // TestHookKeycloakBaseURL returns the result of the unexported keycloakBaseURL helper for auth.
 // Signature: TestHookKeycloakBaseURL(auth config.ServiceChecklistAuthSettings) string.
 // Why: exposes base URL normalizer helper to top-level tests.
 func TestHookKeycloakBaseURL(auth config.ServiceChecklistAuthSettings) string {
 	return keycloakBaseURL(auth)
 }
 // TestHookCompactHTTPBody returns the result of the unexported compactHTTPBody helper for raw.
 // Signature: TestHookCompactHTTPBody(raw []byte) string.
 // Why: exposes compact HTTP body helper to top-level tests.
 func TestHookCompactHTTPBody(raw []byte) string {
 	return compactHTTPBody(raw)
 }
--- a/internal/config/apply_defaults.go
+++ b/internal/config/apply_defaults.go
@ -97,6 +97,30 @@ func (c *Config) applyDefaults() {
 	if c.Startup.ServiceChecklistStabilitySec < 0 {
 		c.Startup.ServiceChecklistStabilitySec = 0
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.Mode) == "" {
 		c.Startup.ServiceChecklistAuth.Mode = "keycloak_robotuser"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.KeycloakBaseURL) == "" {
 		c.Startup.ServiceChecklistAuth.KeycloakBaseURL = "https://sso.bstein.dev"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.Realm) == "" {
 		c.Startup.ServiceChecklistAuth.Realm = "atlas"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.RobotUsername) == "" {
 		c.Startup.ServiceChecklistAuth.RobotUsername = "robotuser"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretNamespace) == "" {
 		c.Startup.ServiceChecklistAuth.AdminSecretNamespace = "sso"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretName) == "" {
 		c.Startup.ServiceChecklistAuth.AdminSecretName = "keycloak-admin"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretUsernameKey) == "" {
 		c.Startup.ServiceChecklistAuth.AdminSecretUsernameKey = "username"
 	}
 	if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
 		c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
 	}
 	c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
 	for i := range c.Startup.ServiceChecklist {
 		if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@ -207,6 +207,58 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
 	}
 }
 // TestValidateRejectsUnknownServiceChecklistAuthMode checks that Validate returns an error when the checklist auth mode is set to an unrecognized value.
 // Signature: TestValidateRejectsUnknownServiceChecklistAuthMode(t *testing.T).
 // Why: authenticated user-journey checklist gates should fail fast when auth
 // mode is invalid to avoid silent false-positive startup passes.
 func TestValidateRejectsUnknownServiceChecklistAuthMode(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ServiceChecklistAuth.Mode = "bad-mode"
 	if cfg.Validate() != nil {
 		return
 	}
 	t.Fatalf("expected validation error for invalid service checklist auth mode")
 }
 // TestValidateRejectsFinalURLMarkersWithoutRedirectFollow runs one orchestration or CLI step.
 // Signature: TestValidateRejectsFinalURLMarkersWithoutRedirectFollow(t *testing.T).
 // Why: final-url assertions only make sense when redirect following is enabled.
 func TestValidateRejectsFinalURLMarkersWithoutRedirectFollow(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
 		{
 			Name:             "bad-final-url",
 			URL:              "https://logs.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			FinalURLContains: "/app/home",
 			TimeoutSeconds:   12,
 		},
 	}
 	if err := cfg.Validate(); err == nil {
 		t.Fatalf("expected validation error for final_url_* markers without redirect follow")
 	}
 }
 // TestValidateRejectsRobotAuthCheckWhenAuthModeDisabled runs one orchestration or CLI step.
 // Signature: TestValidateRejectsRobotAuthCheckWhenAuthModeDisabled(t *testing.T).
 // Why: robot-auth checks must be blocked when checklist auth mode is disabled.
 func TestValidateRejectsRobotAuthCheckWhenAuthModeDisabled(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ServiceChecklistAuth.Mode = "none"
 	cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
 		{
 			Name:             "logs-ui",
 			URL:              "https://logs.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			RequireRobotAuth: true,
 			FollowRedirects:  true,
 			TimeoutSeconds:   12,
 		},
 	}
 	if err := cfg.Validate(); err == nil {
 		t.Fatalf("expected validation error for robot-auth checklist check when auth mode is none")
 	}
 }
 // TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step.
 // Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -291,8 +343,8 @@ func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) {
 	if _, ok := names["custom-smoke"]; !ok {
 		t.Fatalf("expected custom checklist entry to be preserved")
 	}
-	if _, ok := names["logging-oidc-redirect"]; !ok {
+	if _, ok := names["logging-ui-user-session"]; !ok {
-		t.Fatalf("expected default logging redirect check to be merged in")
+		t.Fatalf("expected default logging user-session check to be merged in")
 	}
 	if _, ok := names["vaultwarden-ui"]; !ok {
 		t.Fatalf("expected default vaultwarden check to be merged in")
--- a/internal/config/defaults.go
+++ b/internal/config/defaults.go
@ -81,6 +81,16 @@ func defaults() Config {
 			ServiceChecklistWaitSeconds:  420,
 			ServiceChecklistPollSeconds:  5,
 			ServiceChecklistStabilitySec: 120,
 			ServiceChecklistAuth: ServiceChecklistAuthSettings{
 				Mode:                   "keycloak_robotuser",
 				KeycloakBaseURL:        "https://sso.bstein.dev",
 				Realm:                  "atlas",
 				RobotUsername:          "robotuser",
 				AdminSecretNamespace:   "sso",
 				AdminSecretName:        "keycloak-admin",
 				AdminSecretUsernameKey: "username",
 				AdminSecretPasswordKey: "password",
 			},
 			ServiceChecklist:                defaultServiceChecklist(),
 			RequireCriticalServiceEndpoints: true,
 			CriticalServiceEndpointWaitSec:  420,
--- a/internal/config/startup_service_catalog.go
+++ b/internal/config/startup_service_catalog.go
@ -44,10 +44,12 @@ func defaultServiceChecklist() []ServiceChecklistCheck {
 			TimeoutSeconds:   12,
 		},
 		{
-			Name:             "auth-gateway-redirect",
+			Name:             "auth-gateway-user-session",
 			URL:              "https://auth.bstein.dev/",
-			AcceptedStatuses: []int{302},
+			AcceptedStatuses: []int{200},
-			LocationContains: "https://sso.bstein.dev/realms/atlas/",
+			RequireRobotAuth: true,
 			FollowRedirects:  true,
 			BodyContains:     "Authenticated",
 			TimeoutSeconds:   12,
 		},
 		{
@ -121,17 +123,32 @@ func defaultServiceChecklist() []ServiceChecklistCheck {
 			TimeoutSeconds:   12,
 		},
 		{
-			Name:             "logging-oidc-redirect",
+			Name:                "logging-ui-user-session",
 			URL:                 "https://logs.bstein.dev/",
-			AcceptedStatuses: []int{302},
+			AcceptedStatuses:    []int{200},
-			LocationContains: "client_id=logs",
+			RequireRobotAuth:    true,
 			FollowRedirects:     true,
 			FinalURLNotContains: "/protocol/openid-connect/auth",
 			BodyContains:        "OpenSearch Dashboards",
 			TimeoutSeconds:      12,
 		},
 		{
-			Name:             "longhorn-oidc-redirect",
+			Name:             "logging-api-user-session",
-			URL:              "https://longhorn.bstein.dev/",
+			URL:              "https://logs.bstein.dev/api/status",
-			AcceptedStatuses: []int{302},
+			AcceptedStatuses: []int{200},
-			LocationContains: "https://sso.bstein.dev/realms/atlas/",
+			RequireRobotAuth: true,
 			FollowRedirects:  true,
 			BodyContains:     "\"state\":\"green\"",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:                "longhorn-api-user-session",
 			URL:                 "https://longhorn.bstein.dev/v1",
 			AcceptedStatuses:    []int{200},
 			RequireRobotAuth:    true,
 			FollowRedirects:     true,
 			FinalURLNotContains: "/protocol/openid-connect/auth",
 			BodyContains:        "\"id\":\"v1\"",
 			TimeoutSeconds:      12,
 		},
 		{
@ -190,17 +207,24 @@ func defaultServiceChecklist() []ServiceChecklistCheck {
 			TimeoutSeconds:   12,
 		},
 		{
-			Name:             "sentinel-oidc-redirect",
+			Name:                "sentinel-user-session",
-			URL:              "https://sentinel.bstein.dev/",
+			URL:                 "https://sentinel.bstein.dev/healthz",
-			AcceptedStatuses: []int{302},
+			AcceptedStatuses:    []int{200},
-			LocationContains: "client_id=metis",
+			RequireRobotAuth:    true,
 			FollowRedirects:     true,
 			FinalURLNotContains: "/protocol/openid-connect/auth",
 			BodyContains:        "ok",
 			TimeoutSeconds:      12,
 		},
 		{
-			Name:             "keycloak-admin-redirect",
+			Name:                "keycloak-admin-user-session",
-			URL:              "https://sso.bstein.dev/",
+			URL:                 "https://sso.bstein.dev/admin/",
-			AcceptedStatuses: []int{302},
+			AcceptedStatuses:    []int{200},
-			LocationContains: "https://sso.bstein.dev/admin/",
+			RequireRobotAuth:    true,
 			FollowRedirects:     true,
 			FinalURLContains:    "/admin/master/console/",
 			FinalURLNotContains: "/login-actions/authenticate",
 			BodyContains:        "Keycloak Administration Console",
 			TimeoutSeconds:      12,
 		},
 		{
@ -253,23 +277,23 @@ func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) [
 		return out
 	}
-	byName := map[string]struct{}{}
+	defaultByName := map[string]struct{}{}
 	for _, check := range existing {
 		name := strings.TrimSpace(check.Name)
 		if name == "" {
 			continue
 		}
 		byName[name] = struct{}{}
 	}
 	out := make([]ServiceChecklistCheck, 0, len(existing)+len(defaults))
 	out = append(out, existing...)
 	for _, check := range defaults {
 		name := strings.TrimSpace(check.Name)
 		if name == "" {
 			continue
 		}
-		if _, exists := byName[name]; exists {
+		defaultByName[name] = struct{}{}
 	}
 	out := make([]ServiceChecklistCheck, 0, len(defaults)+len(existing))
 	out = append(out, defaults...)
 	for _, check := range existing {
 		name := strings.TrimSpace(check.Name)
 		if name == "" {
 			continue
 		}
 		if _, exists := defaultByName[name]; exists {
 			continue
 		}
 		out = append(out, check)
--- a/internal/config/testing_hooks.go
+++ b/internal/config/testing_hooks.go
@ -0,0 +1,33 @@
 package config
 // TestHookDefaultServiceChecklist returns a freshly allocated copy of the slice produced by defaultServiceChecklist.
 // Signature: TestHookDefaultServiceChecklist() []ServiceChecklistCheck.
 // Why: exposes default service checklist catalog to top-level tests.
 func TestHookDefaultServiceChecklist() []ServiceChecklistCheck {
 	catalog := defaultServiceChecklist()
 	snapshot := make([]ServiceChecklistCheck, len(catalog))
 	copy(snapshot, catalog)
 	return snapshot
 }
 // TestHookDefaultCriticalServiceEndpoints returns a freshly allocated copy of the slice produced by defaultCriticalServiceEndpoints.
 // Signature: TestHookDefaultCriticalServiceEndpoints() []string.
 // Why: exposes default critical endpoint catalog to top-level tests.
 func TestHookDefaultCriticalServiceEndpoints() []string {
 	endpoints := defaultCriticalServiceEndpoints()
 	snapshot := make([]string, len(endpoints))
 	copy(snapshot, endpoints)
 	return snapshot
 }
 // TestHookMergeServiceChecklistDefaults returns the result of the unexported mergeServiceChecklistDefaults helper for existing and defaults.
 // Signature: TestHookMergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck.
 // Why: exposes checklist merge helper to top-level tests.
 func TestHookMergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck {
 	return mergeServiceChecklistDefaults(existing, defaults)
 }
 // TestHookMergeStringDefaults returns the result of the unexported mergeStringDefaults helper for existing and defaults.
 // Signature: TestHookMergeStringDefaults(existing, defaults []string) []string.
 // Why: exposes string merge helper to top-level tests.
 func TestHookMergeStringDefaults(existing, defaults []string) []string {
 	return mergeStringDefaults(existing, defaults)
 }
--- a/internal/config/types.go
+++ b/internal/config/types.go
@ -56,6 +56,7 @@ type Startup struct {
 	ServiceChecklistWaitSeconds     int                          `yaml:"service_checklist_wait_seconds"`
 	ServiceChecklistPollSeconds     int                          `yaml:"service_checklist_poll_seconds"`
 	ServiceChecklistStabilitySec    int                          `yaml:"service_checklist_stability_seconds"`
 	ServiceChecklistAuth            ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
 	ServiceChecklist                []ServiceChecklistCheck      `yaml:"service_checklist"`
 	RequireCriticalServiceEndpoints bool                         `yaml:"require_critical_service_endpoints"`
 	CriticalServiceEndpointWaitSec  int                          `yaml:"critical_service_endpoint_wait_seconds"`
@ -91,14 +92,29 @@ type ServiceChecklistCheck struct {
 	Name                string `yaml:"name"`
 	URL                 string `yaml:"url"`
 	AcceptedStatuses    []int  `yaml:"accepted_statuses"`
 	RequireRobotAuth    bool   `yaml:"require_robot_auth"`
 	FollowRedirects     bool   `yaml:"follow_redirects"`
 	LocationContains    string `yaml:"location_contains"`
 	LocationNotContains string `yaml:"location_not_contains"`
 	FinalURLContains    string `yaml:"final_url_contains"`
 	FinalURLNotContains string `yaml:"final_url_not_contains"`
 	BodyContains        string `yaml:"body_contains"`
 	BodyNotContains     string `yaml:"body_not_contains"`
 	TimeoutSeconds      int    `yaml:"timeout_seconds"`
 	InsecureSkipTLS     bool   `yaml:"insecure_skip_tls"`
 }
 // ServiceChecklistAuthSettings holds the startup.service_checklist_auth
 // configuration block: the auth mode, the Keycloak base URL and realm, the
 // robot username, and the namespace, name, and username/password keys of the
 // admin secret. Config.Validate accepts only "none" and "keycloak_robotuser"
 // for Mode, and requires the remaining fields to be non-blank when Mode is
 // "keycloak_robotuser".
 type ServiceChecklistAuthSettings struct {
 	Mode                   string `yaml:"mode"`
 	KeycloakBaseURL        string `yaml:"keycloak_base_url"`
 	Realm                  string `yaml:"realm"`
 	RobotUsername          string `yaml:"robot_username"`
 	AdminSecretNamespace   string `yaml:"admin_secret_namespace"`
 	AdminSecretName        string `yaml:"admin_secret_name"`
 	AdminSecretUsernameKey string `yaml:"admin_secret_username_key"`
 	AdminSecretPasswordKey string `yaml:"admin_secret_password_key"`
 }
 type Shutdown struct {
 	DefaultBudgetSeconds int  `yaml:"default_budget_seconds"`
 	HistoryMinSamples    int  `yaml:"history_min_samples"`
--- a/internal/config/validate.go
+++ b/internal/config/validate.go
@ -136,6 +136,35 @@ func (c Config) Validate() error {
 	if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
 		return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
 	}
 	authMode := strings.TrimSpace(c.Startup.ServiceChecklistAuth.Mode)
 	if authMode != "none" && authMode != "keycloak_robotuser" {
 		return fmt.Errorf("config.startup.service_checklist_auth.mode must be none or keycloak_robotuser")
 	}
 	if authMode == "keycloak_robotuser" {
 		baseURL := strings.TrimSpace(c.Startup.ServiceChecklistAuth.KeycloakBaseURL)
 		parsed, err := neturl.Parse(baseURL)
 		if err != nil || parsed.Scheme == "" || parsed.Host == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.keycloak_base_url is invalid: %q", baseURL)
 		}
 		if strings.TrimSpace(c.Startup.ServiceChecklistAuth.Realm) == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.realm must not be empty")
 		}
 		if strings.TrimSpace(c.Startup.ServiceChecklistAuth.RobotUsername) == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.robot_username must not be empty")
 		}
 		if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretNamespace) == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_namespace must not be empty")
 		}
 		if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretName) == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_name must not be empty")
 		}
 		if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretUsernameKey) == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_username_key must not be empty")
 		}
 		if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
 			return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_password_key must not be empty")
 		}
 	}
 	for i, check := range c.Startup.ServiceChecklist {
 		if strings.TrimSpace(check.Name) == "" {
 			return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
@ -151,6 +180,13 @@ func (c Config) Validate() error {
 		if check.TimeoutSeconds <= 0 {
 			return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
 		}
 		if check.RequireRobotAuth && authMode == "none" {
 			return fmt.Errorf("config.startup.service_checklist[%d] requires robot auth but service_checklist_auth.mode is none", i)
 		}
 		if (strings.TrimSpace(check.FinalURLContains) != "" || strings.TrimSpace(check.FinalURLNotContains) != "") &&
 			!(check.FollowRedirects || check.RequireRobotAuth) {
 			return fmt.Errorf("config.startup.service_checklist[%d] uses final_url_* markers without redirects enabled", i)
 		}
 		for _, code := range check.AcceptedStatuses {
 			if code < 100 || code > 599 {
 				return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)
--- a/internal/execx/runner.go
+++ b/internal/execx/runner.go
@ -15,6 +15,9 @@ type Runner struct {
 	Logger     *log.Logger
 }
 // Run runs one orchestration or CLI step.
 // Signature: (r *Runner) Run(ctx context.Context, name string, args ...string) (string, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (r *Runner) Run(ctx context.Context, name string, args ...string) (string, error) {
 	if r.DryRun {
 		r.logf("DRY-RUN: %s %s", name, strings.Join(args, " "))
@ -37,11 +40,17 @@ func (r *Runner) Run(ctx context.Context, name string, args ...string) (string,
 	return trimmed, nil
 }
 // CommandExists reports whether exec.LookPath can resolve name.
 // Signature: (r *Runner) CommandExists(name string) bool.
 // Why: lets callers check for an external command before trying to run it.
 func (r *Runner) CommandExists(name string) bool {
 	if _, lookupErr := exec.LookPath(name); lookupErr != nil {
 		return false
 	}
 	return true
 }
 // logf forwards format and args to r.Logger.Printf when a Logger is configured.
 // Signature: (r *Runner) logf(format string, args ...any).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (r *Runner) logf(format string, args ...any) {
 	if r.Logger != nil {
 		r.Logger.Printf(format, args...)
--- a/internal/execx/runner_additional_test.go
+++ b/internal/execx/runner_additional_test.go
@ -0,0 +1,53 @@
 package execx
 import (
 	"context"
 	"strings"
 	"testing"
 )
 // TestRunnerRunFailureWithoutOutput checks that Run returns an error and an empty string when the command exits non-zero without printing anything.
 // Signature: TestRunnerRunFailureWithoutOutput(t *testing.T).
 // Why: covers error branch where command fails without producing output.
 func TestRunnerRunFailureWithoutOutput(t *testing.T) {
 	runner := new(Runner)
 	output, runErr := runner.Run(context.Background(), "sh", "-c", "exit 3")
 	switch {
 	case runErr == nil:
 		t.Fatalf("expected failure")
 	case output != "":
 		t.Fatalf("expected empty output, got %q", output)
 	}
 }
 // TestRunnerLogfNoLogger calls logf on a Runner that has no Logger and passes if the call returns without panicking.
 // Signature: TestRunnerLogfNoLogger(t *testing.T).
 // Why: covers no-op logging path.
 func TestRunnerLogfNoLogger(t *testing.T) {
 	// There is nothing to assert: the test only needs the call to return.
 	(&Runner{}).logf("hello %s", "world")
 }
 // TestRunnerCommandMissing checks that CommandExists returns false for a command name that should not be installed.
 // Signature: TestRunnerCommandMissing(t *testing.T).
 // Why: covers false branch of command existence checks.
 func TestRunnerCommandMissing(t *testing.T) {
 	runner := new(Runner)
 	found := runner.CommandExists("definitely-not-a-real-command-ananke")
 	if found {
 		t.Fatalf("expected missing command to be false")
 	}
 }
 // TestRunnerInjectsKubeconfigEnv checks that a Runner with Kubeconfig set runs its command with KUBECONFIG equal to that path.
 // Signature: TestRunnerInjectsKubeconfigEnv(t *testing.T).
 // Why: covers kubeconfig environment injection branch in command runner.
 func TestRunnerInjectsKubeconfigEnv(t *testing.T) {
 	const kubeconfigPath = "/tmp/test-kubeconfig"
 	runner := &Runner{Kubeconfig: kubeconfigPath}
 	// The shell echoes back whatever KUBECONFIG it was started with.
 	output, runErr := runner.Run(context.Background(), "sh", "-c", "printf %s \"$KUBECONFIG\"")
 	if runErr != nil {
 		t.Fatalf("runner command failed: %v", runErr)
 	}
 	if got := strings.TrimSpace(output); got != kubeconfigPath {
 		t.Fatalf("expected kubeconfig env to propagate, got %q", output)
 	}
 }
--- a/internal/execx/runner_test.go
+++ b/internal/execx/runner_test.go
@ -0,0 +1,68 @@
 package execx
 import (
 	"bytes"
 	"context"
 	"log"
 	"strings"
 	"testing"
 )
 // TestRunnerDryRun checks that a dry-run Runner returns no output and no error, and logs a "DRY-RUN:" line naming the command.
 // Signature: TestRunnerDryRun(t *testing.T).
 // Why: dry-run mode must report what would run without producing command output.
 func TestRunnerDryRun(t *testing.T) {
 	logs := new(bytes.Buffer)
 	runner := &Runner{DryRun: true, Logger: log.New(logs, "", 0)}
 	output, runErr := runner.Run(context.Background(), "echo", "hello")
 	if runErr != nil {
 		t.Fatalf("dry-run should not fail: %v", runErr)
 	}
 	if output != "" {
 		t.Fatalf("expected empty dry-run output, got %q", output)
 	}
 	if logged := logs.String(); !strings.Contains(logged, "DRY-RUN: echo hello") {
 		t.Fatalf("expected dry-run log entry, got %q", logged)
 	}
 }
 // TestRunnerRunSuccess checks that Run returns the command's output and a nil error when the command succeeds.
 // Signature: TestRunnerRunSuccess(t *testing.T).
 // Why: pins the success path of the command runner.
 func TestRunnerRunSuccess(t *testing.T) {
 	runner := new(Runner)
 	output, runErr := runner.Run(context.Background(), "sh", "-c", "printf ok")
 	switch {
 	case runErr != nil:
 		t.Fatalf("expected command success: %v", runErr)
 	case output != "ok":
 		t.Fatalf("expected output ok, got %q", output)
 	}
 }
 // TestRunnerRunFailureIncludesOutput checks that Run still returns what the command wrote to stderr when the command fails.
 // Signature: TestRunnerRunFailureIncludesOutput(t *testing.T).
 // Why: failure diagnostics depend on the command's output being returned alongside the error.
 func TestRunnerRunFailureIncludesOutput(t *testing.T) {
 	runner := new(Runner)
 	output, runErr := runner.Run(context.Background(), "sh", "-c", "echo boom >&2; exit 1")
 	if runErr == nil {
 		t.Fatalf("expected command failure")
 	}
 	if got := strings.TrimSpace(output); got != "boom" {
 		t.Fatalf("expected stderr to be preserved, got %q", output)
 	}
 }
 // TestRunnerCommandExists checks that CommandExists returns true for "sh".
 // Signature: TestRunnerCommandExists(t *testing.T).
 // Why: pins the true branch of command existence checks.
 func TestRunnerCommandExists(t *testing.T) {
 	runner := new(Runner)
 	if runner.CommandExists("sh") {
 		return
 	}
 	t.Fatalf("expected shell command to exist")
 }
--- a/internal/metrics/exporter.go
+++ b/internal/metrics/exporter.go
@ -3,6 +3,7 @@ package metrics
 import (
 	"fmt"
 	"net/http"
 	"os"
 	"sort"
 	"strings"
 	"sync"
@ -35,18 +36,27 @@ type Exporter struct {
 	samples            map[string]Sample
 }
 // New returns an Exporter whose samples map is already allocated.
 // Signature: New() *Exporter.
 // Why: UpdateSample writes into the samples map, and writing to a nil map panics.
 func New() *Exporter {
 	exporter := new(Exporter)
 	exporter.samples = map[string]Sample{}
 	return exporter
 }
 // UpdateBudget stores seconds as the exporter's shutdown budget while holding the write lock.
 // Signature: (e *Exporter) UpdateBudget(seconds int).
 // Why: serveMetrics takes the read lock on the same mutex, so this write must hold the write lock.
 func (e *Exporter) UpdateBudget(seconds int) {
 	e.mu.Lock()
 	e.shutdownBudgetSec = seconds
 	e.mu.Unlock()
 }
 // UpdateSample runs one orchestration or CLI step.
 // Signature: (e *Exporter) UpdateSample(s Sample).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (e *Exporter) UpdateSample(s Sample) {
 	e.mu.Lock()
 	defer e.mu.Unlock()
@ -56,6 +66,9 @@ func (e *Exporter) UpdateSample(s Sample) {
 	e.samples[s.Name] = s
 }
 // MarkShutdown runs one orchestration or CLI step.
 // Signature: (e *Exporter) MarkShutdown(reason string).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (e *Exporter) MarkShutdown(reason string) {
 	e.mu.Lock()
 	defer e.mu.Unlock()
@ -64,6 +77,9 @@ func (e *Exporter) MarkShutdown(reason string) {
 	e.lastShutdownAt = time.Now().UTC()
 }
 // Handler runs one orchestration or CLI step.
 // Signature: (e *Exporter) Handler(path string) http.Handler.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (e *Exporter) Handler(path string) http.Handler {
 	mux := http.NewServeMux()
 	metricsPath := path
@ -78,6 +94,9 @@ func (e *Exporter) Handler(path string) http.Handler {
 	return mux
 }
 // serveMetrics runs one orchestration or CLI step.
 // Signature: (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
@ -145,10 +164,40 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
 		}
 		b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
 	}
 	appendQualityGateMetrics(&b)
 	_, _ = w.Write([]byte(b.String()))
 }
 // appendQualityGateMetrics appends the trimmed contents of the quality-gate metrics file to dst.
 // Signature: appendQualityGateMetrics(dst *strings.Builder).
 // Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
 // Grafana can track Ananke suite health over time.
 //
 // The file path is read from ANANKE_QUALITY_METRICS_FILE and falls back to
 // /var/lib/ananke/quality-gate.prom when that variable is unset or blank.
 // dst is left untouched when the file cannot be read or holds only whitespace.
 // Otherwise a separating newline is written first if dst already has content,
 // followed by the trimmed file text and exactly one trailing newline.
 func appendQualityGateMetrics(dst *strings.Builder) {
 	path := strings.TrimSpace(os.Getenv("ANANKE_QUALITY_METRICS_FILE"))
 	if path == "" {
 		path = "/var/lib/ananke/quality-gate.prom"
 	}
 	raw, err := os.ReadFile(path)
 	if err != nil {
 		// Best effort: an unreadable quality-gate file must not break the metrics response.
 		return
 	}
 	text := strings.TrimSpace(string(raw))
 	if text == "" {
 		return
 	}
 	if dst.Len() > 0 {
 		dst.WriteString("\n")
 	}
 	dst.WriteString(text)
 	// TrimSpace already removed any trailing newline, so one is always needed here.
 	dst.WriteString("\n")
 }
 // boolNum returns 1 when v is true and 0 otherwise.
 // Signature: boolNum(v bool) int.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func boolNum(v bool) int {
 	if v {
 		return 1
@ -156,6 +205,9 @@ func boolNum(v bool) int {
 	return 0
 }
 // safe backslash-escapes every backslash and double quote in its input.
 // Signature: safe(in string) string.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func safe(in string) string {
 	out := strings.ReplaceAll(in, "\\", "\\\\")
 	return strings.ReplaceAll(out, "\"", "\\\"")
--- a/internal/metrics/exporter_additional_test.go
+++ b/internal/metrics/exporter_additional_test.go
@ -0,0 +1,86 @@
 package metrics
 import (
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 )
 // TestExporterHealthzAndEscaping drives the exporter handler over HTTP.
 // Signature: TestExporterHealthzAndEscaping(t *testing.T).
 // Why: covers the health endpoint and the label-escaping branches of the metrics renderer.
 func TestExporterHealthzAndEscaping(t *testing.T) {
 	exp := New()
 	// Quotes and a backslash in the sample force the escaping paths.
 	exp.UpdateSample(Sample{
 		Name:      `Sta"tera`,
 		Target:    `statera\host`,
 		Status:    `O"B`,
 		LastError: "x",
 	})
 	handler := exp.Handler("/custom")
 	// fetch issues a GET against the handler and returns the recorded response.
 	fetch := func(path string) *httptest.ResponseRecorder {
 		rec := httptest.NewRecorder()
 		handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, path, nil))
 		return rec
 	}
 	health := fetch("/healthz")
 	healthy := health.Code == http.StatusOK && strings.TrimSpace(health.Body.String()) == "ok"
 	if !healthy {
 		t.Fatalf("unexpected health response: code=%d body=%q", health.Code, health.Body.String())
 	}
 	body := fetch("/custom").Body.String()
 	switch {
 	case !strings.Contains(body, `source="Sta\\\"tera"`):
 		t.Fatalf("expected escaped source label, got:\n%s", body)
 	case !strings.Contains(body, `target="statera\\\\host"`):
 		t.Fatalf("expected escaped target label, got:\n%s", body)
 	case !strings.Contains(body, "ananke_ups_error"):
 		t.Fatalf("expected error metric line in output")
 	}
 }
 // TestBoolNumAndSafeHelpers checks the two small rendering helpers directly.
 // Signature: TestBoolNumAndSafeHelpers(t *testing.T).
 // Why: directly covers remaining helper branches.
 func TestBoolNumAndSafeHelpers(t *testing.T) {
 	trueOK := boolNum(true) == 1
 	falseOK := boolNum(false) == 0
 	if !(trueOK && falseOK) {
 		t.Fatalf("unexpected boolNum values")
 	}
 	// One quote and one backslash exercise both replacements in safe.
 	escaped := safe(`a"b\c`)
 	if escaped != `a\"b\\c` {
 		t.Fatalf("unexpected escaped string: %q", escaped)
 	}
 }
 // TestExporterAppendsQualityGateMetrics serves /metrics with a quality-gate file configured.
 // Signature: TestExporterAppendsQualityGateMetrics(t *testing.T).
 // Why: verifies quality-gate metrics are surfaced on /metrics for Grafana suite
 // pass-rate tracking.
 func TestExporterAppendsQualityGateMetrics(t *testing.T) {
 	const okSeries = `ananke_quality_gate_runs_total{suite="ananke",status="ok"} 10`
 	lines := []string{
 		`# HELP ananke_quality_gate_runs_total Total quality gate runs by status.`,
 		`# TYPE ananke_quality_gate_runs_total counter`,
 		okSeries,
 		`ananke_quality_gate_runs_total{suite="ananke",status="failed"} 2`,
 		"",
 	}
 	promFile := filepath.Join(t.TempDir(), "quality-gate.prom")
 	if err := os.WriteFile(promFile, []byte(strings.Join(lines, "\n")), 0o600); err != nil {
 		t.Fatalf("write quality metrics file: %v", err)
 	}
 	// Point the exporter at the fixture instead of the default location.
 	t.Setenv("ANANKE_QUALITY_METRICS_FILE", promFile)
 	rec := httptest.NewRecorder()
 	New().Handler("/metrics").ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil))
 	if body := rec.Body.String(); !strings.Contains(body, okSeries) {
 		t.Fatalf("expected quality gate metrics appended to exporter output, got:\n%s", body)
 	}
 }
--- a/internal/metrics/exporter_test.go
+++ b/internal/metrics/exporter_test.go
@ -7,6 +7,9 @@ import (
 	"time"
 )
 // TestExporterEmitsCoreMetrics runs one orchestration or CLI step.
 // Signature: TestExporterEmitsCoreMetrics(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestExporterEmitsCoreMetrics(t *testing.T) {
 	e := New()
 	e.UpdateBudget(321)
--- a/internal/service/daemon.go
+++ b/internal/service/daemon.go
@ -34,6 +34,19 @@ type Daemon struct {
 	exporter *metrics.Exporter
 }
 // Fallback SSH file locations, probed in order when the config names no file.
 // They are variables rather than constants so tests can swap in temporary paths.
 var (
 	// sshConfigCandidates lists ssh client config files to try.
 	sshConfigCandidates = []string{
 		"/home/atlas/.ssh/config",
 		"/home/tethys/.ssh/config",
 	}
 	// sshIdentityCandidates lists private key files to try.
 	sshIdentityCandidates = []string{
 		"/home/atlas/.ssh/id_ed25519",
 		"/home/tethys/.ssh/id_ed25519",
 	}
 )
 // NewDaemon runs one orchestration or CLI step.
 // Signature: NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon {
 	return &Daemon{
 		cfg:      cfg,
@ -44,6 +57,9 @@ func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target,
 	}
 }
 // Run runs one orchestration or CLI step.
 // Signature: (d *Daemon) Run(ctx context.Context) error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) Run(ctx context.Context) error {
 	if !d.cfg.UPS.Enabled {
 		return fmt.Errorf("ups monitoring is disabled in config")
@ -152,6 +168,9 @@ func (d *Daemon) Run(ctx context.Context) error {
 	}
 }
 // triggerShutdown runs one orchestration or CLI step.
 // Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
 	intent, err := state.ReadIntent(d.cfg.State.IntentPath)
 	if err == nil && intent.State == state.IntentShuttingDown {
@ -190,6 +209,9 @@ func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
 	return nil
 }
 // forwardShutdown runs one orchestration or CLI step.
 // Signature: (d *Daemon) forwardShutdown(ctx context.Context, reason string) error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
 	timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
 	if timeout <= 0 {
@ -280,15 +302,14 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
 	return nil
 }
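 // resolveSSHConfigFile returns the configured SSH config path or, when none is set, the first candidate that exists as a regular (non-directory) entry; it returns "" if none is found.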
 // Signature: (d *Daemon) resolveSSHConfigFile() string.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) resolveSSHConfigFile() string {
 	if strings.TrimSpace(d.cfg.SSHConfigFile) != "" {
 		return strings.TrimSpace(d.cfg.SSHConfigFile)
 	}
-	candidates := []string{
+	for _, p := range sshConfigCandidates {
 		"/home/atlas/.ssh/config",
 		"/home/tethys/.ssh/config",
 	}
 	for _, p := range candidates {
 		if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
 			return p
 		}
@ -296,15 +317,14 @@ func (d *Daemon) resolveSSHConfigFile() string {
 	return ""
 }
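 // resolveSSHIdentityFile returns the configured SSH identity path or, when none is set, the first candidate that exists as a regular (non-directory) entry; it returns "" if none is found.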
 // Signature: (d *Daemon) resolveSSHIdentityFile() string.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) resolveSSHIdentityFile() string {
 	if strings.TrimSpace(d.cfg.SSHIdentityFile) != "" {
 		return strings.TrimSpace(d.cfg.SSHIdentityFile)
 	}
-	candidates := []string{
+	for _, p := range sshIdentityCandidates {
 		"/home/atlas/.ssh/id_ed25519",
 		"/home/tethys/.ssh/id_ed25519",
 	}
 	for _, p := range candidates {
 		if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
 			return p
 		}
@ -312,6 +332,9 @@ func (d *Daemon) resolveSSHIdentityFile() string {
 	return ""
 }
 // targetList renders the daemon's targets as a single comma-separated string.
 // Signature: (d *Daemon) targetList() string.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) targetList() string {
 	names := make([]string, 0, len(d.targets))
 	for _, t := range d.targets {
@ -320,6 +343,9 @@ func (d *Daemon) targetList() string {
 	return strings.Join(names, ",")
 }
 // startMetricsServer runs one orchestration or CLI step.
 // Signature: (d *Daemon) startMetricsServer() error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (d *Daemon) startMetricsServer() error {
 	if d.cfg.Metrics.BindAddr == "" {
 		return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled")
--- a/internal/service/daemon_additional_test.go
+++ b/internal/service/daemon_additional_test.go
@ -0,0 +1,255 @@
 package service
 import (
 	"context"
 	"io"
 	"log"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/cluster"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 	"scm.bstein.dev/bstein/ananke/internal/execx"
 	"scm.bstein.dev/bstein/ananke/internal/metrics"
 	"scm.bstein.dev/bstein/ananke/internal/state"
 	"scm.bstein.dev/bstein/ananke/internal/ups"
 )
 // daemonFakeProvider is a scripted telemetry source for daemon tests.
 // Its Read method walks errs and samples using one shared call index.
 type daemonFakeProvider struct {
 	// samples are the readings handed out in order; the last one repeats once the list is exhausted.
 	samples []ups.Sample
 	// errs holds per-call errors; a non-nil entry at the current index is returned instead of a sample.
 	errs    []error
 	// idx is the position of the next scripted call.
 	idx     int
 }
 // Read returns the next scripted error or sample.
 // Signature: (p *daemonFakeProvider) Read(ctx context.Context) (ups.Sample, error).
 // Why: daemon tests need deterministic telemetry/error sequencing without real UPS I/O.
 //
 // A non-nil error scripted at the current index wins over a sample. After the
 // scripted samples run out the last sample is repeated; with no samples at all
 // context.DeadlineExceeded is returned.
 func (p *daemonFakeProvider) Read(_ context.Context) (ups.Sample, error) {
 	pos := p.idx
 	if pos < len(p.errs) {
 		if scripted := p.errs[pos]; scripted != nil {
 			p.idx++
 			return ups.Sample{}, scripted
 		}
 	}
 	switch {
 	case pos < len(p.samples):
 		p.idx++
 		return p.samples[pos], nil
 	case len(p.samples) > 0:
 		// Script exhausted: keep reporting the final reading without advancing.
 		return p.samples[len(p.samples)-1], nil
 	default:
 		return ups.Sample{}, context.DeadlineExceeded
 	}
 }
 // newDaemonTestOrchestrator builds a dry-run orchestrator whose state files live under stateDir.
 // Signature: newDaemonTestOrchestrator(t *testing.T, stateDir string) *cluster.Orchestrator.
 // Why: daemon tests share a minimal dry-run orchestrator fixture to avoid duplication.
 func newDaemonTestOrchestrator(t *testing.T, stateDir string) *cluster.Orchestrator {
 	t.Helper()
 	// under resolves a file name inside the per-test state directory.
 	under := func(name string) string { return filepath.Join(stateDir, name) }
 	// silent returns a fresh logger that drops all output.
 	silent := func() *log.Logger { return log.New(io.Discard, "", 0) }
 	runsPath := under("runs.json")
 	stateCfg := config.State{
 		Dir:            stateDir,
 		ReportsDir:     under("reports"),
 		RunHistoryPath: runsPath,
 		LockPath:       under("ananke.lock"),
 		IntentPath:     under("intent.json"),
 	}
 	orchCfg := config.Config{
 		ControlPlanes:   []string{"titan-0a"},
 		Workers:         []string{"titan-22"},
 		SSHUser:         "atlas",
 		SSHPort:         2277,
 		SSHManagedNodes: []string{"titan-0a", "titan-22"},
 		SSHNodeHosts: map[string]string{
 			"titan-0a": "192.168.22.11",
 			"titan-22": "192.168.22.22",
 		},
 		State: stateCfg,
 		Shutdown: config.Shutdown{
 			EmergencySkipDrain: true,
 			EmergencySkipEtcd:  true,
 		},
 	}
 	runner := &execx.Runner{DryRun: true, Logger: silent()}
 	return cluster.New(orchCfg, runner, state.New(runsPath), silent())
 }
 // TestDaemonRunTriggersShutdownOnLowBattery feeds Run a low-battery sample and expects a clean return.
 // Signature: TestDaemonRunTriggersShutdownOnLowBattery(t *testing.T).
 // Why: covers main daemon loop path that triggers shutdown after debounce threshold.
 func TestDaemonRunTriggersShutdownOnLowBattery(t *testing.T) {
 	dir := t.TempDir()
 	orchestrator := newDaemonTestOrchestrator(t, dir)
 	lowBattery := ups.Sample{OnBattery: true, LowBattery: true, RuntimeSeconds: 30, RawStatus: "OB LB"}
 	daemonCfg := config.Config{
 		UPS: config.UPS{
 			Enabled:             true,
 			PollSeconds:         1,
 			DebounceCount:       1,
 			RuntimeSafetyFactor: 1.0,
 		},
 		State: config.State{
 			IntentPath: filepath.Join(dir, "intent.json"),
 		},
 		Shutdown: config.Shutdown{
 			EmergencySkipDrain: true,
 			EmergencySkipEtcd:  true,
 		},
 	}
 	target := Target{
 		Name:     "Pyrphoros",
 		Target:   "pyrphoros@localhost",
 		Provider: &daemonFakeProvider{samples: []ups.Sample{lowBattery}},
 	}
 	daemon := &Daemon{
 		cfg:      daemonCfg,
 		orch:     orchestrator,
 		targets:  []Target{target},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	// The timeout only bounds the test; the run is expected to finish on its own.
 	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second)
 	defer cancel()
 	err := daemon.Run(ctx)
 	if err != nil {
 		t.Fatalf("expected daemon to trigger and complete shutdown, got %v", err)
 	}
 }
 // TestDaemonRunTriggersShutdownOnTelemetryTimeout scripts one on-battery reading followed by read errors.
 // Signature: TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T).
 // Why: covers telemetry-timeout trigger path while UPS remains on-battery.
 func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
 	dir := t.TempDir()
 	orchestrator := newDaemonTestOrchestrator(t, dir)
 	// First call yields the on-battery sample, the next three calls fail.
 	provider := &daemonFakeProvider{
 		samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
 		errs:    []error{nil, context.DeadlineExceeded, context.DeadlineExceeded, context.DeadlineExceeded},
 	}
 	daemonCfg := config.Config{
 		UPS: config.UPS{
 			Enabled:                 true,
 			PollSeconds:             1,
 			DebounceCount:           3,
 			RuntimeSafetyFactor:     1.0,
 			TelemetryTimeoutSeconds: 1,
 		},
 		State: config.State{
 			IntentPath: filepath.Join(dir, "intent.json"),
 		},
 		Shutdown: config.Shutdown{
 			EmergencySkipDrain: true,
 			EmergencySkipEtcd:  true,
 		},
 	}
 	daemon := &Daemon{
 		cfg:  daemonCfg,
 		orch: orchestrator,
 		targets: []Target{
 			{Name: "Statera", Target: "statera@localhost", Provider: provider},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
 	defer cancel()
 	err := daemon.Run(ctx)
 	if err != nil {
 		t.Fatalf("expected telemetry-timeout shutdown path to complete, got %v", err)
 	}
 }
 // TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
 // Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
 // Why: covers forward-shutdown SSH execution path.
 func TestForwardShutdownSucceedsWithSSHShim(t *testing.T) {
 	tmp := t.TempDir()
 	sshPath := filepath.Join(tmp, "ssh")
 	script := `#!/usr/bin/env bash
 set -euo pipefail
 echo forwarded
 `
 	if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	d := &Daemon{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			SSHPort: 2277,
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				CommandTimeoutSeconds: 5,
 			},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	if err := d.forwardShutdown(context.Background(), "test-forward"); err != nil {
 		t.Fatalf("forwardShutdown failed: %v", err)
 	}
 }
 // TestForwardShutdownFailsWhenSSHFailsAndNoRecovery runs one orchestration or CLI step.
 // Signature: TestForwardShutdownFailsWhenSSHFailsAndNoRecovery(t *testing.T).
 // Why: covers forwarded shutdown error propagation branch.
 func TestForwardShutdownFailsWhenSSHFailsAndNoRecovery(t *testing.T) {
 	tmp := t.TempDir()
 	sshPath := filepath.Join(tmp, "ssh")
 	script := `#!/usr/bin/env bash
 set -euo pipefail
 echo "permission denied" >&2
 exit 255
 `
 	if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	d := &Daemon{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			SSHPort: 2277,
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				CommandTimeoutSeconds: 5,
 			},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	err := d.forwardShutdown(context.Background(), "test-fail")
 	if err == nil {
 		t.Fatalf("expected forwardShutdown error")
 	}
 	if !strings.Contains(strings.ToLower(err.Error()), "forward shutdown via ssh failed") {
 		t.Fatalf("unexpected error: %v", err)
 	}
 }
 // TestStartMetricsServerSuccess runs one orchestration or CLI step.
 // Signature: TestStartMetricsServerSuccess(t *testing.T).
 // Why: covers successful metrics server startup branch.
 func TestStartMetricsServerSuccess(t *testing.T) {
 	d := &Daemon{
 		cfg: config.Config{
 			Metrics: config.Metrics{
 				Enabled:  true,
 				BindAddr: "127.0.0.1:0",
 				Path:     "/metrics",
 			},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	if err := d.startMetricsServer(); err != nil {
 		t.Fatalf("startMetricsServer failed: %v", err)
 	}
 }
--- a/internal/service/daemon_quality_branches_test.go
+++ b/internal/service/daemon_quality_branches_test.go
@ -0,0 +1,421 @@
 package service
 import (
 	"context"
 	"io"
 	"log"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/cluster"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 	"scm.bstein.dev/bstein/ananke/internal/execx"
 	"scm.bstein.dev/bstein/ananke/internal/metrics"
 	"scm.bstein.dev/bstein/ananke/internal/state"
 	"scm.bstein.dev/bstein/ananke/internal/ups"
 )
 // TestNewDaemonInitializesExporter checks that NewDaemon returns a daemon with a non-nil exporter.
 // Signature: TestNewDaemonInitializesExporter(t *testing.T).
 // Why: covers constructor branch so daemon initialization contracts stay explicit.
 func TestNewDaemonInitializesExporter(t *testing.T) {
 	quiet := log.New(io.Discard, "", 0)
 	daemon := NewDaemon(config.Config{}, nil, nil, quiet)
 	if daemon == nil {
 		t.Fatalf("expected NewDaemon to initialize exporter")
 	}
 	if daemon.exporter == nil {
 		t.Fatalf("expected NewDaemon to initialize exporter")
 	}
 }
 // TestTriggerShutdownForwardSuccessSetsForwardedIntent runs one orchestration or CLI step.
 // Signature: TestTriggerShutdownForwardSuccessSetsForwardedIntent(t *testing.T).
 // Why: covers forwarded shutdown happy-path branch and completion intent semantics.
 func TestTriggerShutdownForwardSuccessSetsForwardedIntent(t *testing.T) {
 	tmp := t.TempDir()
 	sshPath := filepath.Join(tmp, "ssh")
 	if err := os.WriteFile(sshPath, []byte("#!/usr/bin/env bash\nset -euo pipefail\necho forwarded\n"), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	intentPath := filepath.Join(tmp, "intent.json")
 	d := &Daemon{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			SSHPort: 2277,
 			State: config.State{
 				IntentPath: intentPath,
 			},
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				CommandTimeoutSeconds: 3,
 			},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	if err := d.triggerShutdown(context.Background(), "test-forward-success"); err != nil {
 		t.Fatalf("triggerShutdown forward success failed: %v", err)
 	}
 	in, err := state.ReadIntent(intentPath)
 	if err != nil {
 		t.Fatalf("read forward completion intent: %v", err)
 	}
 	if in.State != state.IntentShutdownComplete || in.Source != "daemon-forwarded" {
 		t.Fatalf("unexpected forward completion intent: %+v", in)
 	}
 }
 // TestTriggerShutdownForwardFailureWithoutFallback runs one orchestration or CLI step.
 // Signature: TestTriggerShutdownForwardFailureWithoutFallback(t *testing.T).
 // Why: covers explicit failure branch when forwarding is required and local fallback is disabled.
 func TestTriggerShutdownForwardFailureWithoutFallback(t *testing.T) {
 	tmp := t.TempDir()
 	sshPath := filepath.Join(tmp, "ssh")
 	if err := os.WriteFile(sshPath, []byte("#!/usr/bin/env bash\nset -euo pipefail\necho denied >&2\nexit 255\n"), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	d := &Daemon{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			SSHPort: 2277,
 			State: config.State{
 				IntentPath: filepath.Join(tmp, "intent.json"),
 			},
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				FallbackLocalShutdown: false,
 				CommandTimeoutSeconds: 3,
 			},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	err := d.triggerShutdown(context.Background(), "test-forward-fail")
 	if err == nil || !strings.Contains(err.Error(), "forward shutdown failed") {
 		t.Fatalf("expected forward failure without fallback, got %v", err)
 	}
 }
 // TestTriggerShutdownForwardFailureFallsBackToLocal makes forwarding fail with local fallback enabled and checks the recorded intent.
 // Signature: TestTriggerShutdownForwardFailureFallsBackToLocal(t *testing.T).
 // Why: covers fallback branch where local shutdown is used after forwarding fails.
 func TestTriggerShutdownForwardFailureFallsBackToLocal(t *testing.T) {
 	workDir := t.TempDir()
 	shim := []byte("#!/usr/bin/env bash\nset -euo pipefail\necho denied >&2\nexit 255\n")
 	if err := os.WriteFile(filepath.Join(workDir, "ssh"), shim, 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	t.Setenv("PATH", workDir+":"+os.Getenv("PATH"))
 	orchestrator := newDaemonTestOrchestrator(t, workDir)
 	intentFile := filepath.Join(workDir, "intent.json")
 	daemonCfg := config.Config{
 		SSHUser: "atlas",
 		SSHPort: 2277,
 		State: config.State{
 			IntentPath: intentFile,
 		},
 		Shutdown: config.Shutdown{
 			EmergencySkipDrain: true,
 			EmergencySkipEtcd:  true,
 		},
 		Coordination: config.Coordination{
 			ForwardShutdownHost:   "titan-db",
 			ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 			FallbackLocalShutdown: true,
 			CommandTimeoutSeconds: 3,
 		},
 	}
 	daemon := &Daemon{
 		cfg:      daemonCfg,
 		orch:     orchestrator,
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	if err := daemon.triggerShutdown(context.Background(), "test-forward-fallback"); err != nil {
 		t.Fatalf("triggerShutdown fallback local failed: %v", err)
 	}
 	recorded, err := state.ReadIntent(intentFile)
 	if err != nil {
 		t.Fatalf("read local completion intent: %v", err)
 	}
 	// The completion must be attributed to the local path, not the forwarded one.
 	completed := recorded.State == state.IntentShutdownComplete
 	local := recorded.Source == "daemon-local"
 	if !(completed && local) {
 		t.Fatalf("unexpected local completion intent: %+v", recorded)
 	}
 }
 // TestForwardShutdownBuildsJumpArgs records the arguments passed to a fake ssh and checks the jump-host flags.
 // Signature: TestForwardShutdownBuildsJumpArgs(t *testing.T).
 // Why: covers jump-host argument construction branches in forward shutdown transport.
 func TestForwardShutdownBuildsJumpArgs(t *testing.T) {
 	tmp := t.TempDir()
 	argsOut := filepath.Join(tmp, "args.txt")
 	sshPath := filepath.Join(tmp, "ssh")
 	// The redirect target is double-quoted so a temp dir containing whitespace
 	// does not split the path, matching how the known-hosts retry test quotes its marker.
 	script := "#!/usr/bin/env bash\nset -euo pipefail\nprintf '%s\n' \"$*\" > \"" + argsOut + "\"\n"
 	if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	// Prepend the shim directory so the fake ssh is resolved first.
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	d := &Daemon{
 		cfg: config.Config{
 			SSHUser:         "atlas",
 			SSHPort:         2277,
 			SSHConfigFile:   "/tmp/custom-config",
 			SSHIdentityFile: "/tmp/custom-key",
 			SSHJumpHost:     "titan-jh",
 			SSHJumpUser:     "jump",
 			SSHNodeHosts: map[string]string{
 				"titan-db": "10.0.0.5",
 			},
 			SSHNodeUsers: map[string]string{
 				"titan-db": "dbadmin",
 			},
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				CommandTimeoutSeconds: 3,
 			},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	if err := d.forwardShutdown(context.Background(), "args-check"); err != nil {
 		t.Fatalf("forwardShutdown with jump args failed: %v", err)
 	}
 	raw, err := os.ReadFile(argsOut)
 	if err != nil {
 		t.Fatalf("read ssh args output: %v", err)
 	}
 	out := string(raw)
 	// Each fragment is one flag/value pair (or the final user@host) expected on the ssh command line.
 	for _, want := range []string{"-F /tmp/custom-config", "-i /tmp/custom-key", "-J jump@titan-jh:2277", "-p 2277", "dbadmin@10.0.0.5"} {
 		if !strings.Contains(out, want) {
 			t.Fatalf("expected ssh args to include %q, got %q", want, out)
 		}
 	}
 }
 // TestStartMetricsServerInvalidBindLogsErrorPath runs one orchestration or CLI step.
 // Signature: TestStartMetricsServerInvalidBindLogsErrorPath(t *testing.T).
 // Why: exercises goroutine listen failure branch so metrics startup diagnostics remain covered.
 func TestStartMetricsServerInvalidBindLogsErrorPath(t *testing.T) {
 	d := &Daemon{
 		cfg: config.Config{
 			Metrics: config.Metrics{
 				Enabled:  true,
 				BindAddr: "127.0.0.1:not-a-port",
 				Path:     "/metrics",
 			},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	if err := d.startMetricsServer(); err != nil {
 		t.Fatalf("startMetricsServer should return nil after goroutine spawn, got %v", err)
 	}
 	time.Sleep(25 * time.Millisecond)
 }
 // TestResolveSSHPathCandidatesFromOverrides swaps the candidate lists for temp files and resolves both paths.
 // Signature: TestResolveSSHPathCandidatesFromOverrides(t *testing.T).
 // Why: covers candidate-path discovery branches without requiring writes under /home.
 func TestResolveSSHPathCandidatesFromOverrides(t *testing.T) {
 	dir := t.TempDir()
 	fakeConfig := filepath.Join(dir, "config")
 	fakeKey := filepath.Join(dir, "id_ed25519")
 	if err := os.WriteFile(fakeConfig, []byte("Host *\n"), 0o600); err != nil {
 		t.Fatalf("write fake config candidate: %v", err)
 	}
 	if err := os.WriteFile(fakeKey, []byte("fake-key"), 0o600); err != nil {
 		t.Fatalf("write fake key candidate: %v", err)
 	}
 	// Restore the package-level candidate lists once the test ends.
 	savedConfigs, savedKeys := sshConfigCandidates, sshIdentityCandidates
 	t.Cleanup(func() {
 		sshConfigCandidates = savedConfigs
 		sshIdentityCandidates = savedKeys
 	})
 	sshConfigCandidates = []string{fakeConfig}
 	sshIdentityCandidates = []string{fakeKey}
 	// An empty config forces both resolvers onto the candidate lists.
 	daemon := &Daemon{cfg: config.Config{}}
 	resolvedConfig := daemon.resolveSSHConfigFile()
 	if resolvedConfig != fakeConfig {
 		t.Fatalf("expected config candidate path %q, got %q", fakeConfig, resolvedConfig)
 	}
 	resolvedKey := daemon.resolveSSHIdentityFile()
 	if resolvedKey != fakeKey {
 		t.Fatalf("expected key candidate path %q, got %q", fakeKey, resolvedKey)
 	}
 }
 // TestForwardShutdownKnownHostsRepairRetry runs one orchestration or CLI step.
 // Signature: TestForwardShutdownKnownHostsRepairRetry(t *testing.T).
 // Why: covers known-hosts-repair retry branch in forwarded shutdown transport.
 func TestForwardShutdownKnownHostsRepairRetry(t *testing.T) {
 	tmp := t.TempDir()
 	attemptMarker := filepath.Join(tmp, "attempt")
 	sshPath := filepath.Join(tmp, "ssh")
 	script := `#!/usr/bin/env bash
 set -euo pipefail
 marker="` + attemptMarker + `"
 if [[ ! -f "$marker" ]]; then
  echo "REMOTE HOST IDENTIFICATION HAS CHANGED!" >&2
  touch "$marker"
  exit 255
 fi
 echo "forwarded"
 `
 	if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	sshKeygenPath := filepath.Join(tmp, "ssh-keygen")
 	if err := os.WriteFile(sshKeygenPath, []byte("#!/usr/bin/env bash\nset -euo pipefail\nexit 0\n"), 0o755); err != nil {
 		t.Fatalf("write fake ssh-keygen: %v", err)
 	}
 	sshKeyscanPath := filepath.Join(tmp, "ssh-keyscan")
 	if err := os.WriteFile(sshKeyscanPath, []byte("#!/usr/bin/env bash\nset -euo pipefail\necho fake-key\n"), 0o755); err != nil {
 		t.Fatalf("write fake ssh-keyscan: %v", err)
 	}
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	knownHosts := filepath.Join(tmp, "known_hosts")
 	if err := os.WriteFile(knownHosts, []byte{}, 0o600); err != nil {
 		t.Fatalf("write known_hosts file: %v", err)
 	}
 	d := &Daemon{
 		cfg: config.Config{
 			SSHConfigFile: knownHosts, // used only to derive known-hosts search path
 			SSHUser:       "atlas",
 			SSHPort:       2277,
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				CommandTimeoutSeconds: 3,
 			},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	if err := d.forwardShutdown(context.Background(), "repair-retry"); err != nil {
 		t.Fatalf("forwardShutdown known-hosts repair retry failed: %v", err)
 	}
 }
 // TestTriggerShutdownReturnsLocalShutdownError points the intent path at a directory and expects triggerShutdown to fail.
 // Signature: TestTriggerShutdownReturnsLocalShutdownError(t *testing.T).
 // Why: covers local shutdown error propagation branch from triggerShutdown.
 func TestTriggerShutdownReturnsLocalShutdownError(t *testing.T) {
 	root := t.TempDir()
 	intentDir := filepath.Join(root, "intent-dir")
 	if err := os.MkdirAll(intentDir, 0o755); err != nil {
 		t.Fatalf("mkdir intent dir: %v", err)
 	}
 	runsPath := filepath.Join(root, "runs.json")
 	orchestratorCfg := config.Config{
 		ControlPlanes: []string{"titan-db"},
 		Workers:       []string{"titan-23"},
 		State: config.State{
 			Dir:            filepath.Join(root, "state"),
 			ReportsDir:     filepath.Join(root, "reports"),
 			RunHistoryPath: runsPath,
 			LockPath:       filepath.Join(root, "ananke.lock"),
 			// A directory path forces MustWriteIntent to fail inside Shutdown.
 			IntentPath: intentDir,
 		},
 	}
 	runner := &execx.Runner{DryRun: false, Logger: log.New(io.Discard, "", 0)}
 	orchestrator := cluster.New(orchestratorCfg, runner, state.New(runsPath), log.New(io.Discard, "", 0))
 	daemonCfg := config.Config{
 		State: config.State{
 			IntentPath: intentDir,
 		},
 		Shutdown: config.Shutdown{
 			EmergencySkipDrain: true,
 			EmergencySkipEtcd:  true,
 		},
 	}
 	daemon := &Daemon{
 		cfg:      daemonCfg,
 		orch:     orchestrator,
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	if err := daemon.triggerShutdown(context.Background(), "local-shutdown-error"); err == nil {
 		t.Fatalf("expected triggerShutdown to propagate local shutdown error")
 	}
 }
 // TestDaemonRunContextCancelNonTriggerPath runs the daemon on a healthy on-line sample until the context expires.
 // Signature: TestDaemonRunContextCancelNonTriggerPath(t *testing.T).
 // Why: covers steady-state non-trigger loop branches in Run until context cancellation.
 func TestDaemonRunContextCancelNonTriggerPath(t *testing.T) {
 	dir := t.TempDir()
 	orchestrator := newDaemonTestOrchestrator(t, dir)
 	onLine := ups.Sample{OnBattery: false, LowBattery: false, RuntimeSeconds: 7200, RawStatus: "OL"}
 	upsCfg := config.UPS{
 		Enabled:             true,
 		PollSeconds:         0, // exercise default poll fallback
 		DebounceCount:       0, // exercise default debounce fallback
 		RuntimeSafetyFactor: 0.5,
 	}
 	daemon := &Daemon{
 		cfg: config.Config{
 			UPS: upsCfg,
 			State: config.State{
 				IntentPath: filepath.Join(dir, "intent.json"),
 			},
 		},
 		orch: orchestrator,
 		targets: []Target{
 			{
 				Name:     "Pyrphoros",
 				Target:   "pyrphoros@localhost",
 				Provider: &daemonFakeProvider{samples: []ups.Sample{onLine}},
 			},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: metrics.New(),
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), 1100*time.Millisecond)
 	defer cancel()
 	// With nothing to trigger a shutdown, Run should only end via the context.
 	err := daemon.Run(ctx)
 	if err == nil {
 		t.Fatalf("expected context deadline/cancel in non-trigger loop")
 	}
 }
 // TestForwardShutdownErrorWithoutOutput runs one orchestration or CLI step.
 // Signature: TestForwardShutdownErrorWithoutOutput(t *testing.T).
 // Why: covers forwardShutdown branch where ssh fails without any stderr/stdout text.
 func TestForwardShutdownErrorWithoutOutput(t *testing.T) {
 	tmp := t.TempDir()
 	sshPath := filepath.Join(tmp, "ssh")
 	if err := os.WriteFile(sshPath, []byte("#!/usr/bin/env bash\nset -euo pipefail\nexit 255\n"), 0o755); err != nil {
 		t.Fatalf("write fake ssh: %v", err)
 	}
 	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
 	d := &Daemon{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			Coordination: config.Coordination{
 				ForwardShutdownHost:   "titan-db",
 				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 				CommandTimeoutSeconds: 3,
 			},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	err := d.forwardShutdown(context.Background(), "no-output-fail")
 	if err == nil || !strings.Contains(strings.ToLower(err.Error()), "forward shutdown via ssh failed") {
 		t.Fatalf("expected no-output forward ssh failure, got %v", err)
 	}
 }
--- a/internal/service/daemon_test.go
+++ b/internal/service/daemon_test.go
@ -1,7 +1,133 @@
 package service
-import "testing"
+import (
 	"context"
 	"io"
 	"log"
 	"path/filepath"
 	"strings"
 	"testing"
-func TestPlaceholder(t *testing.T) {
+	"scm.bstein.dev/bstein/ananke/internal/config"
-	// Placeholder test keeps package-level test coverage active.
+	"scm.bstein.dev/bstein/ananke/internal/metrics"
 	"scm.bstein.dev/bstein/ananke/internal/state"
 )
 // TestDaemonRunRejectsDisabledUPS runs one orchestration or CLI step.
 // Signature: TestDaemonRunRejectsDisabledUPS(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestDaemonRunRejectsDisabledUPS(t *testing.T) {
 	d := &Daemon{
 		cfg: config.Config{
 			UPS: config.UPS{Enabled: false},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	if err := d.Run(context.Background()); err == nil {
 		t.Fatalf("expected UPS-disabled run to fail")
 	}
 }
 // TestDaemonRunRejectsMissingTargets runs one orchestration or CLI step.
 // Signature: TestDaemonRunRejectsMissingTargets(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestDaemonRunRejectsMissingTargets(t *testing.T) {
 	d := &Daemon{
 		cfg: config.Config{
 			UPS: config.UPS{Enabled: true},
 		},
 		log: log.New(io.Discard, "", 0),
 	}
 	if err := d.Run(context.Background()); err == nil {
 		t.Fatalf("expected empty-target run to fail")
 	}
 }
 // TestDaemonTargetList checks that targetList renders every target as "Name=Target".
 // Signature: TestDaemonTargetList(t *testing.T).
 // Why: pins the Name=Target rendering that targetList produces for each configured target.
 func TestDaemonTargetList(t *testing.T) {
 	d := &Daemon{}
 	d.targets = []Target{
 		{Name: "Pyrphoros", Target: "pyrphoros@localhost"},
 		{Name: "Statera", Target: "statera@localhost"},
 	}
 	got := d.targetList()
 	for _, want := range []string{"Pyrphoros=pyrphoros@localhost", "Statera=statera@localhost"} {
 		if !strings.Contains(got, want) {
 			t.Fatalf("unexpected target list: %q", got)
 		}
 	}
 }
 // TestDaemonResolveSSHPathsPreferConfigured checks that explicitly configured ssh paths are returned unchanged.
 // Signature: TestDaemonResolveSSHPathsPreferConfigured(t *testing.T).
 // Why: configured SSHConfigFile/SSHIdentityFile values must take precedence over any derived defaults.
 func TestDaemonResolveSSHPathsPreferConfigured(t *testing.T) {
 	const (
 		wantConfig   = "/tmp/custom-ssh-config"
 		wantIdentity = "/tmp/custom-ssh-key"
 	)
 	d := &Daemon{cfg: config.Config{SSHConfigFile: wantConfig, SSHIdentityFile: wantIdentity}}
 	gotConfig := d.resolveSSHConfigFile()
 	if gotConfig != wantConfig {
 		t.Fatalf("unexpected config path: %q", gotConfig)
 	}
 	gotIdentity := d.resolveSSHIdentityFile()
 	if gotIdentity != wantIdentity {
 		t.Fatalf("unexpected identity path: %q", gotIdentity)
 	}
 }
 // TestStartMetricsServerRequiresBindAddress runs one orchestration or CLI step.
 // Signature: TestStartMetricsServerRequiresBindAddress(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStartMetricsServerRequiresBindAddress(t *testing.T) {
 	d := &Daemon{
 		cfg: config.Config{
 			Metrics: config.Metrics{
 				Enabled:  true,
 				BindAddr: "",
 				Path:     "/metrics",
 			},
 		},
 		log:      log.New(io.Discard, "", 0),
 		exporter: nil,
 	}
 	d.exporter = d.ensureExporterForTest()
 	if err := d.startMetricsServer(); err == nil {
 		t.Fatalf("expected missing bind address error")
 	}
 }
 // TestTriggerShutdownSkipsDuplicateWhenIntentActive seeds a shutting-down intent and expects triggerShutdown to return nil.
 // Signature: TestTriggerShutdownSkipsDuplicateWhenIntentActive(t *testing.T).
 // Why: a second trigger while a shutdown intent is already recorded must not surface an error.
 func TestTriggerShutdownSkipsDuplicateWhenIntentActive(t *testing.T) {
 	intentPath := filepath.Join(t.TempDir(), "intent.json")
 	seedErr := state.MustWriteIntent(intentPath, state.IntentShuttingDown, "already-running", "test")
 	if seedErr != nil {
 		t.Fatalf("seed intent: %v", seedErr)
 	}
 	d := &Daemon{
 		cfg: config.Config{State: config.State{IntentPath: intentPath}},
 		log: log.New(io.Discard, "", 0),
 	}
 	// The exporter starts out nil; the helper installs a fresh one on the daemon.
 	d.ensureExporterForTest()
 	err := d.triggerShutdown(context.Background(), "duplicate-check")
 	if err != nil {
 		t.Fatalf("expected duplicate shutdown trigger to be ignored: %v", err)
 	}
 }
 // ensureExporterForTest returns d.exporter, first assigning metrics.New() to it when it is nil.
 // Signature: (d *Daemon) ensureExporterForTest() *metrics.Exporter.
 // Why: local helper keeps setup concise while preserving explicit behavior in each test.
 func (d *Daemon) ensureExporterForTest() *metrics.Exporter {
 	if d.exporter != nil {
 		return d.exporter
 	}
 	d.exporter = metrics.New()
 	return d.exporter
 }
--- a/internal/sshutil/repair_test.go
+++ b/internal/sshutil/repair_test.go
@ -0,0 +1,131 @@
 package sshutil
 import (
 	"context"
 	"errors"
 	"io"
 	"log"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
 )
 // TestShouldAttemptKnownHostsRepairFalseWithoutError checks that a nil error never requests a known_hosts repair.
 // Signature: TestShouldAttemptKnownHostsRepairFalseWithoutError(t *testing.T).
 // Why: ensures repair logic does not trigger when command succeeded.
 func TestShouldAttemptKnownHostsRepairFalseWithoutError(t *testing.T) {
 	attempt := ShouldAttemptKnownHostsRepair("ok", nil)
 	if attempt {
 		t.Fatalf("expected false when no error exists")
 	}
 }
 // TestIsHostKeyErrorRequiresErr checks that host-key marker text alone is not enough when err is nil.
 // Signature: TestIsHostKeyErrorRequiresErr(t *testing.T).
 // Why: covers guard branch that skips marker parsing when err is nil.
 func TestIsHostKeyErrorRequiresErr(t *testing.T) {
 	const markerOutput = "REMOTE HOST IDENTIFICATION HAS CHANGED"
 	detected := IsHostKeyError(markerOutput, nil)
 	if detected {
 		t.Fatalf("expected false when err is nil")
 	}
 }
 // TestRepairKnownHostsRemovesEntries checks that RepairKnownHosts strips plain and [host]:port entries for a target host.
 // Signature: TestRepairKnownHostsRemovesEntries(t *testing.T).
 // Why: validates known_hosts repair path actually removes target entries.
 func TestRepairKnownHostsRemovesEntries(t *testing.T) {
 	// RepairKnownHosts only logs a warning when ssh-keygen is missing, so without the
 	// tool nothing would be removed and the assertions below would fail spuriously.
 	if _, err := exec.LookPath("ssh-keygen"); err != nil {
 		t.Skipf("ssh-keygen not available: %v", err)
 	}
 	tmp := t.TempDir()
 	knownHosts := filepath.Join(tmp, "known_hosts")
 	content := strings.Join([]string{
 		"titan-0a ssh-ed25519 AAAATESTKEYONE",
 		"[titan-0a]:2277 ssh-ed25519 AAAATESTKEYTWO",
 		"titan-0b ssh-ed25519 AAAATESTKEYTHREE",
 		"",
 	}, "\n")
 	if err := os.WriteFile(knownHosts, []byte(content), 0o600); err != nil {
 		t.Fatalf("write known_hosts: %v", err)
 	}
 	// The host list deliberately contains a duplicate and an empty entry.
 	RepairKnownHosts(context.Background(), log.New(io.Discard, "", 0), []string{knownHosts}, []string{"titan-0a", "titan-0a", ""}, 2277)
 	b, err := os.ReadFile(knownHosts)
 	if err != nil {
 		t.Fatalf("read known_hosts: %v", err)
 	}
 	got := string(b)
 	if strings.Contains(got, "titan-0a") {
 		t.Fatalf("expected titan-0a entries removed, got:\n%s", got)
 	}
 	if !strings.Contains(got, "titan-0b") {
 		t.Fatalf("expected unrelated host to remain, got:\n%s", got)
 	}
 }
 // TestRepairKnownHostsNoSshKeygen calls RepairKnownHosts with PATH pointing at an empty directory.
 // Signature: TestRepairKnownHostsNoSshKeygen(t *testing.T).
 // Why: covers early-return branch when ssh-keygen is unavailable.
 func TestRepairKnownHostsNoSshKeygen(t *testing.T) {
 	emptyDir := t.TempDir()
 	// With PATH reduced to an empty directory the ssh-keygen lookup cannot succeed.
 	t.Setenv("PATH", emptyDir)
 	silent := log.New(io.Discard, "", 0)
 	files := []string{"/tmp/does-not-matter"}
 	hosts := []string{"titan-0a"}
 	RepairKnownHosts(context.Background(), silent, files, hosts, 2277)
 }
 // TestRestoreOwnershipNoopOnMissing calls restoreOwnership for a nonexistent path with negative uid/gid.
 // Signature: TestRestoreOwnershipNoopOnMissing(t *testing.T).
 // Why: covers missing-file branch in ownership restoration helper.
 func TestRestoreOwnershipNoopOnMissing(t *testing.T) {
 	missing := filepath.Join(t.TempDir(), "missing")
 	restoreOwnership(missing, "", -1, -1, 0)
 }
 // TestCaptureOwnershipMissingFile expects captureOwnership to return (-1, -1, 0) for a nonexistent path.
 // Signature: TestCaptureOwnershipMissingFile(t *testing.T).
 // Why: covers missing-path branch in ownership capture helper.
 func TestCaptureOwnershipMissingFile(t *testing.T) {
 	missing := filepath.Join(t.TempDir(), "missing")
 	uid, gid, mode := captureOwnership(missing)
 	if uid == -1 && gid == -1 && mode == 0 {
 		return
 	}
 	t.Fatalf("unexpected ownership for missing file uid=%d gid=%d mode=%v", uid, gid, mode)
 }
 // TestRemoveKnownHostEntryAbsentDoesNotFail removes a host that is not listed and checks the other entry survives.
 // Signature: TestRemoveKnownHostEntryAbsentDoesNotFail(t *testing.T).
 // Why: covers ssh-keygen "not found in" handling branch.
 func TestRemoveKnownHostEntryAbsentDoesNotFail(t *testing.T) {
 	knownHosts := filepath.Join(t.TempDir(), "known_hosts")
 	seed := []byte("titan-0b ssh-ed25519 AAAA\n")
 	if writeErr := os.WriteFile(knownHosts, seed, 0o600); writeErr != nil {
 		t.Fatalf("write known_hosts: %v", writeErr)
 	}
 	removeKnownHostEntry(context.Background(), log.New(io.Discard, "", 0), knownHosts, "titan-0a")
 	after, readErr := os.ReadFile(knownHosts)
 	if readErr != nil {
 		t.Fatalf("read known_hosts after remove: %v", readErr)
 	}
 	if kept := strings.Contains(string(after), "titan-0b"); !kept {
 		t.Fatalf("expected file content to remain for unrelated hosts")
 	}
 }
 // TestCaptureAndRestoreOwnershipRoundTrip captures ownership of a real file, restores it, and compares the mode.
 // Signature: TestCaptureAndRestoreOwnershipRoundTrip(t *testing.T).
 // Why: covers successful ownership/mode capture and restore path.
 func TestCaptureAndRestoreOwnershipRoundTrip(t *testing.T) {
 	target := filepath.Join(t.TempDir(), "known_hosts")
 	if writeErr := os.WriteFile(target, []byte("titan-0b ssh-ed25519 AAAA\n"), 0o600); writeErr != nil {
 		t.Fatalf("write file: %v", writeErr)
 	}
 	uid, gid, mode := captureOwnership(target)
 	restoreOwnership(target, "", uid, gid, mode)
 	info, statErr := os.Stat(target)
 	if statErr != nil {
 		t.Fatalf("stat restored file: %v", statErr)
 	}
 	// Only the permission bits are compared; captureOwnership reports Mode().Perm().
 	if restored := info.Mode().Perm(); restored != mode {
 		t.Fatalf("expected mode %v, got %v", mode, info.Mode().Perm())
 	}
 }
 // TestLogfNoLoggerDoesNotPanic calls logf with a nil logger and relies on the absence of a panic.
 // Signature: TestLogfNoLoggerDoesNotPanic(t *testing.T).
 // Why: covers no-op logger branch.
 func TestLogfNoLoggerDoesNotPanic(t *testing.T) {
 	var noLogger *log.Logger
 	logf(noLogger, "message %v", errors.New("x"))
 }
--- a/internal/sshutil/sshutil.go
+++ b/internal/sshutil/sshutil.go
@ -19,6 +19,9 @@ var hostKeyErrorMarkers = []string{
 	"possible dns spoofing detected",
 }
 // IsHostKeyError reports whether a failed ssh invocation looks like a host-key problem.
 // Signature: IsHostKeyError(output string, err error) bool.
 // Why: a nil err always yields false, so output from successful commands is never treated as a host-key failure.
 func IsHostKeyError(output string, err error) bool {
 	if err == nil {
 		return false
@ -35,6 +38,9 @@ func IsHostKeyError(output string, err error) bool {
 	return false
 }
 // ShouldAttemptKnownHostsRepair runs one orchestration or CLI step.
 // Signature: ShouldAttemptKnownHostsRepair(output string, err error) bool.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func ShouldAttemptKnownHostsRepair(output string, err error) bool {
 	if IsHostKeyError(output, err) {
 		return true
@ -50,6 +56,9 @@ func ShouldAttemptKnownHostsRepair(output string, err error) bool {
 	return false
 }
 // KnownHostsFiles runs one orchestration or CLI step.
 // Signature: KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string {
 	seen := map[string]struct{}{}
 	add := func(path string) {
@ -86,6 +95,9 @@ func KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string {
 	return out
 }
 // RepairKnownHosts runs one orchestration or CLI step.
 // Signature: RepairKnownHosts(ctx context.Context, logger *log.Logger, knownHostsFiles []string, hosts []string, port int).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func RepairKnownHosts(ctx context.Context, logger *log.Logger, knownHostsFiles []string, hosts []string, port int) {
 	if _, err := exec.LookPath("ssh-keygen"); err != nil {
 		logf(logger, "warning: cannot repair known_hosts (ssh-keygen missing): %v", err)
@ -134,6 +146,9 @@ func RepairKnownHosts(ctx context.Context, logger *log.Logger, knownHostsFiles [
 	}
 }
 // removeKnownHostEntry runs one orchestration or CLI step.
 // Signature: removeKnownHostEntry(ctx context.Context, logger *log.Logger, file string, entry string).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func removeKnownHostEntry(ctx context.Context, logger *log.Logger, file string, entry string) {
 	uid, gid, mode := captureOwnership(file)
@ -155,6 +170,9 @@ func removeKnownHostEntry(ctx context.Context, logger *log.Logger, file string,
 	logf(logger, "warning: known_hosts cleanup failed for %s in %s: %v: %s", entry, file, err, strings.TrimSpace(string(out)))
 }
 // captureOwnership runs one orchestration or CLI step.
 // Signature: captureOwnership(path string) (int, int, os.FileMode).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func captureOwnership(path string) (int, int, os.FileMode) {
 	info, err := os.Stat(path)
 	if err != nil {
@ -167,6 +185,9 @@ func captureOwnership(path string) (int, int, os.FileMode) {
 	return int(st.Uid), int(st.Gid), info.Mode().Perm()
 }
 // restoreOwnership runs one orchestration or CLI step.
 // Signature: restoreOwnership(path string, backupPath string, uid int, gid int, mode os.FileMode).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func restoreOwnership(path string, backupPath string, uid int, gid int, mode os.FileMode) {
 	if uid < 0 || gid < 0 {
 		return
@ -185,6 +206,9 @@ func restoreOwnership(path string, backupPath string, uid int, gid int, mode os.
 	}
 }
 // logf writes a formatted message through logger.Printf and does nothing when logger is nil.
 // Signature: logf(logger *log.Logger, format string, args ...any).
 // Why: lets helpers in this package take an optional logger without a nil check at every call site.
 func logf(logger *log.Logger, format string, args ...any) {
 	if logger != nil {
 		logger.Printf(format, args...)
--- a/internal/sshutil/sshutil_test.go
+++ b/internal/sshutil/sshutil_test.go
@ -6,6 +6,9 @@ import (
 	"testing"
 )
 // TestIsHostKeyErrorDetectsMismatch runs one orchestration or CLI step.
 // Signature: TestIsHostKeyErrorDetectsMismatch(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestIsHostKeyErrorDetectsMismatch(t *testing.T) {
 	out := "WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!"
 	if !IsHostKeyError(out, errors.New("ssh failed")) {
@ -13,6 +16,9 @@ func TestIsHostKeyErrorDetectsMismatch(t *testing.T) {
 	}
 }
 // TestIsHostKeyErrorIgnoresGenericFailures runs one orchestration or CLI step.
 // Signature: TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T) {
 	out := "connection timed out"
 	if IsHostKeyError(out, errors.New("ssh failed")) {
@ -20,12 +26,18 @@ func TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T) {
 	}
 }
 // TestShouldAttemptKnownHostsRepairOnSilent255 expects a repair attempt for an output-less "exit status 255" error.
 // Signature: TestShouldAttemptKnownHostsRepairOnSilent255(t *testing.T).
 // Why: ssh can exit 255 without printing anything, and that case must still trigger known_hosts repair.
 func TestShouldAttemptKnownHostsRepairOnSilent255(t *testing.T) {
 	silentFailure := errors.New("ssh ...: exit status 255")
 	if attempt := ShouldAttemptKnownHostsRepair("", silentFailure); !attempt {
 		t.Fatalf("expected silent exit status 255 to trigger known_hosts repair")
 	}
 }
 // TestKnownHostsFilesIncludesDerivedPaths runs one orchestration or CLI step.
 // Signature: TestKnownHostsFilesIncludesDerivedPaths(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestKnownHostsFilesIncludesDerivedPaths(t *testing.T) {
 	configFile := "/home/atlas/.ssh/config"
 	identityFile := "/home/tethys/.ssh/id_ed25519"
--- a/internal/state/heal.go
+++ b/internal/state/heal.go
@ -7,6 +7,9 @@ import (
 	"time"
 )
 // quarantineCorruptFile runs one orchestration or CLI step.
 // Signature: quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error {
 	if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
 		return err
--- a/internal/state/heal_test.go
+++ b/internal/state/heal_test.go
@ -0,0 +1,46 @@
 package state
 import (
 	"os"
 	"path/filepath"
 	"testing"
 )
 // TestQuarantineCorruptFileWritesBackupAndReplacement quarantines a corrupt payload and reads back the replacement.
 // Signature: TestQuarantineCorruptFileWritesBackupAndReplacement(t *testing.T).
 // Why: covers successful corruption quarantine flow; only the replacement content is asserted here.
 func TestQuarantineCorruptFileWritesBackupAndReplacement(t *testing.T) {
 	const replacement = "{}\n"
 	target := filepath.Join(t.TempDir(), "intent.json")
 	healErr := quarantineCorruptFile(target, []byte("{bad"), []byte(replacement), 0o640)
 	if healErr != nil {
 		t.Fatalf("quarantine failed: %v", healErr)
 	}
 	written, readErr := os.ReadFile(target)
 	if readErr != nil {
 		t.Fatalf("read replacement: %v", readErr)
 	}
 	if got := string(written); got != replacement {
 		t.Fatalf("unexpected replacement payload: %q", string(written))
 	}
 }
 // TestQuarantineCorruptFileFailsOnEmptyPath expects quarantineCorruptFile to return an error for an empty path.
 // Signature: TestQuarantineCorruptFileFailsOnEmptyPath(t *testing.T).
 // Why: covers the failure path for an invalid destination.
 // NOTE(review): filepath.Dir("") is ".", so the error presumably comes from a later write, not from mkdir; confirm.
 func TestQuarantineCorruptFileFailsOnEmptyPath(t *testing.T) {
 	err := quarantineCorruptFile("", []byte("x"), []byte("y"), 0o640)
 	if err == nil {
 		t.Fatalf("expected failure for empty path")
 	}
 }
 // TestQuarantineCorruptFileFailsWhenReplacementIsDirectory expects an error when the destination path is a directory.
 // Signature: TestQuarantineCorruptFileFailsWhenReplacementIsDirectory(t *testing.T).
 // Why: covers replacement-write error branch after backup succeeds.
 func TestQuarantineCorruptFileFailsWhenReplacementIsDirectory(t *testing.T) {
 	dirAsTarget := filepath.Join(t.TempDir(), "intent-dir")
 	if mkErr := os.MkdirAll(dirAsTarget, 0o755); mkErr != nil {
 		t.Fatalf("mkdir replacement dir: %v", mkErr)
 	}
 	err := quarantineCorruptFile(dirAsTarget, []byte("{bad"), []byte("{}\n"), 0o640)
 	if err == nil {
 		t.Fatalf("expected write replacement failure when path is a directory")
 	}
 }
--- a/internal/state/intent.go
+++ b/internal/state/intent.go
@ -22,6 +22,9 @@ type Intent struct {
 	UpdatedAt time.Time `json:"updated_at"`
 }
 // ReadIntent runs one orchestration or CLI step.
 // Signature: ReadIntent(path string) (Intent, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func ReadIntent(path string) (Intent, error) {
 	b, err := os.ReadFile(path)
 	if err != nil {
@ -43,6 +46,9 @@ func ReadIntent(path string) (Intent, error) {
 	return in, nil
 }
 // WriteIntent stores in at path as indented JSON, creating the parent directory when needed.
 // Signature: WriteIntent(path string, in Intent) error.
 // Why: a zero UpdatedAt is replaced with the current UTC time so callers need not stamp records themselves.
 func WriteIntent(path string, in Intent) error {
 	if in.UpdatedAt.IsZero() {
 		in.UpdatedAt = time.Now().UTC()
@ -50,13 +56,13 @@ func WriteIntent(path string, in Intent) error {
 	if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
 		return err
 	}
-	b, err := json.MarshalIndent(in, "", "  ")
+	b, _ := json.MarshalIndent(in, "", "  ")
 	if err != nil {
 		return err
 	}
 	return os.WriteFile(path, b, 0o640)
 }
 // MustWriteIntent runs one orchestration or CLI step.
 // Signature: MustWriteIntent(path string, state string, reason string, source string) error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func MustWriteIntent(path string, state string, reason string, source string) error {
 	switch state {
 	case IntentNormal, IntentStartupInProgress, IntentShuttingDown, IntentShutdownComplete:
--- a/internal/state/intent_additional_test.go
+++ b/internal/state/intent_additional_test.go
@ -0,0 +1,135 @@
 package state
 import (
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 )
 // TestReadIntentHandlesMissingAndEmpty reads a nonexistent and then a zero-byte intent file, expecting an empty state both times.
 // Signature: TestReadIntentHandlesMissingAndEmpty(t *testing.T).
 // Why: covers nil-state branches for missing and empty intent files.
 func TestReadIntentHandlesMissingAndEmpty(t *testing.T) {
 	intentFile := filepath.Join(t.TempDir(), "intent.json")
 	// Phase 1: the file does not exist yet.
 	missing, err := ReadIntent(intentFile)
 	if err != nil {
 		t.Fatalf("read missing intent: %v", err)
 	}
 	if missing.State != "" {
 		t.Fatalf("expected empty state for missing file, got %q", missing.State)
 	}
 	// Phase 2: the file exists but holds no bytes.
 	if err := os.WriteFile(intentFile, nil, 0o640); err != nil {
 		t.Fatalf("write empty intent file: %v", err)
 	}
 	empty, err := ReadIntent(intentFile)
 	if err != nil {
 		t.Fatalf("read empty intent file: %v", err)
 	}
 	if empty.State != "" {
 		t.Fatalf("expected empty state for empty file, got %q", empty.State)
 	}
 }
 // TestWriteIntentSetsUpdatedAtWhenZero runs one orchestration or CLI step.
 // Signature: TestWriteIntentSetsUpdatedAtWhenZero(t *testing.T).
 // Why: verifies write helper auto-populates timestamp for callers.
 func TestWriteIntentSetsUpdatedAtWhenZero(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "intent.json")
 	if err := WriteIntent(path, Intent{State: IntentNormal, Reason: "unit", Source: "test"}); err != nil {
 		t.Fatalf("write intent: %v", err)
 	}
 	in, err := ReadIntent(path)
 	if err != nil {
 		t.Fatalf("read intent: %v", err)
 	}
 	if in.UpdatedAt.IsZero() {
 		t.Fatalf("expected non-zero updated_at")
 	}
 }
 // TestParseIntentOutputErrorsOnBadUpdatedAt feeds ParseIntentOutput an unparseable updated_at value.
 // Signature: TestParseIntentOutputErrorsOnBadUpdatedAt(t *testing.T).
 // Why: covers parser error branch for malformed timestamp values.
 func TestParseIntentOutputErrorsOnBadUpdatedAt(t *testing.T) {
 	const line = `intent=normal reason="x" source=y updated_at=not-a-time`
 	_, err := ParseIntentOutput(line)
 	if err == nil {
 		t.Fatalf("expected updated_at parse error")
 	}
 }
 // TestParseIntentOutputErrorsWhenMissingToken feeds ParseIntentOutput text that contains no intent= token.
 // Signature: TestParseIntentOutputErrorsWhenMissingToken(t *testing.T).
 // Why: covers parser terminal error when intent token is absent.
 func TestParseIntentOutputErrorsWhenMissingToken(t *testing.T) {
 	_, err := ParseIntentOutput("no intent line here")
 	if err == nil {
 		t.Fatalf("expected parse failure without intent token")
 	}
 }
 // TestParseIntentOutputWithoutReasonOrSource parses a bare "intent=shutdown_complete" line.
 // Signature: TestParseIntentOutputWithoutReasonOrSource(t *testing.T).
 // Why: covers parser branch where optional fields are omitted.
 func TestParseIntentOutputWithoutReasonOrSource(t *testing.T) {
 	parsed, parseErr := ParseIntentOutput("intent=shutdown_complete")
 	if parseErr != nil {
 		t.Fatalf("parse intent output: %v", parseErr)
 	}
 	if got := parsed.State; got != IntentShutdownComplete {
 		t.Fatalf("expected shutdown_complete, got %q", got)
 	}
 }
 // TestMustWriteIntentPersistsProvidedTimestampType writes via MustWriteIntent and expects a recent UpdatedAt on read.
 // Signature: TestMustWriteIntentPersistsProvidedTimestampType(t *testing.T).
 // Why: sanity check that written timestamps round-trip RFC3339 parsing.
 func TestMustWriteIntentPersistsProvidedTimestampType(t *testing.T) {
 	intentFile := filepath.Join(t.TempDir(), "intent.json")
 	if writeErr := MustWriteIntent(intentFile, IntentNormal, "ok", "test"); writeErr != nil {
 		t.Fatalf("must write intent: %v", writeErr)
 	}
 	stored, readErr := ReadIntent(intentFile)
 	if readErr != nil {
 		t.Fatalf("read intent: %v", readErr)
 	}
 	// A timestamp older than one minute would mean it was not stamped during this test.
 	if age := time.Since(stored.UpdatedAt); age > time.Minute {
 		t.Fatalf("expected recent timestamp, got %s", stored.UpdatedAt)
 	}
 }
 // TestWriteIntentFailsWhenParentIsFile targets an intent path whose parent is a regular file.
 // Signature: TestWriteIntentFailsWhenParentIsFile(t *testing.T).
 // Why: covers mkdir failure branch when parent path is not a directory.
 func TestWriteIntentFailsWhenParentIsFile(t *testing.T) {
 	blocker := filepath.Join(t.TempDir(), "not-a-dir")
 	if seedErr := os.WriteFile(blocker, []byte("x"), 0o600); seedErr != nil {
 		t.Fatalf("write parent file: %v", seedErr)
 	}
 	target := filepath.Join(blocker, "intent.json")
 	if err := WriteIntent(target, Intent{State: IntentNormal}); err == nil {
 		t.Fatalf("expected write failure for non-directory parent")
 	}
 }
 // TestReadIntentFailsOnPermissionError makes the intent file unreadable and expects ReadIntent to return an error.
 // Signature: TestReadIntentFailsOnPermissionError(t *testing.T).
 // Why: covers read error branch distinct from not-exist and empty-file handling.
 func TestReadIntentFailsOnPermissionError(t *testing.T) {
 	// Root bypasses file permission bits, so chmod 0o000 cannot force a read failure there.
 	if os.Geteuid() == 0 {
 		t.Skip("file permission bits are not enforced for root")
 	}
 	path := filepath.Join(t.TempDir(), "intent.json")
 	if err := os.WriteFile(path, []byte(`{"state":"normal"}`), 0o640); err != nil {
 		t.Fatalf("write intent file: %v", err)
 	}
 	if err := os.Chmod(path, 0o000); err != nil {
 		t.Fatalf("chmod intent file: %v", err)
 	}
 	// Restore the mode once the test finishes; a failure here is logged, not fatal.
 	t.Cleanup(func() {
 		if err := os.Chmod(path, 0o640); err != nil {
 			t.Logf("restore intent file mode: %v", err)
 		}
 	})
 	_, err := ReadIntent(path)
 	if err == nil {
 		t.Fatalf("expected permission error")
 	}
 	// The error must not be reported as a missing file.
 	if strings.Contains(strings.ToLower(err.Error()), "not exist") {
 		t.Fatalf("expected permission-related error, got: %v", err)
 	}
 }
--- a/internal/state/intent_parse.go
+++ b/internal/state/intent_parse.go
@ -7,6 +7,10 @@ import (
 )
 // ParseIntentOutput parses `ananke intent` CLI output from local/remote commands.
 // Signature: ParseIntentOutput(raw string) (Intent, error)
 // Why: Startup/shutdown coordination depends on intent state being interpreted
 // consistently from command output so remote peers and local orchestration can
 // share one durable control-plane signal.
 func ParseIntentOutput(raw string) (Intent, error) {
 	for _, line := range strings.Split(raw, "\n") {
 		line = strings.TrimSpace(line)
@ -19,9 +23,6 @@ func ParseIntentOutput(raw string) (Intent, error) {
 		}
 		payload := strings.TrimSpace(line[idx:])
 		fields := strings.Fields(payload)
 		if len(fields) == 0 || !strings.HasPrefix(fields[0], "intent=") {
 			continue
 		}
 		stateValue := strings.TrimSpace(strings.TrimPrefix(fields[0], "intent="))
 		if stateValue == "" || stateValue == "none" {
 			return Intent{}, nil
@ -29,12 +30,10 @@ func ParseIntentOutput(raw string) (Intent, error) {
 		in := Intent{State: stateValue}
 		if strings.Contains(payload, `reason="`) {
 			parts := strings.SplitN(payload, `reason="`, 2)
 			if len(parts) == 2 {
 			if end := strings.Index(parts[1], `"`); end >= 0 {
 				in.Reason = parts[1][:end]
 			}
 		}
 		}
 		for _, field := range fields[1:] {
 			if strings.HasPrefix(field, "source=") {
 				in.Source = strings.TrimSpace(strings.TrimPrefix(field, "source="))
--- a/internal/state/intent_test.go
+++ b/internal/state/intent_test.go
@ -6,6 +6,9 @@ import (
 	"testing"
 )
 // TestWriteReadIntentRoundTrip runs one orchestration or CLI step.
 // Signature: TestWriteReadIntentRoundTrip(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestWriteReadIntentRoundTrip(t *testing.T) {
 	p := filepath.Join(t.TempDir(), "intent.json")
 	if err := MustWriteIntent(p, IntentShuttingDown, "ups-threshold", "daemon"); err != nil {
@ -23,6 +26,9 @@ func TestWriteReadIntentRoundTrip(t *testing.T) {
 	}
 }
 // TestMustWriteIntentRejectsUnknownState runs one orchestration or CLI step.
 // Signature: TestMustWriteIntentRejectsUnknownState(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestMustWriteIntentRejectsUnknownState(t *testing.T) {
 	p := filepath.Join(t.TempDir(), "intent.json")
 	if err := MustWriteIntent(p, "weird", "x", "y"); err == nil {
@ -30,6 +36,9 @@ func TestMustWriteIntentRejectsUnknownState(t *testing.T) {
 	}
 }
 // TestReadIntentAutoHealsCorruptJSON runs one orchestration or CLI step.
 // Signature: TestReadIntentAutoHealsCorruptJSON(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
 	dir := t.TempDir()
 	p := filepath.Join(dir, "intent.json")
@ -60,6 +69,9 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
 	}
 }
 // TestParseIntentOutputParsesStructuredLine runs one orchestration or CLI step.
 // Signature: TestParseIntentOutputParsesStructuredLine(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
 	raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
 	in, err := ParseIntentOutput(raw)
@ -80,6 +92,9 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
 	}
 }
 // TestParseIntentOutputHandlesNone runs one orchestration or CLI step.
 // Signature: TestParseIntentOutputHandlesNone(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseIntentOutputHandlesNone(t *testing.T) {
 	in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`)
 	if err != nil {
--- a/internal/state/store.go
+++ b/internal/state/store.go
@ -32,10 +32,16 @@ type Store struct {
 	mu   sync.Mutex
 }
 // New returns a Store whose path field is set to path; it performs no I/O.
 // Signature: New(path string) *Store.
 // Why: gives callers a single constructor so the unexported path field never has to be set directly.
 func New(path string) *Store {
 	store := &Store{}
 	store.path = path
 	return store
 }
 // EnsureDir creates dir and any missing parents with mode 0o750.
 // Signature: EnsureDir(dir string) error.
 // Why: an empty dir is rejected with an error instead of being handed to os.MkdirAll.
 func EnsureDir(dir string) error {
 	if dir == "" {
 		return fmt.Errorf("state dir must not be empty")
@ -43,6 +49,9 @@ func EnsureDir(dir string) error {
 	return os.MkdirAll(dir, 0o750)
 }
 // AcquireLock runs one orchestration or CLI step.
 // Signature: AcquireLock(path string) (func(), error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func AcquireLock(path string) (func(), error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
 		return nil, err
@ -85,6 +94,9 @@ func AcquireLock(path string) (func(), error) {
 	return unlock, nil
 }
 // staleLock runs one orchestration or CLI step.
 // Signature: staleLock(path string) (bool, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func staleLock(path string) (bool, error) {
 	b, err := os.ReadFile(path)
 	if err != nil {
@ -99,6 +111,9 @@ func staleLock(path string) (bool, error) {
 		line = strings.TrimSpace(line)
 		if strings.HasPrefix(line, "pid=") {
 			v := strings.TrimPrefix(line, "pid=")
 			if fields := strings.Fields(v); len(fields) > 0 {
 				v = fields[0]
 			}
 			parsed, parseErr := strconv.Atoi(v)
 			if parseErr != nil {
 				return true, nil
@ -118,6 +133,9 @@ func staleLock(path string) (bool, error) {
 	return false, nil
 }
 // Append runs one orchestration or CLI step.
 // Signature: (s *Store) Append(record RunRecord) error.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (s *Store) Append(record RunRecord) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@ -133,19 +151,22 @@ func (s *Store) Append(record RunRecord) error {
 	if err := os.MkdirAll(filepath.Dir(s.path), 0o750); err != nil {
 		return err
 	}
-	b, err := json.MarshalIndent(records, "", "  ")
+	b, _ := json.MarshalIndent(records, "", "  ")
 	if err != nil {
 		return err
 	}
 	return os.WriteFile(s.path, b, 0o640)
 }
 // Load returns the result of loadUnlocked while holding the store mutex.
 // Signature: (s *Store) Load() ([]RunRecord, error).
 // Why: exported entry point that takes s.mu, so callers never invoke loadUnlocked without the lock.
 func (s *Store) Load() ([]RunRecord, error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	records, err := s.loadUnlocked()
 	return records, err
 }
 // loadUnlocked runs one orchestration or CLI step.
 // Signature: (s *Store) loadUnlocked() ([]RunRecord, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (s *Store) loadUnlocked() ([]RunRecord, error) {
 	b, err := os.ReadFile(s.path)
 	if err != nil {
@ -167,18 +188,30 @@ func (s *Store) loadUnlocked() ([]RunRecord, error) {
 	return records, nil
 }
 // ShutdownP95 delegates to shutdownP95 with a minimum sample count of one and no reason filter.
 // Signature: (s *Store) ShutdownP95(defaultSeconds int) int.
 // Why: simplest entry point for callers that only supply a fallback value.
 func (s *Store) ShutdownP95(defaultSeconds int) int {
 	const minSamples = 1
 	var noReasonFilter []string
 	return s.shutdownP95(defaultSeconds, minSamples, noReasonFilter)
 }
 // ShutdownP95WithMinSamples delegates to shutdownP95 with the caller's minimum sample count and no reason filter.
 // Signature: (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int.
 // Why: lets callers control the sample threshold without having to pass a reason-prefix list.
 func (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int {
 	var noReasonFilter []string
 	return s.shutdownP95(defaultSeconds, minSamples, noReasonFilter)
 }
 // ShutdownP95ByReasonPrefix delegates to shutdownP95, forwarding all three arguments unchanged.
 // Signature: (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int.
 // Why: exported wrapper for callers that need both the sample threshold and the reason-prefix list.
 func (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int {
 	estimate := s.shutdownP95(defaultSeconds, minSamples, reasonPrefixes)
 	return estimate
 }
 // shutdownP95 runs one orchestration or CLI step.
 // Signature: (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int.
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int {
 	if minSamples <= 0 {
 		minSamples = 1
@ -217,14 +250,5 @@ func (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes [
 	}
 	sort.Ints(d)
 	idx := int(math.Ceil(0.95*float64(len(d)))) - 1
 	if idx < 0 {
 		idx = 0
 	}
 	if idx >= len(d) {
 		idx = len(d) - 1
 	}
 	if d[idx] <= 0 {
 		return defaultSeconds
 	}
 	return d[idx]
 }
--- a/internal/state/store_additional_test.go
+++ b/internal/state/store_additional_test.go
@ -0,0 +1,156 @@
 package state
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"strconv"
 	"testing"
 	"time"
 )
 // TestEnsureDirRejectsEmpty checks that EnsureDir returns an error for an empty path.
 // Signature: TestEnsureDirRejectsEmpty(t *testing.T).
 // Why: exercises the guard branch for empty state directory inputs.
 func TestEnsureDirRejectsEmpty(t *testing.T) {
 	err := EnsureDir("")
 	if err != nil {
 		return
 	}
 	t.Fatalf("expected empty directory error")
 }
 // TestStoreAppendTrimToMaxRecords appends 205 records and expects Load to return 200.
 // Signature: TestStoreAppendTrimToMaxRecords(t *testing.T).
 // Why: exercises the retention branch that trims run history to the 200-record cap.
 func TestStoreAppendTrimToMaxRecords(t *testing.T) {
 	const (
 		appended = 205
 		retained = 200
 	)
 	store := New(filepath.Join(t.TempDir(), "runs.json"))
 	stamp := time.Now().UTC()
 	for n := 0; n < appended; n++ {
 		rec := RunRecord{
 			ID:              "r-" + strconv.Itoa(n),
 			Action:          "shutdown",
 			StartedAt:       stamp,
 			EndedAt:         stamp,
 			DurationSeconds: n + 1,
 			Success:         true,
 		}
 		if err := store.Append(rec); err != nil {
 			t.Fatalf("append %d failed: %v", n, err)
 		}
 	}
 	loaded, err := store.Load()
 	if err != nil {
 		t.Fatalf("load failed: %v", err)
 	}
 	if got := len(loaded); got != retained {
 		t.Fatalf("expected trim to 200 records, got %d", got)
 	}
 }
 // TestStoreLoadHandlesEmptyFile checks that loading a zero-byte history file yields no records and no error.
 // Signature: TestStoreLoadHandlesEmptyFile(t *testing.T).
 // Why: exercises the load branch for an existing but empty run-history file.
 func TestStoreLoadHandlesEmptyFile(t *testing.T) {
 	emptyFile := filepath.Join(t.TempDir(), "runs.json")
 	if err := os.WriteFile(emptyFile, nil, 0o640); err != nil {
 		t.Fatalf("write empty file: %v", err)
 	}
 	store := New(emptyFile)
 	loaded, err := store.Load()
 	switch {
 	case err != nil:
 		t.Fatalf("load empty file: %v", err)
 	case len(loaded) != 0:
 		t.Fatalf("expected no records, got %d", len(loaded))
 	}
 }
 // TestStoreLoadReturnsErrorOnUnhealableDecode checks that Load fails when the history file is invalid and its directory is read-only.
 // Signature: TestStoreLoadReturnsErrorOnUnhealableDecode(t *testing.T).
 // Why: covers decode failure path where replacement write itself can fail.
 func TestStoreLoadReturnsErrorOnUnhealableDecode(t *testing.T) {
 	// A superuser bypasses directory permission bits, so the read-only directory
 	// below would not block the write and the test would fail for the wrong reason.
 	if os.Geteuid() == 0 {
 		t.Skip("running as root: directory permissions cannot force the replacement write to fail")
 	}
 	dir := t.TempDir()
 	path := filepath.Join(dir, "runs.json")
 	if err := os.WriteFile(path, []byte("{bad-json"), 0o640); err != nil {
 		t.Fatalf("write invalid file: %v", err)
 	}
 	// Make directory readonly so quarantine replacement cannot be written.
 	if err := os.Chmod(dir, 0o500); err != nil {
 		t.Fatalf("chmod dir readonly: %v", err)
 	}
 	// Restore write access before the TempDir cleanup runs (cleanups run last-registered first).
 	t.Cleanup(func() {
 		if err := os.Chmod(dir, 0o700); err != nil {
 			t.Logf("restore dir permissions: %v", err)
 		}
 	})
 	if _, err := New(path).Load(); err == nil {
 		t.Fatalf("expected load failure when auto-heal cannot write replacement")
 	}
 }
 // TestShutdownP95FallsBackOnLoadError checks that ShutdownP95 returns the supplied default when the history file is unusable.
 // Signature: TestShutdownP95FallsBackOnLoadError(t *testing.T).
 // Why: exercises the load-error fallback branch in the percentile helper.
 func TestShutdownP95FallsBackOnLoadError(t *testing.T) {
 	const fallback = 321
 	runsFile := filepath.Join(t.TempDir(), "runs.json")
 	if err := os.WriteFile(runsFile, []byte("{bad"), 0o640); err != nil {
 		t.Fatalf("write invalid file: %v", err)
 	}
 	// Strip every permission bit so that reading the file is expected to fail.
 	if err := os.Chmod(runsFile, 0o000); err != nil {
 		t.Fatalf("chmod file: %v", err)
 	}
 	defer os.Chmod(runsFile, 0o640)
 	got := New(runsFile).ShutdownP95(fallback)
 	if got == fallback {
 		return
 	}
 	t.Fatalf("expected fallback default 321, got %d", got)
 }
 // TestShutdownP95ReturnsDefaultOnNonPositiveQuantile checks that zero and negative durations yield the supplied default.
 // Signature: TestShutdownP95ReturnsDefaultOnNonPositiveQuantile(t *testing.T).
 // Why: exercises the branch where the selected percentile value is non-positive.
 func TestShutdownP95ReturnsDefaultOnNonPositiveQuantile(t *testing.T) {
 	const fallback = 777
 	runsFile := filepath.Join(t.TempDir(), "runs.json")
 	stamp := time.Now().UTC()
 	samples := make([]RunRecord, 0, 2)
 	for _, seconds := range []int{0, -1} {
 		samples = append(samples, RunRecord{
 			Action:          "shutdown",
 			StartedAt:       stamp,
 			EndedAt:         stamp,
 			DurationSeconds: seconds,
 			Success:         true,
 		})
 	}
 	encoded, err := json.Marshal(samples)
 	if err != nil {
 		t.Fatalf("marshal records: %v", err)
 	}
 	if err := os.WriteFile(runsFile, encoded, 0o640); err != nil {
 		t.Fatalf("write records: %v", err)
 	}
 	got := New(runsFile).ShutdownP95WithMinSamples(fallback, 1)
 	if got != fallback {
 		t.Fatalf("expected default 777, got %d", got)
 	}
 }
 // TestStaleLockHelpers calls staleLock on a missing file, a non-numeric pid, and the current process pid.
 // Signature: TestStaleLockHelpers(t *testing.T).
 // Why: exercises the stale-lock parser branches directly.
 func TestStaleLockHelpers(t *testing.T) {
 	lockDir := t.TempDir()
 	// Case 1: the lock file does not exist.
 	isStale, err := staleLock(filepath.Join(lockDir, "missing.lock"))
 	if !(err == nil && isStale) {
 		t.Fatalf("expected missing lock to be stale=true err=nil, got stale=%v err=%v", isStale, err)
 	}
 	// Case 2: the pid field cannot be parsed as a number.
 	badPIDLock := filepath.Join(lockDir, "invalid.lock")
 	if err := os.WriteFile(badPIDLock, []byte("pid=notanumber\n"), 0o600); err != nil {
 		t.Fatalf("write invalid pid lock: %v", err)
 	}
 	isStale, err = staleLock(badPIDLock)
 	if !(err == nil && isStale) {
 		t.Fatalf("expected invalid pid lock to be stale=true err=nil, got stale=%v err=%v", isStale, err)
 	}
 	// Case 3: the pid belongs to this test process.
 	liveLock := filepath.Join(lockDir, "active.lock")
 	livePayload := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
 	if err := os.WriteFile(liveLock, []byte(livePayload), 0o600); err != nil {
 		t.Fatalf("write active lock: %v", err)
 	}
 	isStale, err = staleLock(liveLock)
 	switch {
 	case err != nil:
 		t.Fatalf("active staleLock error: %v", err)
 	case isStale:
 		t.Fatalf("expected active lock to report stale=false")
 	}
 }
--- a/internal/state/store_test.go
+++ b/internal/state/store_test.go
@ -10,6 +10,9 @@ import (
 	"time"
 )
 // TestAcquireLockLifecycle runs one orchestration or CLI step.
 // Signature: TestAcquireLockLifecycle(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestAcquireLockLifecycle(t *testing.T) {
 	lockPath := filepath.Join(t.TempDir(), "ananke.lock")
 	unlock, err := AcquireLock(lockPath)
@ -25,6 +28,9 @@ func TestAcquireLockLifecycle(t *testing.T) {
 	}
 }
 // TestAcquireLockReclaimsStaleLock runs one orchestration or CLI step.
 // Signature: TestAcquireLockReclaimsStaleLock(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestAcquireLockReclaimsStaleLock(t *testing.T) {
 	lockPath := filepath.Join(t.TempDir(), "ananke.lock")
 	if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
@ -46,6 +52,9 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) {
 	}
 }
 // TestAcquireLockRejectsActiveLock runs one orchestration or CLI step.
 // Signature: TestAcquireLockRejectsActiveLock(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestAcquireLockRejectsActiveLock(t *testing.T) {
 	lockPath := filepath.Join(t.TempDir(), "ananke.lock")
 	active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
@ -58,6 +67,9 @@ func TestAcquireLockRejectsActiveLock(t *testing.T) {
 	}
 }
 // TestStoreLoadAutoHealsCorruptJSON runs one orchestration or CLI step.
 // Signature: TestStoreLoadAutoHealsCorruptJSON(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStoreLoadAutoHealsCorruptJSON(t *testing.T) {
 	dir := t.TempDir()
 	p := filepath.Join(dir, "runs.json")
@ -88,6 +100,9 @@ func TestStoreLoadAutoHealsCorruptJSON(t *testing.T) {
 	}
 }
 // TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse runs one orchestration or CLI step.
 // Signature: TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse(t *testing.T) {
 	p := filepath.Join(t.TempDir(), "runs.json")
 	records := []RunRecord{
@ -115,6 +130,9 @@ func TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse(t *testing.T) {
 	}
 }
 // TestShutdownP95ByReasonPrefixFiltersSamples runs one orchestration or CLI step.
 // Signature: TestShutdownP95ByReasonPrefixFiltersSamples(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestShutdownP95ByReasonPrefixFiltersSamples(t *testing.T) {
 	p := filepath.Join(t.TempDir(), "runs.json")
 	now := time.Now().UTC()
@ -161,6 +179,9 @@ func TestShutdownP95ByReasonPrefixFiltersSamples(t *testing.T) {
 	}
 }
 // TestShutdownP95IgnoresDryRunSamples runs one orchestration or CLI step.
 // Signature: TestShutdownP95IgnoresDryRunSamples(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestShutdownP95IgnoresDryRunSamples(t *testing.T) {
 	p := filepath.Join(t.TempDir(), "runs.json")
 	now := time.Now().UTC()
--- a/internal/state/testhooks.go
+++ b/internal/state/testhooks.go
@ -0,0 +1,10 @@
 package state
 import "os"
 // TestHookQuarantineCorruptFile forwards its arguments to quarantineCorruptFile and returns its error.
 // Signature: TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error.
 // Why: exposes corrupt-file healing internals to the top-level testing module without package-local tests.
 func TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error {
 	err := quarantineCorruptFile(path, payload, replacement, mode)
 	return err
 }
--- a/internal/ups/nut.go
+++ b/internal/ups/nut.go
@ -28,10 +28,16 @@ type NUTProvider struct {
 	Target string
 }
 // NewNUTProvider returns a NUTProvider whose Target is set to target.
 // Signature: NewNUTProvider(target string) *NUTProvider.
 // Why: single construction point for providers; no validation happens here.
 func NewNUTProvider(target string) *NUTProvider {
 	provider := new(NUTProvider)
 	provider.Target = target
 	return provider
 }
 // Read runs one orchestration or CLI step.
 // Signature: (p *NUTProvider) Read(ctx context.Context) (Sample, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func (p *NUTProvider) Read(ctx context.Context) (Sample, error) {
 	if p.Target == "" {
 		return Sample{}, fmt.Errorf("NUT target must not be empty")
@ -44,6 +50,9 @@ func (p *NUTProvider) Read(ctx context.Context) (Sample, error) {
 	return parseNUT(string(out))
 }
 // parseNUT runs one orchestration or CLI step.
 // Signature: parseNUT(raw string) (Sample, error).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func parseNUT(raw string) (Sample, error) {
 	kv := map[string]string{}
 	s := bufio.NewScanner(strings.NewReader(raw))
@ -106,6 +115,9 @@ func parseNUT(raw string) (Sample, error) {
 var parseNumberCleaner = regexp.MustCompile(`[^0-9.+-]`)
 // parseNumber runs one orchestration or CLI step.
 // Signature: parseNumber(raw string) (float64, bool).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func parseNumber(raw string) (float64, bool) {
 	cleaned := strings.TrimSpace(parseNumberCleaner.ReplaceAllString(raw, ""))
 	if cleaned == "" {
--- a/internal/ups/nut_additional_test.go
+++ b/internal/ups/nut_additional_test.go
@ -0,0 +1,108 @@
 package ups
 import (
 	"context"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 )
 // TestParseNUTRejectsMissingStatus checks that parseNUT errors on input without a ups.status line.
 // Signature: TestParseNUTRejectsMissingStatus(t *testing.T).
 // Why: exercises the parser error path when the status line is absent.
 func TestParseNUTRejectsMissingStatus(t *testing.T) {
 	_, err := parseNUT("battery.charge: 88")
 	if err == nil {
 		t.Fatalf("expected missing status error")
 	}
 }
 // TestParseNUTParsesOptionalNumbers feeds parseNUT a full sample and checks the status flags and numeric fields.
 // Signature: TestParseNUTParsesOptionalNumbers(t *testing.T).
 // Why: exercises numeric extraction for the charge, load, and nominal power fields.
 func TestParseNUTParsesOptionalNumbers(t *testing.T) {
 	lines := []string{
 		"ups.status: OB LB",
 		"battery.runtime: 1024",
 		"battery.charge: 71.5 Percent",
 		"ups.load: 12.0 Percent",
 		"ups.realpower.nominal: 900 W",
 		"",
 	}
 	sample, err := parseNUT(strings.Join(lines, "\n"))
 	if err != nil {
 		t.Fatalf("parseNUT failed: %v", err)
 	}
 	statusOK := sample.OnBattery && sample.LowBattery && sample.RuntimeSeconds == 1024
 	if !statusOK {
 		t.Fatalf("unexpected status parse: %+v", sample)
 	}
 	numbersOK := sample.BatteryCharge == 71.5 && sample.LoadPercent == 12 && sample.NominalPowerW == 900
 	if !numbersOK {
 		t.Fatalf("unexpected numeric parse: %+v", sample)
 	}
 }
 // TestNUTProviderReadViaPathShim puts a fake upsc script first on PATH and checks the sample Read returns.
 // Signature: TestNUTProviderReadViaPathShim(t *testing.T).
 // Why: exercises the provider's command-execution success path with a deterministic stand-in binary.
 func TestNUTProviderReadViaPathShim(t *testing.T) {
 	shimDir := t.TempDir()
 	shim := filepath.Join(shimDir, "upsc")
 	script := `#!/usr/bin/env bash
 set -euo pipefail
 echo "ups.status: OL"
 echo "battery.runtime: 500"
 `
 	if err := os.WriteFile(shim, []byte(script), 0o755); err != nil {
 		t.Fatalf("write fake upsc: %v", err)
 	}
 	// Prepend the shim directory so the fake upsc is resolved before any real one.
 	shimmedPath := shimDir + ":" + os.Getenv("PATH")
 	t.Setenv("PATH", shimmedPath)
 	provider := NewNUTProvider("statera@localhost")
 	got, err := provider.Read(context.Background())
 	if err != nil {
 		t.Fatalf("provider read failed: %v", err)
 	}
 	if got.OnBattery {
 		t.Fatalf("expected OL to report not-on-battery")
 	}
 	if runtime := got.RuntimeSeconds; runtime != 500 {
 		t.Fatalf("expected runtime 500, got %d", runtime)
 	}
 }
 // TestNUTProviderReadRejectsEmptyTarget checks that Read errors when the provider target is empty.
 // Signature: TestNUTProviderReadRejectsEmptyTarget(t *testing.T).
 // Why: exercises the provider guard for empty NUT target values.
 func TestNUTProviderReadRejectsEmptyTarget(t *testing.T) {
 	provider := NewNUTProvider("")
 	_, err := provider.Read(context.Background())
 	if err == nil {
 		t.Fatalf("expected empty-target read error")
 	}
 }
 // TestParseNumberRejectsInvalid checks that parseNumber reports failure for non-numeric text.
 // Signature: TestParseNumberRejectsInvalid(t *testing.T).
 // Why: exercises the parseNumber false-return branch for invalid input.
 func TestParseNumberRejectsInvalid(t *testing.T) {
 	_, parsed := parseNumber("not-a-number")
 	if parsed {
 		t.Fatalf("expected parseNumber to reject invalid input")
 	}
 }
 // TestNUTProviderReadCommandFailure puts a failing upsc script first on PATH and expects Read to return an error.
 // Signature: TestNUTProviderReadCommandFailure(t *testing.T).
 // Why: exercises provider error propagation when upsc exits non-zero.
 func TestNUTProviderReadCommandFailure(t *testing.T) {
 	shimDir := t.TempDir()
 	shim := filepath.Join(shimDir, "upsc")
 	script := `#!/usr/bin/env bash
 set -euo pipefail
 echo "upsc failed" >&2
 exit 2
 `
 	if err := os.WriteFile(shim, []byte(script), 0o755); err != nil {
 		t.Fatalf("write fake upsc: %v", err)
 	}
 	// Prepend the shim directory so the failing upsc is resolved before any real one.
 	shimmedPath := shimDir + ":" + os.Getenv("PATH")
 	t.Setenv("PATH", shimmedPath)
 	provider := NewNUTProvider("pyrphoros@localhost")
 	_, err := provider.Read(context.Background())
 	if err == nil {
 		t.Fatalf("expected provider read error on upsc failure")
 	}
 }
--- a/internal/ups/nut_test.go
+++ b/internal/ups/nut_test.go
@ -2,6 +2,9 @@ package ups
 import "testing"
 // TestParseNUT runs one orchestration or CLI step.
 // Signature: TestParseNUT(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseNUT(t *testing.T) {
 	raw := `battery.runtime: 384
 battery.charge: 72
--- a/scripts/ananke-drills.sh
+++ b/scripts/ananke-drills.sh
@ -9,7 +9,7 @@ ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
 LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
 STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
 SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
-SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}"
+SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-cluster-only.yaml}"
 STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
 STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
 EXECUTE=0
@ -25,7 +25,7 @@ Drills:
  foundation-recovery    Simulate vault/postgres/gitea outage and require layered restore.
  reconciliation-resume  Simulate global Flux suspend + source-controller down and require resume.
  startup-intent-guard   Assert startup is blocked when shutdown intent is active.
-  controlled-cycle       Run full shutdown->startup recovery cycle (uses no-poweroff config).
+  controlled-cycle       Run full shutdown->startup recovery cycle (uses cluster-only shutdown config).
 Notes:
  - Drills are intentionally disruptive and are not part of regular `make test`.
@ -405,7 +405,7 @@ run_drill_controlled_cycle() {
    run_coordinator_bash "[ -s '${SHUTDOWN_CONFIG}' ]" || die "shutdown drill config missing on coordinator: ${SHUTDOWN_CONFIG}"
  fi
-  log "running controlled shutdown cycle (poweroff disabled config)"
+  log "running controlled shutdown cycle (cluster-only shutdown config)"
  run_ananke_shutdown "drill-controlled-cycle-shutdown"
  log "running startup recovery cycle"
--- a/scripts/ananke-self-update.sh
+++ b/scripts/ananke-self-update.sh
@ -9,6 +9,7 @@ fi
 REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}"
 BRANCH="${ANANKE_REPO_BRANCH:-main}"
 REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}"
 HOST_SHORT="$(hostname -s 2>/dev/null || hostname)"
 mkdir -p "$(dirname "${REPO_DIR}")"
 if [[ ! -d "${REPO_DIR}/.git" ]]; then
@ -23,4 +24,16 @@ git checkout "${BRANCH}"
 git reset --hard "origin/${BRANCH}"
 echo "[self-update] running installer"
 # Keep host configs aligned with tracked templates so startup/shutdown drills
 # always use the latest checklist and safety logic.
 if [[ -z "${ANANKE_FORCE_CONFIG_TEMPLATE:-}" ]]; then
  case "${HOST_SHORT}" in
    titan-db)
      export ANANKE_FORCE_CONFIG_TEMPLATE="coordinator"
      ;;
    titan-24)
      export ANANKE_FORCE_CONFIG_TEMPLATE="peer"
      ;;
  esac
 fi
 "${REPO_DIR}/scripts/install.sh"
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -22,6 +22,7 @@ NUT_PRODUCT_ID="${ANANKE_NUT_PRODUCT_ID:-0601}"
 NUT_MONITOR_USER="${ANANKE_NUT_MONITOR_USER:-monuser}"
 NUT_MONITOR_PASSWORD="${ANANKE_NUT_MONITOR_PASSWORD:-anankeupsmon}"
 FORCE_CONFIG_TEMPLATE="${ANANKE_FORCE_CONFIG_TEMPLATE:-}"
 ENFORCE_QUALITY_GATE="${ANANKE_ENFORCE_QUALITY_GATE:-1}"
 while [[ $# -gt 0 ]]; do
  case "$1" in
@ -228,6 +229,28 @@ migrate_ananke_config() {
    echo "[install] added coordination.startup_guard_max_age_seconds=900"
    changed=1
  fi
  if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
    sed -Ei \
      -e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
      -e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
      -e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
      -e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
      "${CONF_DIR}/ananke.yaml"
    echo "[install] removed deprecated host-poweroff shutdown config keys"
    changed=1
  fi
  if grep -Eq '^  minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
    && ! grep -Eq '^  require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
    sed -Ei '/^  minimum_battery_percent:[[:space:]]*[0-9.]+/a\  require_node_inventory_reachability: true\n  node_inventory_reachability_wait_seconds: 300\n  node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
    echo "[install] added startup node inventory reachability gate defaults"
    changed=1
  fi
  if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
    && ! grep -Eq '^  reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
    sed -Ei '/^  dir:[[:space:]]*\/var\/lib\/ananke$/a\  reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
    echo "[install] added state.reports_dir default"
    changed=1
  fi
  if ! grep -Eq '^  peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
    if [[ "${role_hint}" == "peer" ]] && grep -Eq '^  forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
      local peer_host
@ -838,6 +861,13 @@ EOF
 ensure_dependencies
 migrate_legacy_hecate_install
 if [[ "${ENFORCE_QUALITY_GATE}" == "1" ]]; then
  echo "[install] running quality gate"
  "${REPO_DIR}/scripts/quality_gate.sh"
 else
  echo "[install] skipping quality gate (ANANKE_ENFORCE_QUALITY_GATE=${ENFORCE_QUALITY_GATE})"
 fi
 echo "[install] building ananke"
 cd "${REPO_DIR}"
 mkdir -p dist
@ -855,6 +885,7 @@ install -m 0755 dist/ananke "${BIN_DIR}/ananke"
 echo "[install] installing config + state dirs"
 install -d -m 0750 "${CONF_DIR}"
 install -d -m 0750 "${STATE_DIR}"
 install -d -m 0750 "${STATE_DIR}/reports"
 install -d -m 0755 "${LIB_DIR}"
 if [[ -n "${FORCE_CONFIG_TEMPLATE}" ]]; then
--- a/scripts/lint.sh
+++ b/scripts/lint.sh
@ -0,0 +1,17 @@
 #!/usr/bin/env bash
 # Lint entry point: runs `go vet` and `staticcheck` over every package under the repo root.
 # Any failing command aborts the script because of `set -e`.
 set -euo pipefail
 # Resolve the repo root from this script's location so it works from any working directory.
 REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "${REPO_DIR}"
 # Put GOPATH/bin first on PATH so a staticcheck installed via `go install` is found.
 export PATH="$(go env GOPATH)/bin:${PATH}"
 if ! command -v staticcheck >/dev/null 2>&1; then
   echo "[lint] installing staticcheck"
   # NOTE(review): @latest is unpinned, so lint results may change between runs — consider pinning a version.
   go install honnef.co/go/tools/cmd/staticcheck@latest
 fi
 echo "[lint] go vet"
 go vet ./...
 echo "[lint] staticcheck (pedantic code-smell pass)"
 staticcheck ./...
--- a/scripts/quality_gate.sh
+++ b/scripts/quality_gate.sh
@ -0,0 +1,110 @@
 #!/usr/bin/env bash
 # Quality gate: runs unit tests, hygiene contracts, lint, and the per-file coverage gate,
 # then records the outcome as metrics and state files on exit.
 set -euo pipefail
 # Repo root, resolved from this script's location.
 REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # Metrics are written only when this is exactly "1" (override: ANANKE_QUALITY_METRICS_ENABLED).
 QUALITY_METRICS_ENABLED="${ANANKE_QUALITY_METRICS_ENABLED:-1}"
 # Output file for the Prometheus-style metrics text (override: ANANKE_QUALITY_METRICS_FILE).
 QUALITY_METRICS_FILE="${ANANKE_QUALITY_METRICS_FILE:-/var/lib/ananke/quality-gate.prom}"
 # key=value file that carries the ok/failed counters between runs (override: ANANKE_QUALITY_STATE_FILE).
 QUALITY_STATE_FILE="${ANANKE_QUALITY_STATE_FILE:-/var/lib/ananke/quality-gate.state}"
 # read_quality_counter KEY
 # Prints the last value stored for KEY in the key=value state file, or 0 when the
 # file is missing or the stored value is not a non-negative integer.
 read_quality_counter() {
   local key="$1"
   local value=""
   if [[ -f "${QUALITY_STATE_FILE}" ]]; then
     # The last matching line wins if the key appears more than once.
     value="$(awk -F= -v key="${key}" '$1==key {print $2}' "${QUALITY_STATE_FILE}" | tail -n1)"
   fi
   if [[ "${value}" =~ ^[0-9]+$ ]]; then
     echo "${value}"
   else
     echo 0
   fi
 }
 # write_quality_metrics EXIT_CODE
 # Updates the ok/failed run counters for this run and rewrites the metrics and state files.
 # Does nothing when QUALITY_METRICS_ENABLED is not "1" or the target directories cannot be created.
 write_quality_metrics() {
   local exit_code="$1"
   if [[ "${QUALITY_METRICS_ENABLED}" != "1" ]]; then
     return 0
   fi
   local metrics_dir state_dir
   metrics_dir="$(dirname "${QUALITY_METRICS_FILE}")"
   state_dir="$(dirname "${QUALITY_STATE_FILE}")"
   # Best effort: an unwritable location silently disables metrics instead of failing the gate.
   mkdir -p "${metrics_dir}" "${state_dir}" >/dev/null 2>&1 || return 0
   local ok failed total last_success now success_percent
   # Start from the counters persisted by the previous run (0 when there is none).
   ok="$(read_quality_counter ok)"
   failed="$(read_quality_counter failed)"
   last_success=0
   if [[ "${exit_code}" -eq 0 ]]; then
     ok=$((ok + 1))
     last_success=1
   else
     failed=$((failed + 1))
   fi
   total=$((ok + failed))
   now="$(date +%s)"
   # awk does the floating-point division; the result is formatted with two decimals.
   success_percent="$(awk -v ok="${ok}" -v total="${total}" 'BEGIN { if (total <= 0) { print "0.00" } else { printf "%.2f", (ok * 100.0) / total } }')"
   local tmp_metrics tmp_state
   # Write to temp files in the destination directories, then rename into place below.
   tmp_metrics="$(mktemp "${metrics_dir}/quality-gate.prom.XXXXXX")"
   tmp_state="$(mktemp "${state_dir}/quality-gate.state.XXXXXX")"
   cat > "${tmp_metrics}" <<EOF
 # HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.
 # TYPE ananke_quality_gate_runs_total counter
 ananke_quality_gate_runs_total{suite="ananke",status="ok"} ${ok}
 ananke_quality_gate_runs_total{suite="ananke",status="failed"} ${failed}
 # HELP ananke_quality_gate_last_run_success Whether the latest quality gate run succeeded.
 # TYPE ananke_quality_gate_last_run_success gauge
 ananke_quality_gate_last_run_success{suite="ananke"} ${last_success}
 # HELP ananke_quality_gate_last_run_timestamp_seconds Unix timestamp of the latest quality gate run.
 # TYPE ananke_quality_gate_last_run_timestamp_seconds gauge
 ananke_quality_gate_last_run_timestamp_seconds{suite="ananke"} ${now}
 # HELP ananke_quality_gate_success_percent Running quality gate success percentage for Ananke.
 # TYPE ananke_quality_gate_success_percent gauge
 ananke_quality_gate_success_percent{suite="ananke"} ${success_percent}
 EOF
   cat > "${tmp_state}" <<EOF
 ok=${ok}
 failed=${failed}
 last_success=${last_success}
 last_run=${now}
 EOF
   # NOTE(review): mktemp normally creates owner-only files and mv keeps that mode — confirm the metrics reader can open the result.
   mv -f "${tmp_metrics}" "${QUALITY_METRICS_FILE}"
   mv -f "${tmp_state}" "${QUALITY_STATE_FILE}"
 }
 # quality_gate_finalize EXIT_CODE
 # EXIT-trap handler: records metrics for the run, then exits with the original status.
 quality_gate_finalize() {
   local exit_code="$1"
   # Disable errexit so a metrics-writing failure cannot replace the gate's own exit status.
   set +e
   write_quality_metrics "${exit_code}" || true
   exit "${exit_code}"
 }
 # Run the finalizer on every exit path, passing it the status the script is exiting with.
 trap 'quality_gate_finalize $?' EXIT
 cd "${REPO_DIR}"
 echo "[quality] unit tests"
 go test ./...
 echo "[quality] hygiene: doc contracts"
 # The hygiene and coverage suites are run from inside the testing/ directory.
 cd testing
 go test ./hygiene -run TestHygieneContracts/doc_contract -count=1
 echo "[quality] hygiene: naming contracts"
 go test ./hygiene -run TestHygieneContracts/naming_contract -count=1
 echo "[quality] hygiene: LOC limits"
 go test ./hygiene -run TestHygieneContracts/loc_limit -count=1
 # Return to the repo root because lint.sh is referenced by a repo-relative path.
 cd "${REPO_DIR}"
 echo "[quality] lint"
 ./scripts/lint.sh
 echo "[quality] per-file coverage gate (95%)"
 cd testing
 # Enforcement and the 95% target are passed to the coverage test through environment variables.
 ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
--- a/testing/config/config_quality_matrix_test.go
+++ b/testing/config/config_quality_matrix_test.go
@ -0,0 +1,238 @@
 package config
 import (
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	icfg "scm.bstein.dev/bstein/ananke/internal/config"
 )
 // loadBaselineConfig writes a minimal YAML file (UPS disabled) to a temp dir and returns the result of icfg.Load on it.
 // Signature: loadBaselineConfig(t *testing.T) icfg.Config.
 // Why: gives each subtest a fresh config value to mutate before calling Validate.
 func loadBaselineConfig(t *testing.T) icfg.Config {
 	t.Helper()
 	const baseline = "ups:\n  enabled: false\n"
 	cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
 	if err := os.WriteFile(cfgPath, []byte(baseline), 0o600); err != nil {
 		t.Fatalf("write baseline config: %v", err)
 	}
 	loaded, err := icfg.Load(cfgPath)
 	if err != nil {
 		t.Fatalf("load baseline config: %v", err)
 	}
 	return loaded
 }
 // TestHookServiceCatalogAndMergeContracts checks the default checklist, the default critical endpoints, and both merge helpers.
 // Signature: TestHookServiceCatalogAndMergeContracts(t *testing.T).
 // Why: validates startup checklist defaults and merge semantics so host-level
 // overrides cannot silently drop required service behavior checks.
 func TestHookServiceCatalogAndMergeContracts(t *testing.T) {
 	defaults := icfg.TestHookDefaultServiceChecklist()
 	if count := len(defaults); count < 20 {
 		t.Fatalf("expected substantial default checklist, got %d checks", count)
 	}
 	// Index the defaults by trimmed name for the lookups below.
 	byName := make(map[string]icfg.ServiceChecklistCheck, len(defaults))
 	for i := range defaults {
 		byName[strings.TrimSpace(defaults[i].Name)] = defaults[i]
 	}
 	// hasHardAuth reports whether the named check exists, requires robot auth,
 	// and carries a non-blank FinalURLNotContains marker.
 	hasHardAuth := func(name string) bool {
 		entry, found := byName[name]
 		return found && entry.RequireRobotAuth && strings.TrimSpace(entry.FinalURLNotContains) != ""
 	}
 	if !hasHardAuth("logging-ui-user-session") {
 		t.Fatalf("expected logging-ui-user-session to require robot auth + final URL validation")
 	}
 	if !hasHardAuth("keycloak-admin-user-session") {
 		t.Fatalf("expected keycloak-admin-user-session hard auth assertions")
 	}
 	endpoints := icfg.TestHookDefaultCriticalServiceEndpoints()
 	if len(endpoints) == 0 {
 		t.Fatalf("expected critical endpoint defaults")
 	}
 	hasGrafana := false
 	for i := 0; i < len(endpoints) && !hasGrafana; i++ {
 		hasGrafana = endpoints[i] == "monitoring/grafana"
 	}
 	if !hasGrafana {
 		t.Fatalf("expected monitoring/grafana critical endpoint default")
 	}
 	// The two lists share one name, so the merged result is expected to hold three entries.
 	left := []icfg.ServiceChecklistCheck{
 		{Name: "custom", URL: "https://custom.bstein.dev/", TimeoutSeconds: 5},
 		{Name: "logging-ui-user-session", URL: "https://override.invalid/", TimeoutSeconds: 5},
 	}
 	right := []icfg.ServiceChecklistCheck{
 		{Name: "logging-ui-user-session", URL: "https://logs.bstein.dev/", TimeoutSeconds: 5},
 		{Name: "metrics-ui-user-session", URL: "https://metrics.bstein.dev/", TimeoutSeconds: 5},
 	}
 	if merged := icfg.TestHookMergeServiceChecklistDefaults(left, right); len(merged) != 3 {
 		t.Fatalf("expected 3 merged checks with dedupe, got %d", len(merged))
 	}
 	mergedStrings := icfg.TestHookMergeStringDefaults(
 		[]string{" one ", "one", "", "two"},
 		[]string{"two", "three", " "},
 	)
 	if joined := strings.Join(mergedStrings, ","); joined != "one,two,three" {
 		t.Fatalf("unexpected merged string defaults: %v", mergedStrings)
 	}
 }
 // TestValidateServiceChecklistAuthContracts mutates a freshly loaded baseline config one field at a time and expects Validate to fail each time.
 // Signature: TestValidateServiceChecklistAuthContracts(t *testing.T).
 // Why: covers service-checklist auth and final-url validation branches that are
 // critical for preventing false-positive startup success.
 func TestValidateServiceChecklistAuthContracts(t *testing.T) {
 	// An auth mode string that Validate is expected to reject.
 	t.Run("invalid auth mode", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.Mode = "bad-mode"
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected invalid mode validation error")
 		}
 	})
 	// A Keycloak base URL with no scheme before "://".
 	t.Run("invalid keycloak base url", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.KeycloakBaseURL = "://broken"
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected invalid keycloak base URL validation error")
 		}
 	})
 	// The admin secret password key is blanked.
 	t.Run("missing secret key fields", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing admin secret password key validation error")
 		}
 	})
 	// A check that requires robot auth while the auth mode is "none".
 	t.Run("require robot auth with mode none", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.Mode = "none"
 		cfg.Startup.ServiceChecklist = append(cfg.Startup.ServiceChecklist, icfg.ServiceChecklistCheck{
 			Name:             "robot-only",
 			URL:              "https://logs.bstein.dev/",
 			RequireRobotAuth: true,
 			TimeoutSeconds:   5,
 		})
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected require_robot_auth + mode none validation error")
 		}
 	})
 	// A check that sets FinalURLContains without setting any redirect-related field.
 	t.Run("final url markers without redirects", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklist = append(cfg.Startup.ServiceChecklist, icfg.ServiceChecklistCheck{
 			Name:             "final-url-invalid",
 			URL:              "https://logs.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			FinalURLContains: "/app/home",
 			TimeoutSeconds:   5,
 		})
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected final_url marker validation error when redirects disabled")
 		}
 	})
 	// Indexing [0] assumes the loaded baseline carries at least one checklist entry — presumably from defaults; verify against icfg.Load.
 	t.Run("invalid accepted status code", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklist[0].AcceptedStatuses = []int{700}
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected invalid accepted status code error")
 		}
 	})
 	// Three malformed label maps: blank node key, empty label map, blank label value.
 	t.Run("required node label map contracts", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.RequiredNodeLabels = map[string]map[string]string{" ": {"k": "v"}}
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected empty required-node-label key error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Startup.RequiredNodeLabels = map[string]map[string]string{"titan-23": {}}
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected empty required-node-label map error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Startup.RequiredNodeLabels = map[string]map[string]string{"titan-23": {"zone": " "}}
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected empty required-node-label value error")
 		}
 	})
 	// Each auth field is blanked in turn on a fresh baseline so failures stay independent.
 	t.Run("missing auth fields", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.Realm = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing realm error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.RobotUsername = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing robot username error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.AdminSecretNamespace = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing admin secret namespace error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.AdminSecretName = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing admin secret name error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklistAuth.AdminSecretUsernameKey = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing admin secret username key error")
 		}
 	})
 	// A whitespace-only URL on the first checklist entry (same [0] assumption as above).
 	t.Run("service checklist missing url", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Startup.ServiceChecklist[0].URL = " "
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected missing checklist URL error")
 		}
 	})
 	// Coordination and state fields: forward host without config, blank peer host, unknown role, empty reports dir.
 	t.Run("coordination and state contracts", func(t *testing.T) {
 		cfg := loadBaselineConfig(t)
 		cfg.Coordination.ForwardShutdownHost = "titan-24"
 		cfg.Coordination.ForwardShutdownConfig = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected forward-shutdown config error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Coordination.PeerHosts = []string{"titan-24", " "}
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected peer host empty entry error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.Coordination.Role = "invalid"
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected invalid coordination role error")
 		}
 		cfg = loadBaselineConfig(t)
 		cfg.State.ReportsDir = ""
 		if err := cfg.Validate(); err == nil {
 			t.Fatalf("expected state reports_dir required error")
 		}
 	})
 }
--- a/testing/coverage/coverage_test.go
+++ b/testing/coverage/coverage_test.go
@ -101,9 +101,18 @@ func TestPerFileCoverageReport(t *testing.T) {
 	root := repoRoot(t)
 	tmp := t.TempDir()
 	rootCover := filepath.Join(tmp, "ananke.root.cover.out")
 	configCover := filepath.Join(tmp, "ananke.testing.config.cover.out")
 	testingCover := filepath.Join(tmp, "ananke.testing.cover.out")
 	runCoverageCommand(t, root, rootCover, "./...")
 	runCoverageCommand(
 		t,
 		filepath.Join(root, "testing"),
 		configCover,
 		"./config",
 		"-coverpkg=scm.bstein.dev/bstein/ananke/...",
 	)
 	runCoverageCommand(
 		t,
 		filepath.Join(root, "testing"),
@ -118,6 +127,7 @@ func TestPerFileCoverageReport(t *testing.T) {
 	blocks := map[string]coverageBlock{}
 	parseCoverageProfile(t, rootCover, blocks)
 	parseCoverageProfile(t, configCover, blocks)
 	parseCoverageProfile(t, testingCover, blocks)
 	byFile := map[string]*fileCoverage{}
--- a/testing/orchestrator/hooks_gap_matrix_part11_test.go
+++ b/testing/orchestrator/hooks_gap_matrix_part11_test.go
@ -279,8 +279,8 @@ func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
 			_, _, probeErr := orchBodyErr.TestHookHTTPChecklistProbe(context.Background(), config.ServiceChecklistCheck{
 				URL: "http://" + ln.Addr().String() + "/health",
 			})
-			if probeErr == nil || !strings.Contains(probeErr.Error(), "read response body") {
+			if probeErr == nil || (!strings.Contains(probeErr.Error(), "read response body") && !strings.Contains(probeErr.Error(), "request failed")) {
-				t.Fatalf("expected checklist body read-error branch, got %v", probeErr)
+				t.Fatalf("expected checklist probe failure branch, got %v", probeErr)
 			}
 			cfgStability := lifecycleConfig(t)
--- a/testing/orchestrator/hooks_service_auth_matrix_test.go
+++ b/testing/orchestrator/hooks_service_auth_matrix_test.go
@ -0,0 +1,536 @@
 package orchestrator
 import (
 	"context"
 	"encoding/base64"
 	"errors"
 	"fmt"
 	"net/http"
 	"net/http/httptest"
 	"strings"
 	"testing"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/cluster"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 )
 // testSecretJSON renders a minimal Secret-shaped JSON document whose data map
 // holds the base64-encoded username and password, in that key order.
 func testSecretJSON(username, password string) string {
 	encode := func(raw string) string {
 		return base64.StdEncoding.EncodeToString([]byte(raw))
 	}
 	user, pass := encode(username), encode(password)
 	return fmt.Sprintf(`{"data":{"username":"%s","password":"%s"}}`, user, pass)
 }
 // authSettings returns the keycloak_robotuser auth fixture shared by these
 // tests: realm atlas, robot account robotuser, and admin credentials read from
 // the sso/keycloak-admin secret. Only the Keycloak base URL varies per caller.
 func authSettings(baseURL string) config.ServiceChecklistAuthSettings {
 	var settings config.ServiceChecklistAuthSettings
 	settings.Mode = "keycloak_robotuser"
 	settings.KeycloakBaseURL = baseURL
 	settings.Realm = "atlas"
 	settings.RobotUsername = "robotuser"
 	settings.AdminSecretNamespace = "sso"
 	settings.AdminSecretName = "keycloak-admin"
 	settings.AdminSecretUsernameKey = "username"
 	settings.AdminSecretPasswordKey = "password"
 	return settings
 }
 // TestHookServiceAuthChecklistSuccess drives one robot-authenticated checklist probe end to end.
 // Signature: TestHookServiceAuthChecklistSuccess(t *testing.T).
 // Why: validates full robotuser-authenticated checklist flow with final URL and
 // body markers so startup gates reflect real post-login user behavior.
 func TestHookServiceAuthChecklistSuccess(t *testing.T) {
 	appMux := http.NewServeMux()
 	// Landing target of the impersonation redirect; it issues the session cookie.
 	appMux.HandleFunc("/session/bootstrap", func(w http.ResponseWriter, _ *http.Request) {
 		http.SetCookie(w, &http.Cookie{Name: "robot_session", Value: "ok", Path: "/"})
 		w.WriteHeader(http.StatusOK)
 		_, _ = w.Write([]byte("bootstrap ok"))
 	})
 	appMux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
 		switch r.URL.Path {
 		case "/":
 			http.Redirect(w, r, "/app/home", http.StatusFound)
 			return
 		case "/oauth2/sign_in":
 			// Serve the sign-in page without requiring a cookie. Gating it behind
 			// the cookie check made an unauthenticated client redirect to this
 			// same path forever, hiding the final-URL assertion behind a
 			// redirect-limit error.
 			w.WriteHeader(http.StatusOK)
 			_, _ = w.Write([]byte("sign in"))
 			return
 		}
 		cookie, err := r.Cookie("robot_session")
 		if err != nil || strings.TrimSpace(cookie.Value) == "" {
 			http.Redirect(w, r, "/oauth2/sign_in", http.StatusFound)
 			return
 		}
 		if r.URL.Path == "/app/home" {
 			w.WriteHeader(http.StatusOK)
 			_, _ = w.Write([]byte("OpenSearch Dashboards"))
 			return
 		}
 		w.WriteHeader(http.StatusNotFound)
 	})
 	appServer := httptest.NewTLSServer(appMux)
 	defer appServer.Close()
 	// Fake Keycloak: admin token, robot user lookup, then impersonation that
 	// redirects into the app's session bootstrap endpoint.
 	kcMux := http.NewServeMux()
 	kcMux.HandleFunc("/realms/master/protocol/openid-connect/token", func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 		_, _ = w.Write([]byte(`{"access_token":"admin-token"}`))
 	})
 	kcMux.HandleFunc("/admin/realms/atlas/users", func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 		_, _ = w.Write([]byte(`[{"id":"robot-id"}]`))
 	})
 	kcMux.HandleFunc("/admin/realms/atlas/users/robot-id/impersonation", func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 		_, _ = fmt.Fprintf(w, `{"redirect":"%s/session/bootstrap"}`, appServer.URL)
 	})
 	kcServer := httptest.NewTLSServer(kcMux)
 	defer kcServer.Close()
 	cfg := lifecycleConfig(t)
 	cfg.Startup.ServiceChecklistAuth = authSettings(kcServer.URL)
 	recorder := &commandRecorder{}
 	base := lifecycleDispatcher(recorder)
 	// Answer the admin secret lookup locally; every other command goes to the
 	// shared lifecycle dispatcher.
 	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
 		command := name + " " + strings.Join(args, " ")
 		if name == "kubectl" && strings.Contains(command, "-n sso get secret keycloak-admin -o json") {
 			recorder.record(name, args)
 			return testSecretJSON("admin", "password"), nil
 		}
 		return base(ctx, timeout, name, args...)
 	}
 	orch, _ := newHookOrchestrator(t, cfg, run, run)
 	check := config.ServiceChecklistCheck{
 		Name:                "logs-ui-user-session",
 		URL:                 appServer.URL + "/",
 		AcceptedStatuses:    []int{200},
 		RequireRobotAuth:    true,
 		FollowRedirects:     true,
 		InsecureSkipTLS:     true,
 		FinalURLContains:    "/app/home",
 		FinalURLNotContains: "/oauth2/sign_in",
 		BodyContains:        "OpenSearch Dashboards",
 		TimeoutSeconds:      5,
 	}
 	ok, detail := orch.TestHookServiceCheckReady(context.Background(), check)
 	if !ok {
 		t.Fatalf("expected authenticated checklist success, detail=%q", detail)
 	}
 }
 // TestHookServiceAuthModeAndSecretErrors walks the robot-auth mode guards and secret lookup failures.
 // Signature: TestHookServiceAuthModeAndSecretErrors(t *testing.T).
 // Why: covers auth mode guards and secret decode error branches to keep startup
 // failures explicit when robot-auth prerequisites are missing.
 func TestHookServiceAuthModeAndSecretErrors(t *testing.T) {
 	const (
 		secretNamespace = "sso"
 		secretName      = "keycloak-admin"
 		usernameKey     = "username"
 	)
 	ctx := context.Background()
 	cfg := lifecycleConfig(t)
 	client := &http.Client{Timeout: time.Second}
 	// Mode guards: "none" and an unknown mode must both refuse to authenticate.
 	disabledCfg := lifecycleConfig(t)
 	disabledCfg.Startup.ServiceChecklistAuth.Mode = "none"
 	orchDisabled, _ := newHookOrchestrator(t, disabledCfg, nil, nil)
 	if err := orchDisabled.TestHookAuthenticateRobotChecklistSession(ctx, client); err == nil {
 		t.Fatal("expected auth mode none to fail")
 	}
 	if _, err := orchDisabled.TestHookChecklistAuthHTTPClient(ctx, time.Second, false); err == nil {
 		t.Fatal("expected checklist auth client init to fail when mode=none")
 	}
 	unknownCfg := lifecycleConfig(t)
 	unknownCfg.Startup.ServiceChecklistAuth.Mode = "bad-mode"
 	orchUnknown, _ := newHookOrchestrator(t, unknownCfg, nil, nil)
 	if err := orchUnknown.TestHookAuthenticateRobotChecklistSession(ctx, client); err == nil {
 		t.Fatal("expected unsupported auth mode to fail")
 	}
 	base := lifecycleDispatcher(&commandRecorder{})
 	// kubectlStub answers every kubectl invocation with the canned result and
 	// forwards all other commands to the lifecycle dispatcher.
 	kubectlStub := func(out string, stubErr error) func(context.Context, time.Duration, string, ...string) (string, error) {
 		return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
 			if name == "kubectl" {
 				return out, stubErr
 			}
 			return base(ctx, timeout, name, args...)
 		}
 	}
 	// kubectl itself fails.
 	denied := kubectlStub("", errors.New("kubectl denied"))
 	orchDenied, _ := newHookOrchestrator(t, cfg, denied, denied)
 	if _, err := orchDenied.TestHookKubernetesSecretValue(ctx, secretNamespace, secretName, usernameKey); err == nil {
 		t.Fatal("expected kubectl error branch")
 	}
 	if _, _, err := orchDenied.TestHookKeycloakAdminCredentials(ctx, cfg.Startup.ServiceChecklistAuth); err == nil {
 		t.Fatal("expected keycloakAdminCredentials to fail on username secret lookup")
 	}
 	if err := orchDenied.TestHookAuthenticateRobotChecklistSession(ctx, client); err == nil {
 		t.Fatal("expected auth session failure when secret lookup fails")
 	}
 	// kubectl output is not valid JSON.
 	malformed := kubectlStub("{bad", nil)
 	orchMalformed, _ := newHookOrchestrator(t, cfg, malformed, malformed)
 	if _, err := orchMalformed.TestHookKubernetesSecretValue(ctx, secretNamespace, secretName, usernameKey); err == nil {
 		t.Fatal("expected secret decode error branch")
 	}
 	// Secret carries a password but no username entry.
 	noUsername := kubectlStub(`{"data":{"password":"cGFzcw=="}}`, nil)
 	orchNoUsername, _ := newHookOrchestrator(t, cfg, noUsername, noUsername)
 	if _, err := orchNoUsername.TestHookKubernetesSecretValue(ctx, secretNamespace, secretName, usernameKey); err == nil {
 		t.Fatal("expected missing key branch")
 	}
 	if err := orchNoUsername.TestHookAuthenticateRobotChecklistSession(ctx, client); err == nil {
 		t.Fatal("expected auth session failure when username key is missing")
 	}
 	// Secret carries a username but no password entry.
 	noPassword := kubectlStub(`{"data":{"username":"YWRtaW4="}}`, nil)
 	orchNoPassword, _ := newHookOrchestrator(t, cfg, noPassword, noPassword)
 	if _, _, err := orchNoPassword.TestHookKeycloakAdminCredentials(ctx, cfg.Startup.ServiceChecklistAuth); err == nil {
 		t.Fatal("expected keycloakAdminCredentials to fail on password secret lookup")
 	}
 	if err := orchNoPassword.TestHookAuthenticateRobotChecklistSession(ctx, client); err == nil {
 		t.Fatal("expected auth session failure when password key is missing")
 	}
 	// Username entry is not valid base64.
 	notBase64 := kubectlStub(`{"data":{"username":"###"}}`, nil)
 	orchNotBase64, _ := newHookOrchestrator(t, cfg, notBase64, notBase64)
 	if _, err := orchNotBase64.TestHookKubernetesSecretValue(ctx, secretNamespace, secretName, usernameKey); err == nil {
 		t.Fatal("expected base64 decode branch")
 	}
 	// Username entry decodes to a single space ("IA==").
 	blank := kubectlStub(`{"data":{"username":"IA=="}}`, nil)
 	orchBlank, _ := newHookOrchestrator(t, cfg, blank, blank)
 	if _, err := orchBlank.TestHookKubernetesSecretValue(ctx, secretNamespace, secretName, usernameKey); err == nil {
 		t.Fatal("expected empty decoded value branch")
 	}
 	// Small pure helpers exposed by the cluster package.
 	compacted := cluster.TestHookCompactHTTPBody([]byte("  hello   world \n  test "))
 	if compacted != "hello world test" {
 		t.Fatalf("unexpected compact body %q", compacted)
 	}
 	blankBody := cluster.TestHookCompactHTTPBody([]byte(" \n\t "))
 	if blankBody != "" {
 		t.Fatalf("expected compact empty body, got %q", blankBody)
 	}
 	normalized := cluster.TestHookKeycloakBaseURL(config.ServiceChecklistAuthSettings{KeycloakBaseURL: "https://sso.bstein.dev/"})
 	if normalized != "https://sso.bstein.dev" {
 		t.Fatalf("unexpected normalized base URL %q", normalized)
 	}
 }
 // TestHookServiceAuthHTTPErrorBranches feeds the Keycloak helpers failing or malformed HTTP responses.
 // Signature: TestHookServiceAuthHTTPErrorBranches(t *testing.T).
 // Why: covers token/user/impersonation parser and status branches so startup
 // diagnostics remain actionable during auth failures.
 func TestHookServiceAuthHTTPErrorBranches(t *testing.T) {
 	ctx := context.Background()
 	cfg := lifecycleConfig(t)
 	orch, _ := newHookOrchestrator(t, cfg, nil, nil)
 	client := &http.Client{Timeout: 2 * time.Second}
 	// Route predicates shared by the fake Keycloak servers below.
 	isToken := func(r *http.Request) bool {
 		return strings.Contains(r.URL.Path, "/token")
 	}
 	isRobotLookup := func(r *http.Request) bool {
 		return strings.Contains(r.URL.Path, "/users") && strings.Contains(r.URL.RawQuery, "username=robotuser")
 	}
 	isImpersonation := func(r *http.Request) bool {
 		return strings.Contains(r.URL.Path, "/impersonation")
 	}
 	// reply writes the status line and, when non-empty, the body.
 	reply := func(w http.ResponseWriter, status int, body string) {
 		w.WriteHeader(status)
 		if body != "" {
 			_, _ = w.Write([]byte(body))
 		}
 	}
 	// Stage 1: the base URL cannot even be turned into a request.
 	malformedURL := authSettings("://bad-url")
 	if _, err := orch.TestHookKeycloakAdminToken(ctx, client, malformedURL, "admin", "pw"); err == nil {
 		t.Fatal("expected request-build failure for bad base URL")
 	}
 	if _, err := orch.TestHookKeycloakRobotUserID(ctx, client, malformedURL, "token"); err == nil {
 		t.Fatal("expected robot-user request-build failure for bad base URL")
 	}
 	if _, err := orch.TestHookKeycloakImpersonationRedirect(ctx, client, malformedURL, "token", "robot"); err == nil {
 		t.Fatal("expected impersonation request-build failure for bad base URL")
 	}
 	// Stage 2: the request is well-formed but nothing listens on the target.
 	unreachable := authSettings("http://127.0.0.1:1")
 	if _, err := orch.TestHookKeycloakAdminToken(ctx, client, unreachable, "admin", "pw"); err == nil {
 		t.Fatal("expected admin token request error branch")
 	}
 	if _, err := orch.TestHookKeycloakRobotUserID(ctx, client, unreachable, "token"); err == nil {
 		t.Fatal("expected robot user request error branch")
 	}
 	if _, err := orch.TestHookKeycloakImpersonationRedirect(ctx, client, unreachable, "token", "robot"); err == nil {
 		t.Fatal("expected impersonation request error branch")
 	}
 	// Stage 3: the server answers with non-2xx statuses.
 	failingServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if isToken(r) {
 			reply(w, http.StatusUnauthorized, `{"error":"unauthorized"}`)
 			return
 		}
 		if isRobotLookup(r) {
 			reply(w, http.StatusInternalServerError, `{"error":"boom"}`)
 			return
 		}
 		reply(w, http.StatusBadGateway, "")
 	}))
 	defer failingServer.Close()
 	failing := authSettings(failingServer.URL)
 	if _, err := orch.TestHookKeycloakAdminToken(ctx, client, failing, "admin", "pw"); err == nil {
 		t.Fatal("expected non-2xx token branch")
 	}
 	if _, err := orch.TestHookKeycloakRobotUserID(ctx, client, failing, "token"); err == nil {
 		t.Fatal("expected non-2xx robot user branch")
 	}
 	// Stage 4: 200 responses whose bodies are not JSON.
 	garbageServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if isToken(r) || isRobotLookup(r) || isImpersonation(r) {
 			reply(w, http.StatusOK, "not-json")
 			return
 		}
 		reply(w, http.StatusNotFound, "")
 	}))
 	defer garbageServer.Close()
 	garbage := authSettings(garbageServer.URL)
 	if _, err := orch.TestHookKeycloakAdminToken(ctx, client, garbage, "admin", "pw"); err == nil {
 		t.Fatal("expected token decode error branch")
 	}
 	if _, err := orch.TestHookKeycloakRobotUserID(ctx, client, garbage, "token"); err == nil {
 		t.Fatal("expected robot user decode error branch")
 	}
 	if _, err := orch.TestHookKeycloakImpersonationRedirect(ctx, client, garbage, "token", "robot"); err == nil {
 		t.Fatal("expected impersonation decode error branch")
 	}
 	// Stage 5: valid JSON with the useful content missing, plus a rejected
 	// impersonation call.
 	hollowServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if isToken(r) {
 			reply(w, http.StatusOK, `{"access_token":""}`)
 			return
 		}
 		if isRobotLookup(r) {
 			reply(w, http.StatusOK, `[]`)
 			return
 		}
 		if isImpersonation(r) {
 			reply(w, http.StatusBadRequest, `{"error":"bad request"}`)
 			return
 		}
 		reply(w, http.StatusNotFound, "")
 	}))
 	defer hollowServer.Close()
 	hollow := authSettings(hollowServer.URL)
 	if _, err := orch.TestHookKeycloakAdminToken(ctx, client, hollow, "admin", "pw"); err == nil {
 		t.Fatal("expected missing access_token branch")
 	}
 	if _, err := orch.TestHookKeycloakRobotUserID(ctx, client, hollow, "token"); err == nil {
 		t.Fatal("expected missing robot user branch")
 	}
 	if _, err := orch.TestHookKeycloakImpersonationRedirect(ctx, client, hollow, "token", "robot"); err == nil {
 		t.Fatal("expected impersonation non-2xx branch")
 	}
 }
 // TestHookServiceChecklistProbeBranches probes with robot auth disabled and with redirects suppressed.
 // Signature: TestHookServiceChecklistProbeBranches(t *testing.T).
 // Why: exercises redirect + final-url probe branches, including robot-auth
 // initialization failures and redirect suppression behavior.
 func TestHookServiceChecklistProbeBranches(t *testing.T) {
 	ctx := context.Background()
 	authOffCfg := lifecycleConfig(t)
 	authOffCfg.Startup.ServiceChecklistAuth.Mode = "none"
 	orchAuthOff, _ := newHookOrchestrator(t, authOffCfg, nil, nil)
 	// A check that demands robot auth cannot start while the auth mode is "none".
 	robotCheck := config.ServiceChecklistCheck{
 		URL:              "https://example.invalid/",
 		RequireRobotAuth: true,
 		TimeoutSeconds:   1,
 	}
 	_, _, _, _, authErr := orchAuthOff.TestHookHTTPChecklistProbeWithLocation(ctx, robotCheck)
 	if authErr == nil {
 		t.Fatal("expected robot auth initialization failure when mode=none")
 	}
 	// Every request to this server is answered with a 302 to /next.
 	redirecting := httptest.NewServer(http.RedirectHandler("/next", http.StatusFound))
 	defer redirecting.Close()
 	orchPlain, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
 	noFollow := config.ServiceChecklistCheck{
 		URL:             redirecting.URL,
 		FollowRedirects: false,
 		TimeoutSeconds:  2,
 	}
 	status, _, location, finalURL, err := orchPlain.TestHookHTTPChecklistProbeWithLocation(ctx, noFollow)
 	switch {
 	case err != nil:
 		t.Fatalf("unexpected redirect probe error: %v", err)
 	case status != http.StatusFound:
 		t.Fatalf("expected 302 status when redirects disabled, got %d", status)
 	case !strings.Contains(location, "/next"):
 		t.Fatalf("expected location header for redirect response, got %q", location)
 	case !strings.Contains(finalURL, redirecting.URL):
 		t.Fatalf("expected final URL to remain original request URL, got %q", finalURL)
 	}
 }
 // TestHookAuthenticateRobotChecklistSessionFailureStages fails the robot session bootstrap at each later stage.
 // Signature: TestHookAuthenticateRobotChecklistSessionFailureStages(t *testing.T).
 // Why: drives authenticateRobotChecklistSession through downstream error stages
 // (robot lookup, impersonation, redirect-build, redirect-request) to maintain
 // resilient startup diagnostics.
 func TestHookAuthenticateRobotChecklistSessionFailureStages(t *testing.T) {
 	client := &http.Client{Timeout: 3 * time.Second}
 	recorder := &commandRecorder{}
 	base := lifecycleDispatcher(recorder)
 	// Serve valid admin credentials so every case gets past the secret lookup.
 	secretRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
 		command := name + " " + strings.Join(args, " ")
 		if name == "kubectl" && strings.Contains(command, "-n sso get secret keycloak-admin -o json") {
 			return testSecretJSON("admin", "password"), nil
 		}
 		return base(ctx, timeout, name, args...)
 	}
 	// stubReply is one canned response of the fake Keycloak server.
 	type stubReply struct {
 		status int
 		body   string
 	}
 	robotFound := stubReply{status: http.StatusOK, body: `[{"id":"robot-id"}]`}
 	cases := []struct {
 		name          string    // subtest name
 		stage         string    // stage named in the failure message
 		users         stubReply // reply to the robot user lookup
 		impersonation stubReply // reply to the impersonation call
 	}{
 		{
 			name:  "robot-user lookup failure",
 			stage: "robot-user lookup failure",
 			users: stubReply{status: http.StatusBadGateway, body: `{"error":"lookup failed"}`},
 			// Expected to be unused: the lookup error should stop the flow first.
 			impersonation: stubReply{status: http.StatusOK},
 		},
 		{
 			name:          "impersonation failure",
 			stage:         "impersonation failure",
 			users:         robotFound,
 			impersonation: stubReply{status: http.StatusBadGateway, body: `{"error":"impersonation failed"}`},
 		},
 		{
 			name:          "redirect url build failure",
 			stage:         "redirect request-build failure",
 			users:         robotFound,
 			impersonation: stubReply{status: http.StatusOK, body: `{"redirect":"://bad"}`},
 		},
 		{
 			name:          "redirect request failure",
 			stage:         "redirect request failure",
 			users:         robotFound,
 			impersonation: stubReply{status: http.StatusOK, body: `{"redirect":"http://127.0.0.1:1/nowhere"}`},
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			kc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				path := r.URL.Path
 				switch {
 				case strings.Contains(path, "/token"):
 					_, _ = w.Write([]byte(`{"access_token":"admin-token"}`))
 				// Impersonation must be matched before the user lookup: its
 				// path (.../users/<id>/impersonation) also contains "/users",
 				// so the reverse order made this branch unreachable.
 				case strings.Contains(path, "/impersonation"):
 					w.WriteHeader(tc.impersonation.status)
 					_, _ = w.Write([]byte(tc.impersonation.body))
 				case strings.Contains(path, "/users"):
 					w.WriteHeader(tc.users.status)
 					_, _ = w.Write([]byte(tc.users.body))
 				default:
 					w.WriteHeader(http.StatusOK)
 				}
 			}))
 			defer kc.Close()
 			cfg := lifecycleConfig(t)
 			cfg.Startup.ServiceChecklistAuth = authSettings(kc.URL)
 			orch, _ := newHookOrchestrator(t, cfg, secretRun, secretRun)
 			if err := orch.TestHookAuthenticateRobotChecklistSession(context.Background(), client); err == nil {
 				t.Fatalf("expected %s branch", tc.stage)
 			}
 		})
 	}
 }
 // TestHookServiceAuthFallbackRedirect bootstraps a robot session when impersonation returns no redirect.
 // Signature: TestHookServiceAuthFallbackRedirect(t *testing.T).
 // Why: covers empty impersonation redirect fallback to realm account URL so
 // session bootstrap is resilient to Keycloak response shape differences.
 func TestHookServiceAuthFallbackRedirect(t *testing.T) {
 	// okWith builds a handler that always answers 200 with the given body.
 	okWith := func(body string) http.HandlerFunc {
 		return func(w http.ResponseWriter, _ *http.Request) {
 			w.WriteHeader(http.StatusOK)
 			_, _ = w.Write([]byte(body))
 		}
 	}
 	keycloak := http.NewServeMux()
 	keycloak.Handle("/realms/master/protocol/openid-connect/token", okWith(`{"access_token":"admin-token"}`))
 	keycloak.Handle("/admin/realms/atlas/users", okWith(`[{"id":"robot-id"}]`))
 	// Impersonation succeeds but carries an empty redirect target.
 	keycloak.Handle("/admin/realms/atlas/users/robot-id/impersonation", okWith(`{"redirect":""}`))
 	keycloak.Handle("/realms/atlas/account/", okWith("account ok"))
 	tlsServer := httptest.NewTLSServer(keycloak)
 	defer tlsServer.Close()
 	cfg := lifecycleConfig(t)
 	cfg.Startup.ServiceChecklistAuth = authSettings(tlsServer.URL)
 	recorder := &commandRecorder{}
 	dispatch := lifecycleDispatcher(recorder)
 	runner := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
 		isSecretLookup := name == "kubectl" &&
 			strings.Contains(name+" "+strings.Join(args, " "), "-n sso get secret keycloak-admin -o json")
 		if !isSecretLookup {
 			return dispatch(ctx, timeout, name, args...)
 		}
 		return testSecretJSON("admin", "password"), nil
 	}
 	orch, _ := newHookOrchestrator(t, cfg, runner, runner)
 	ctx := context.Background()
 	// A client with a default transport does not trust the test server's
 	// certificate, so the bootstrap is expected to fail.
 	strictClient := &http.Client{Timeout: 4 * time.Second, Transport: &http.Transport{}}
 	if err := orch.TestHookAuthenticateRobotChecklistSession(ctx, strictClient); err == nil {
 		t.Fatal("expected auth bootstrap without TLS skip to fail against TLS test server")
 	}
 	_, err := orch.TestHookChecklistAuthHTTPClient(ctx, 4*time.Second, true)
 	if err != nil {
 		t.Fatalf("expected checklist auth client fallback redirect path success, got %v", err)
 	}
 }