ananke: refactor orchestrator, enforce quality gates, and harden startup checks

This commit is contained in:
Brad Stein 2026-04-09 01:38:06 -03:00
parent baead1426e
commit c2c79e5821
51 changed files with 3677 additions and 176 deletions

View File

@ -1,4 +1,4 @@
.PHONY: build test fmt tidy install drill-list drill-run
.PHONY: build test test-all quality-gate hygiene lint coverage-report coverage-gate fmt tidy install drill-list drill-run
build:
go build -o dist/ananke ./cmd/ananke
@ -6,6 +6,23 @@ build:
test:
go test ./...
test-all: test hygiene lint coverage-report
quality-gate:
./scripts/quality_gate.sh
hygiene:
cd testing && go test ./hygiene
lint:
./scripts/lint.sh
coverage-report:
cd testing && go test ./coverage -run TestPerFileCoverageReport -count=1 -v
coverage-gate:
cd testing && ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v
fmt:
gofmt -w ./cmd ./internal

View File

@ -48,6 +48,9 @@ startup:
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@ -78,6 +81,15 @@ startup:
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist_auth:
mode: keycloak_robotuser
keycloak_base_url: https://sso.bstein.dev
realm: atlas
robot_username: robotuser
admin_secret_namespace: sso
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@ -99,10 +111,20 @@ startup:
accepted_statuses: [401]
body_contains: unauthorized
timeout_seconds: 12
- name: longhorn-auth
url: https://longhorn.bstein.dev/
accepted_statuses: [200, 302]
- name: longhorn-api-user-session
url: https://longhorn.bstein.dev/v1
accepted_statuses: [200]
require_robot_auth: true
follow_redirects: true
final_url_contains: /v1
final_url_not_contains: /oauth2/sign_in
body_contains: '"id":"v1"'
timeout_seconds: 12
require_critical_service_endpoints: true
critical_service_endpoint_wait_seconds: 420
critical_service_endpoint_poll_seconds: 5
critical_service_endpoints:
- monitoring/victoria-metrics-single-server
require_ingress_checklist: true
ingress_checklist_wait_seconds: 420
ingress_checklist_poll_seconds: 5
@ -139,10 +161,6 @@ shutdown:
drain_parallelism: 6
scale_parallelism: 8
ssh_parallelism: 8
poweroff_enabled: false
poweroff_delay_seconds: 25
poweroff_local_host: false
extra_poweroff_hosts: []
ups:
enabled: true
provider: nut
@ -170,6 +188,7 @@ metrics:
path: /metrics
state:
dir: /var/lib/ananke
reports_dir: /var/lib/ananke/reports
run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/ananke/intent.json

View File

@ -114,6 +114,9 @@ startup:
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@ -144,6 +147,15 @@ startup:
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist_auth:
mode: keycloak_robotuser
keycloak_base_url: https://sso.bstein.dev
realm: atlas
robot_username: robotuser
admin_secret_namespace: sso
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@ -165,10 +177,20 @@ startup:
accepted_statuses: [401]
body_contains: unauthorized
timeout_seconds: 12
- name: longhorn-auth
url: https://longhorn.bstein.dev/
accepted_statuses: [200, 302]
- name: longhorn-api-user-session
url: https://longhorn.bstein.dev/v1
accepted_statuses: [200]
require_robot_auth: true
follow_redirects: true
final_url_contains: /v1
final_url_not_contains: /oauth2/sign_in
body_contains: '"id":"v1"'
timeout_seconds: 12
require_critical_service_endpoints: true
critical_service_endpoint_wait_seconds: 420
critical_service_endpoint_poll_seconds: 5
critical_service_endpoints:
- monitoring/victoria-metrics-single-server
require_ingress_checklist: true
ingress_checklist_wait_seconds: 420
ingress_checklist_poll_seconds: 5
@ -205,10 +227,6 @@ shutdown:
drain_parallelism: 6
scale_parallelism: 8
ssh_parallelism: 8
poweroff_enabled: false
poweroff_delay_seconds: 25
poweroff_local_host: false
extra_poweroff_hosts: []
ups:
enabled: true
provider: nut
@ -236,6 +254,7 @@ metrics:
path: /metrics
state:
dir: /var/lib/ananke
reports_dir: /var/lib/ananke/reports
run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/ananke/intent.json

View File

@ -114,6 +114,9 @@ startup:
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
minimum_battery_percent: 20
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@ -144,6 +147,15 @@ startup:
service_checklist_wait_seconds: 420
service_checklist_poll_seconds: 5
service_checklist_stability_seconds: 120
service_checklist_auth:
mode: keycloak_robotuser
keycloak_base_url: https://sso.bstein.dev
realm: atlas
robot_username: robotuser
admin_secret_namespace: sso
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@ -165,10 +177,20 @@ startup:
accepted_statuses: [401]
body_contains: unauthorized
timeout_seconds: 12
- name: longhorn-auth
url: https://longhorn.bstein.dev/
accepted_statuses: [200, 302]
- name: longhorn-api-user-session
url: https://longhorn.bstein.dev/v1
accepted_statuses: [200]
require_robot_auth: true
follow_redirects: true
final_url_contains: /v1
final_url_not_contains: /oauth2/sign_in
body_contains: '"id":"v1"'
timeout_seconds: 12
require_critical_service_endpoints: true
critical_service_endpoint_wait_seconds: 420
critical_service_endpoint_poll_seconds: 5
critical_service_endpoints:
- monitoring/victoria-metrics-single-server
require_ingress_checklist: true
ingress_checklist_wait_seconds: 420
ingress_checklist_poll_seconds: 5
@ -205,10 +227,6 @@ shutdown:
drain_parallelism: 6
scale_parallelism: 8
ssh_parallelism: 8
poweroff_enabled: false
poweroff_delay_seconds: 25
poweroff_local_host: false
extra_poweroff_hosts: []
ups:
enabled: true
provider: nut
@ -236,6 +254,7 @@ metrics:
path: /metrics
state:
dir: /var/lib/ananke
reports_dir: /var/lib/ananke/reports
run_history_path: /var/lib/ananke/runs.json
lock_path: /var/lib/ananke/ananke.lock
intent_path: /var/lib/ananke/intent.json

View File

@ -0,0 +1,286 @@
package cluster
import (
"context"
"crypto/tls"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"net/http/cookiejar"
neturl "net/url"
"strings"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// keycloakTokenResponse is the subset of a Keycloak token endpoint JSON
// response that this package decodes: only the access_token field.
type keycloakTokenResponse struct {
AccessToken string `json:"access_token"`
}
// keycloakUser is the subset of a Keycloak user lookup entry that this package
// decodes: only the id field.
type keycloakUser struct {
ID string `json:"id"`
}
// keycloakImpersonationResponse is the subset of a Keycloak impersonation
// response that this package decodes: only the redirect field.
type keycloakImpersonationResponse struct {
Redirect string `json:"redirect"`
}
// kubernetesSecret is the subset of a Kubernetes secret JSON document that this
// package decodes: the data map, whose values kubernetesSecretValue
// base64-decodes.
type kubernetesSecret struct {
Data map[string]string `json:"data"`
}
// checklistAuthHTTPClient builds an HTTP client with a fresh cookie jar and
// runs the robot-user authentication flow on it before returning it.
// Signature: (o *Orchestrator) checklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error).
// Why: startup checklist checks that require real user behavior need an
// authenticated robotuser browser-like session before probing service pages.
//
// timeout becomes the client's overall request timeout. When insecureSkipTLS is
// true the transport skips TLS certificate verification. An error is returned
// when the cookie jar cannot be created or authentication fails.
func (o *Orchestrator) checklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error) {
	cookies, jarErr := cookiejar.New(nil)
	if jarErr != nil {
		return nil, fmt.Errorf("create cookie jar: %w", jarErr)
	}

	// A nil TLS config leaves the transport on its default verification.
	var tlsConfig *tls.Config
	if insecureSkipTLS {
		tlsConfig = &tls.Config{InsecureSkipVerify: true}
	}

	session := &http.Client{
		Jar:       cookies,
		Timeout:   timeout,
		Transport: &http.Transport{TLSClientConfig: tlsConfig},
	}
	if authErr := o.authenticateRobotChecklistSession(ctx, session); authErr != nil {
		return nil, authErr
	}
	return session, nil
}
// authenticateRobotChecklistSession drives the Keycloak impersonation flow for
// the configured robot user using the supplied client: it resolves the admin
// credentials, obtains an admin token, looks up the robot user id, requests
// impersonation, and finally issues a GET to the resulting redirect URL.
// Signature: (o *Orchestrator) authenticateRobotChecklistSession(ctx context.Context, client *http.Client) error.
// Why: authenticated checklist probes must reflect what a human sees after
// Keycloak login, not only pre-auth redirects.
//
// It returns an error when the auth mode is empty, "none", or anything other
// than "keycloak_robotuser", or when any step of the flow fails. The status of
// the final redirect response is not inspected.
func (o *Orchestrator) authenticateRobotChecklistSession(ctx context.Context, client *http.Client) error {
	settings := o.cfg.Startup.ServiceChecklistAuth

	// Only keycloak_robotuser is implemented; every other value is rejected.
	switch mode := strings.TrimSpace(settings.Mode); mode {
	case "keycloak_robotuser":
		// Supported mode; continue with the impersonation flow below.
	case "", "none":
		return fmt.Errorf("startup checklist auth mode is disabled")
	default:
		return fmt.Errorf("unsupported startup checklist auth mode %q", mode)
	}

	user, password, err := o.keycloakAdminCredentials(ctx, settings)
	if err != nil {
		return err
	}
	token, err := o.keycloakAdminToken(ctx, client, settings, user, password)
	if err != nil {
		return err
	}
	userID, err := o.keycloakRobotUserID(ctx, client, settings, token)
	if err != nil {
		return err
	}
	target, err := o.keycloakImpersonationRedirect(ctx, client, settings, token, userID)
	if err != nil {
		return err
	}
	// Fall back to the realm account page when Keycloak returned no redirect.
	if strings.TrimSpace(target) == "" {
		target = keycloakBaseURL(settings) + "/realms/" + strings.TrimSpace(settings.Realm) + "/account/"
	}

	followReq, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		return fmt.Errorf("build robot redirect request: %w", err)
	}
	followReq.Header.Set("User-Agent", "ananke/startup-checklist")

	followResp, err := client.Do(followReq)
	if err != nil {
		return fmt.Errorf("initialize robot session redirect: %w", err)
	}
	defer followResp.Body.Close()

	// Best-effort drain of a small prefix of the body; the content is unused.
	_, _ = io.Copy(io.Discard, io.LimitReader(followResp.Body, 1024))
	return nil
}
// keycloakAdminCredentials reads the Keycloak admin username and password from
// the Kubernetes secret named in the auth settings.
// Signature: (o *Orchestrator) keycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error).
// Why: robotuser impersonation uses a cluster-managed admin secret so startup
// checks do not rely on interactive credentials.
//
// Namespace, secret name, and both key names are whitespace-trimmed before use.
// The username is read first; either lookup failing returns empty strings and a
// wrapped error naming the secret.
func (o *Orchestrator) keycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error) {
	ns := strings.TrimSpace(auth.AdminSecretNamespace)
	secret := strings.TrimSpace(auth.AdminSecretName)
	usernameKey := strings.TrimSpace(auth.AdminSecretUsernameKey)
	passKey := strings.TrimSpace(auth.AdminSecretPasswordKey)

	user, userErr := o.kubernetesSecretValue(ctx, ns, secret, usernameKey)
	if userErr != nil {
		return "", "", fmt.Errorf("read keycloak admin username from secret %s/%s: %w", ns, secret, userErr)
	}

	pass, passErr := o.kubernetesSecretValue(ctx, ns, secret, passKey)
	if passErr != nil {
		return "", "", fmt.Errorf("read keycloak admin password from secret %s/%s: %w", ns, secret, passErr)
	}

	return user, pass, nil
}
// kubernetesSecretValue fetches one secret as JSON through o.kubectl and
// returns the base64-decoded, whitespace-trimmed value stored under key.
// Signature: (o *Orchestrator) kubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error).
// Why: checklist auth depends on secret-backed credentials and should decode
// them directly from Kubernetes rather than shelling out to external tools.
//
// It returns an error when the kubectl call fails, the output is not valid
// JSON, the key is absent, the value is not valid base64, or the decoded value
// is empty after trimming.
func (o *Orchestrator) kubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error) {
	raw, getErr := o.kubectl(ctx, 25*time.Second, "-n", namespace, "get", "secret", name, "-o", "json")
	if getErr != nil {
		return "", fmt.Errorf("kubectl get secret: %w", getErr)
	}

	var secret kubernetesSecret
	if jsonErr := json.Unmarshal([]byte(raw), &secret); jsonErr != nil {
		return "", fmt.Errorf("decode secret json: %w", jsonErr)
	}

	b64, found := secret.Data[key]
	if !found {
		return "", fmt.Errorf("key %q not present in secret", key)
	}

	plain, b64Err := base64.StdEncoding.DecodeString(strings.TrimSpace(b64))
	if b64Err != nil {
		return "", fmt.Errorf("decode base64 secret value: %w", b64Err)
	}

	if trimmed := strings.TrimSpace(string(plain)); trimmed != "" {
		return trimmed, nil
	}
	return "", fmt.Errorf("decoded value is empty")
}
// keycloakAdminToken requests an access token from the Keycloak master realm
// token endpoint using the password grant with the admin-cli client.
// Signature: (o *Orchestrator) keycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error).
// Why: admin API access is needed to impersonate robotuser for deterministic
// user-journey checks across OIDC-gated services.
//
// At most 64 KiB of the response body is read. It returns an error when the
// request cannot be built or sent, the status is not 2xx, the body cannot be
// read or decoded, or the decoded access_token is empty after trimming.
func (o *Orchestrator) keycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error) {
	form := neturl.Values{}
	form.Set("grant_type", "password")
	form.Set("client_id", "admin-cli")
	form.Set("username", adminUser)
	form.Set("password", adminPassword)

	tokenURL := keycloakBaseURL(auth) + "/realms/master/protocol/openid-connect/token"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, tokenURL, strings.NewReader(form.Encode()))
	if err != nil {
		return "", fmt.Errorf("build admin token request: %w", err)
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	req.Header.Set("User-Agent", "ananke/startup-checklist")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("request admin token: %w", err)
	}
	defer resp.Body.Close()

	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
	// Report a bad status first: it is the more useful diagnostic, and whatever
	// part of the body was read still helps explain it.
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("admin token request failed status=%d body=%q", resp.StatusCode, compactHTTPBody(body))
	}
	// A failed read would otherwise surface as a misleading JSON decode error.
	if readErr != nil {
		return "", fmt.Errorf("read admin token response: %w", readErr)
	}

	var payload keycloakTokenResponse
	if err := json.Unmarshal(body, &payload); err != nil {
		return "", fmt.Errorf("decode admin token response: %w", err)
	}
	token := strings.TrimSpace(payload.AccessToken)
	if token == "" {
		return "", fmt.Errorf("admin token response missing access_token")
	}
	return token, nil
}
// keycloakRobotUserID looks up the configured robot username through the
// Keycloak admin users endpoint (exact match) and returns the id of the first
// entry in the response.
// Signature: (o *Orchestrator) keycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error).
// Why: impersonation requires the concrete user id and should fail fast when
// robotuser is missing from the realm.
//
// At most 64 KiB of the response body is read. It returns an error when the
// request cannot be built or sent, the status is not 2xx, the body cannot be
// read or decoded, or no user with a non-empty id is returned.
func (o *Orchestrator) keycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error) {
	base := keycloakBaseURL(auth)
	realm := strings.TrimSpace(auth.Realm)
	username := strings.TrimSpace(auth.RobotUsername)

	query := neturl.Values{}
	query.Set("username", username)
	query.Set("exact", "true")
	// Escape the realm so it always remains a single, valid path segment.
	usersURL := base + "/admin/realms/" + neturl.PathEscape(realm) + "/users?" + query.Encode()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, usersURL, nil)
	if err != nil {
		return "", fmt.Errorf("build robot user lookup request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+adminToken)
	req.Header.Set("User-Agent", "ananke/startup-checklist")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("lookup robot user: %w", err)
	}
	defer resp.Body.Close()

	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
	// Report a bad status first; any partial body still helps explain it.
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("robot user lookup failed status=%d body=%q", resp.StatusCode, compactHTTPBody(body))
	}
	// A failed read would otherwise surface as a misleading JSON decode error.
	if readErr != nil {
		return "", fmt.Errorf("read robot user lookup response: %w", readErr)
	}

	var users []keycloakUser
	if err := json.Unmarshal(body, &users); err != nil {
		return "", fmt.Errorf("decode robot user lookup response: %w", err)
	}
	if len(users) == 0 {
		return "", fmt.Errorf("robot user %q not found in realm %q", username, realm)
	}
	id := strings.TrimSpace(users[0].ID)
	if id == "" {
		return "", fmt.Errorf("robot user %q not found in realm %q", username, realm)
	}
	return id, nil
}
// keycloakImpersonationRedirect POSTs to the Keycloak admin impersonation
// endpoint for the given user id and returns the trimmed redirect value from
// the JSON response (which may be empty).
// Signature: (o *Orchestrator) keycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error).
// Why: opening a real impersonated browser session guarantees checks evaluate
// post-login app behavior instead of only auth-gateway redirects.
//
// At most 64 KiB of the response body is read. It returns an error when the
// request cannot be built or sent, the status is not 2xx, or the body cannot be
// read or decoded.
func (o *Orchestrator) keycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error) {
	base := keycloakBaseURL(auth)
	// Escape both dynamic values so each stays a single, valid path segment.
	realmSegment := neturl.PathEscape(strings.TrimSpace(auth.Realm))
	userSegment := neturl.PathEscape(strings.TrimSpace(robotUserID))
	impersonateURL := base + "/admin/realms/" + realmSegment + "/users/" + userSegment + "/impersonation"

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, impersonateURL, http.NoBody)
	if err != nil {
		return "", fmt.Errorf("build robot impersonation request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+adminToken)
	req.Header.Set("User-Agent", "ananke/startup-checklist")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("request robot impersonation: %w", err)
	}
	defer resp.Body.Close()

	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
	// Report a bad status first; any partial body still helps explain it.
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("robot impersonation failed status=%d body=%q", resp.StatusCode, compactHTTPBody(body))
	}
	// A failed read would otherwise surface as a misleading JSON decode error.
	if readErr != nil {
		return "", fmt.Errorf("read robot impersonation response: %w", readErr)
	}

	var payload keycloakImpersonationResponse
	if err := json.Unmarshal(body, &payload); err != nil {
		return "", fmt.Errorf("decode robot impersonation response: %w", err)
	}
	return strings.TrimSpace(payload.Redirect), nil
}
// keycloakBaseURL returns the configured Keycloak base URL with surrounding
// whitespace and any trailing slashes removed.
// Signature: keycloakBaseURL(auth config.ServiceChecklistAuthSettings) string.
// Why: centralizing URL normalization keeps auth request construction stable.
func keycloakBaseURL(auth config.ServiceChecklistAuthSettings) string {
	trimmed := strings.TrimSpace(auth.KeycloakBaseURL)
	return strings.TrimRight(trimmed, "/")
}
// compactHTTPBody collapses every run of whitespace in raw into a single space
// and drops leading and trailing whitespace; blank input yields "".
// Signature: compactHTTPBody(raw []byte) string.
// Why: checklist auth errors should include a readable body summary without
// leaking multi-line payload noise into orchestrator logs.
func compactHTTPBody(raw []byte) string {
	words := strings.Fields(string(raw))
	if len(words) == 0 {
		return ""
	}
	return strings.Join(words, " ")
}

View File

@ -184,6 +184,16 @@ func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.Servi
return false, fmt.Sprintf("location header contained forbidden marker %q", locationNotContains)
}
finalURLContains := strings.TrimSpace(check.FinalURLContains)
if finalURLContains != "" && !checklistContains(result.FinalURL, finalURLContains) {
return false, fmt.Sprintf("final url missing expected marker %q", finalURLContains)
}
finalURLNotContains := strings.TrimSpace(check.FinalURLNotContains)
if finalURLNotContains != "" && checklistContains(result.FinalURL, finalURLNotContains) {
return false, fmt.Sprintf("final url contained forbidden marker %q", finalURLNotContains)
}
bodyContains := strings.TrimSpace(check.BodyContains)
if bodyContains != "" && !checklistContains(result.Body, bodyContains) {
return false, fmt.Sprintf("response missing expected marker %q", bodyContains)
@ -201,6 +211,7 @@ type checklistHTTPProbeResult struct {
Status int
Body string
Location string
FinalURL string
}
// httpChecklistProbeResult runs one orchestration or CLI step.
@ -209,13 +220,14 @@ type checklistHTTPProbeResult struct {
// addition to status/body so startup can validate real user-facing behavior.
func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) {
result := checklistHTTPProbeResult{}
status, body, location, err := o.httpChecklistProbeWithLocation(ctx, check)
status, body, location, finalURL, err := o.httpChecklistProbeWithLocation(ctx, check)
if err != nil {
return result, err
}
result.Status = status
result.Body = body
result.Location = location
result.FinalURL = finalURL
return result, nil
}
@ -223,50 +235,66 @@ func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check confi
// Signature: (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) {
status, body, _, err := o.httpChecklistProbeWithLocation(ctx, check)
status, body, _, _, err := o.httpChecklistProbeWithLocation(ctx, check)
return status, body, err
}
// httpChecklistProbeWithLocation runs one orchestration or CLI step.
// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error).
// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error).
// Why: redirects and auth gates require location-header assertions to prevent
// startup false-positives on partially healthy protected services.
func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) {
func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error) {
timeout := time.Duration(check.TimeoutSeconds) * time.Second
if timeout <= 0 {
timeout = 12 * time.Second
}
followRedirects := check.FollowRedirects || check.RequireRobotAuth
var client *http.Client
if check.RequireRobotAuth {
authClient, authErr := o.checklistAuthHTTPClient(ctx, timeout, check.InsecureSkipTLS)
if authErr != nil {
return 0, "", "", "", fmt.Errorf("initialize robotuser checklist session: %w", authErr)
}
client = authClient
} else {
transport := &http.Transport{}
if check.InsecureSkipTLS {
transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
}
client := &http.Client{
client = &http.Client{
Timeout: timeout,
Transport: transport,
CheckRedirect: func(_ *http.Request, _ []*http.Request) error {
}
}
if !followRedirects {
client.CheckRedirect = func(_ *http.Request, _ []*http.Request) error {
return http.ErrUseLastResponse
},
}
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil)
if err != nil {
return 0, "", "", fmt.Errorf("build request: %w", err)
return 0, "", "", "", fmt.Errorf("build request: %w", err)
}
req.Header.Set("User-Agent", "ananke/startup-checklist")
resp, err := client.Do(req)
if err != nil {
return 0, "", "", fmt.Errorf("request failed: %w", err)
return 0, "", "", "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
if readErr != nil {
return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr)
return resp.StatusCode, "", "", "", fmt.Errorf("read response body: %w", readErr)
}
return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil
finalURL := strings.TrimSpace(req.URL.String())
if resp.Request != nil && resp.Request.URL != nil {
finalURL = strings.TrimSpace(resp.Request.URL.String())
}
return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), finalURL, nil
}
// checklistContains runs one orchestration or CLI step.

View File

@ -329,6 +329,80 @@ func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
}
}
// TestServiceCheckReadyRequiresFinalURLContains runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRequiresFinalURLContains(t *testing.T).
// Why: authenticated user-journey checks depend on final URL assertions after
// redirects complete, not only on initial response status.
func TestServiceCheckReadyRequiresFinalURLContains(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/" {
http.Redirect(w, r, "/app/home", http.StatusFound)
return
}
if r.URL.Path == "/app/home" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("OpenSearch Dashboards"))
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
orch := &Orchestrator{
log: log.New(os.Stdout, "", 0),
}
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "logging-ui-user-session",
URL: srv.URL,
AcceptedStatuses: []int{200},
FollowRedirects: true,
FinalURLContains: "/app/home",
BodyContains: "OpenSearch Dashboards",
TimeoutSeconds: 5,
})
if !ok {
t.Fatalf("expected final-url-aware service check to pass, detail=%s", detail)
}
}
// TestServiceCheckReadyRejectsForbiddenFinalURLMarker runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRejectsForbiddenFinalURLMarker(t *testing.T).
// Why: user-session checks should fail when final URL indicates auth/login loop
// instead of the expected post-login app route.
func TestServiceCheckReadyRejectsForbiddenFinalURLMarker(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/" {
http.Redirect(w, r, "/oauth2/sign_in", http.StatusFound)
return
}
if r.URL.Path == "/oauth2/sign_in" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("sign in"))
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
orch := &Orchestrator{
log: log.New(os.Stdout, "", 0),
}
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "logging-ui-user-session",
URL: srv.URL,
AcceptedStatuses: []int{200},
FollowRedirects: true,
FinalURLNotContains: "/oauth2/sign_in",
TimeoutSeconds: 5,
})
if ok {
t.Fatalf("expected forbidden final-url marker check to fail")
}
if !strings.Contains(detail, "final url contained forbidden marker") {
t.Fatalf("expected final-url forbidden marker detail, got %q", detail)
}
}
// TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step.
// Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -385,59 +459,3 @@ func TestChecklistFailureHostUnknown(t *testing.T) {
t.Fatalf("expected empty host for unknown check, got %q", got)
}
}
// TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
// Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
var pod podResource
pod.Status.Phase = "Pending"
pod.Metadata.Annotations = map[string]string{
"vault.hashicorp.com/agent-inject": "true",
}
pod.Status.InitContainerStatuses = []podContainerStatus{
{
Name: "vault-agent-init",
State: podContainerState{
Running: &podContainerRunningState{
StartedAt: time.Now().Add(-10 * time.Minute),
},
},
},
}
reason := stuckVaultInitReason(pod, 3*time.Minute)
if reason != "VaultInitStuck" {
t.Fatalf("expected VaultInitStuck reason, got %q", reason)
}
}
// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
// Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
var pod podResource
pod.Status.Phase = "Pending"
pod.Metadata.Annotations = map[string]string{
"vault.hashicorp.com/agent-inject": "true",
}
pod.Status.InitContainerStatuses = []podContainerStatus{
{
Name: "vault-agent-init",
State: podContainerState{
Running: &podContainerRunningState{
StartedAt: time.Now().Add(-30 * time.Second),
},
},
},
}
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
t.Fatalf("expected no reason for fresh init, got %q", reason)
}
pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
t.Fatalf("expected no reason for non-vault pod, got %q", reason)
}
}

View File

@ -0,0 +1,62 @@
package cluster
import (
"testing"
"time"
)
// TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
// Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
var pod podResource
pod.Status.Phase = "Pending"
pod.Metadata.Annotations = map[string]string{
"vault.hashicorp.com/agent-inject": "true",
}
pod.Status.InitContainerStatuses = []podContainerStatus{
{
Name: "vault-agent-init",
State: podContainerState{
Running: &podContainerRunningState{
StartedAt: time.Now().Add(-10 * time.Minute),
},
},
},
}
reason := stuckVaultInitReason(pod, 3*time.Minute)
if reason != "VaultInitStuck" {
t.Fatalf("expected VaultInitStuck reason, got %q", reason)
}
}
// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
// Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
var pod podResource
pod.Status.Phase = "Pending"
pod.Metadata.Annotations = map[string]string{
"vault.hashicorp.com/agent-inject": "true",
}
pod.Status.InitContainerStatuses = []podContainerStatus{
{
Name: "vault-agent-init",
State: podContainerState{
Running: &podContainerRunningState{
StartedAt: time.Now().Add(-30 * time.Second),
},
},
},
}
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
t.Fatalf("expected no reason for fresh init, got %q", reason)
}
pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
t.Fatalf("expected no reason for non-vault pod, got %q", reason)
}
}

View File

@ -0,0 +1,79 @@
package cluster
import (
"context"
"net/http"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestHookChecklistAuthHTTPClient forwards its arguments to the unexported
// checklistAuthHTTPClient method and returns that method's results unchanged.
// Signature: (o *Orchestrator) TestHookChecklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error).
// Why: exposes checklist auth client/session bootstrap internals to top-level tests.
func (o *Orchestrator) TestHookChecklistAuthHTTPClient(ctx context.Context, timeout time.Duration, insecureSkipTLS bool) (*http.Client, error) {
return o.checklistAuthHTTPClient(ctx, timeout, insecureSkipTLS)
}
// TestHookAuthenticateRobotChecklistSession forwards its arguments to the
// unexported authenticateRobotChecklistSession method and returns its error
// unchanged.
// Signature: (o *Orchestrator) TestHookAuthenticateRobotChecklistSession(ctx context.Context, client *http.Client) error.
// Why: exposes robotuser auth session internals to top-level tests.
func (o *Orchestrator) TestHookAuthenticateRobotChecklistSession(ctx context.Context, client *http.Client) error {
return o.authenticateRobotChecklistSession(ctx, client)
}
// TestHookKubernetesSecretValue forwards its arguments to the unexported
// kubernetesSecretValue method and returns that method's results unchanged.
// Signature: (o *Orchestrator) TestHookKubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error).
// Why: exposes Kubernetes secret decode internals to top-level tests.
func (o *Orchestrator) TestHookKubernetesSecretValue(ctx context.Context, namespace string, name string, key string) (string, error) {
return o.kubernetesSecretValue(ctx, namespace, name, key)
}
// TestHookKeycloakAdminCredentials forwards its arguments to the unexported
// keycloakAdminCredentials method and returns that method's results unchanged.
// Signature: (o *Orchestrator) TestHookKeycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error).
// Why: exposes secret-backed credential resolution internals to top-level tests.
func (o *Orchestrator) TestHookKeycloakAdminCredentials(ctx context.Context, auth config.ServiceChecklistAuthSettings) (string, string, error) {
return o.keycloakAdminCredentials(ctx, auth)
}
// TestHookKeycloakAdminToken is a test seam: it forwards all of its arguments unchanged
// to the unexported o.keycloakAdminToken and returns its results.
// Signature: (o *Orchestrator) TestHookKeycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error).
// Why: exposes Keycloak admin token acquisition internals to top-level tests.
func (o *Orchestrator) TestHookKeycloakAdminToken(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminUser string, adminPassword string) (string, error) {
return o.keycloakAdminToken(ctx, client, auth, adminUser, adminPassword)
}
// TestHookKeycloakRobotUserID is a test seam: it forwards all of its arguments unchanged
// to the unexported o.keycloakRobotUserID and returns its results.
// Signature: (o *Orchestrator) TestHookKeycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error).
// Why: exposes Keycloak robot-user lookup internals to top-level tests.
func (o *Orchestrator) TestHookKeycloakRobotUserID(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string) (string, error) {
return o.keycloakRobotUserID(ctx, client, auth, adminToken)
}
// TestHookKeycloakImpersonationRedirect is a test seam: it forwards all of its arguments
// unchanged to the unexported o.keycloakImpersonationRedirect and returns its results.
// Signature: (o *Orchestrator) TestHookKeycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error).
// Why: exposes Keycloak impersonation internals to top-level tests.
func (o *Orchestrator) TestHookKeycloakImpersonationRedirect(ctx context.Context, client *http.Client, auth config.ServiceChecklistAuthSettings, adminToken string, robotUserID string) (string, error) {
return o.keycloakImpersonationRedirect(ctx, client, auth, adminToken, robotUserID)
}
// TestHookHTTPChecklistProbeWithLocation is a test seam: it forwards ctx and check unchanged
// to the unexported o.httpChecklistProbeWithLocation and returns its five results as-is.
// Signature: (o *Orchestrator) TestHookHTTPChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error).
// Why: exposes redirect-aware checklist probe internals to top-level tests.
func (o *Orchestrator) TestHookHTTPChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, string, error) {
return o.httpChecklistProbeWithLocation(ctx, check)
}
// TestHookKeycloakBaseURL is a test seam: it forwards auth unchanged to the unexported
// package-level keycloakBaseURL and returns its string result.
// Signature: TestHookKeycloakBaseURL(auth config.ServiceChecklistAuthSettings) string.
// Why: exposes base URL normalizer helper to top-level tests.
func TestHookKeycloakBaseURL(auth config.ServiceChecklistAuthSettings) string {
return keycloakBaseURL(auth)
}
// TestHookCompactHTTPBody is a test seam: it forwards raw unchanged to the unexported
// package-level compactHTTPBody and returns its string result.
// Signature: TestHookCompactHTTPBody(raw []byte) string.
// Why: exposes compact HTTP body helper to top-level tests.
func TestHookCompactHTTPBody(raw []byte) string {
return compactHTTPBody(raw)
}

View File

@ -97,6 +97,30 @@ func (c *Config) applyDefaults() {
if c.Startup.ServiceChecklistStabilitySec < 0 {
c.Startup.ServiceChecklistStabilitySec = 0
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.Mode) == "" {
c.Startup.ServiceChecklistAuth.Mode = "keycloak_robotuser"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.KeycloakBaseURL) == "" {
c.Startup.ServiceChecklistAuth.KeycloakBaseURL = "https://sso.bstein.dev"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.Realm) == "" {
c.Startup.ServiceChecklistAuth.Realm = "atlas"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.RobotUsername) == "" {
c.Startup.ServiceChecklistAuth.RobotUsername = "robotuser"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretNamespace) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretNamespace = "sso"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretName) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretName = "keycloak-admin"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretUsernameKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretUsernameKey = "username"
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
}
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {

View File

@ -207,6 +207,58 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
}
}
// TestValidateRejectsUnknownServiceChecklistAuthMode checks that Validate returns an
// error when the default config's service checklist auth mode is set to "bad-mode".
// Signature: TestValidateRejectsUnknownServiceChecklistAuthMode(t *testing.T).
// Why: authenticated user-journey checklist gates should fail fast when auth
// mode is invalid to avoid silent false-positive startup passes.
func TestValidateRejectsUnknownServiceChecklistAuthMode(t *testing.T) {
	candidate := defaults()
	candidate.Startup.ServiceChecklistAuth.Mode = "bad-mode"

	validationErr := candidate.Validate()
	if validationErr != nil {
		return
	}
	t.Fatalf("expected validation error for invalid service checklist auth mode")
}
// TestValidateRejectsFinalURLMarkersWithoutRedirectFollow runs one orchestration or CLI step.
// Signature: TestValidateRejectsFinalURLMarkersWithoutRedirectFollow(t *testing.T).
// Why: final-url assertions only make sense when redirect following is enabled.
func TestValidateRejectsFinalURLMarkersWithoutRedirectFollow(t *testing.T) {
cfg := defaults()
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
{
Name: "bad-final-url",
URL: "https://logs.bstein.dev/",
AcceptedStatuses: []int{200},
FinalURLContains: "/app/home",
TimeoutSeconds: 12,
},
}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected validation error for final_url_* markers without redirect follow")
}
}
// TestValidateRejectsRobotAuthCheckWhenAuthModeDisabled runs one orchestration or CLI step.
// Signature: TestValidateRejectsRobotAuthCheckWhenAuthModeDisabled(t *testing.T).
// Why: robot-auth checks must be blocked when checklist auth mode is disabled.
func TestValidateRejectsRobotAuthCheckWhenAuthModeDisabled(t *testing.T) {
cfg := defaults()
cfg.Startup.ServiceChecklistAuth.Mode = "none"
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
{
Name: "logs-ui",
URL: "https://logs.bstein.dev/",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
TimeoutSeconds: 12,
},
}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected validation error for robot-auth checklist check when auth mode is none")
}
}
// TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step.
// Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
@ -291,8 +343,8 @@ func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) {
if _, ok := names["custom-smoke"]; !ok {
t.Fatalf("expected custom checklist entry to be preserved")
}
if _, ok := names["logging-oidc-redirect"]; !ok {
t.Fatalf("expected default logging redirect check to be merged in")
if _, ok := names["logging-ui-user-session"]; !ok {
t.Fatalf("expected default logging user-session check to be merged in")
}
if _, ok := names["vaultwarden-ui"]; !ok {
t.Fatalf("expected default vaultwarden check to be merged in")

View File

@ -81,6 +81,16 @@ func defaults() Config {
ServiceChecklistWaitSeconds: 420,
ServiceChecklistPollSeconds: 5,
ServiceChecklistStabilitySec: 120,
ServiceChecklistAuth: ServiceChecklistAuthSettings{
Mode: "keycloak_robotuser",
KeycloakBaseURL: "https://sso.bstein.dev",
Realm: "atlas",
RobotUsername: "robotuser",
AdminSecretNamespace: "sso",
AdminSecretName: "keycloak-admin",
AdminSecretUsernameKey: "username",
AdminSecretPasswordKey: "password",
},
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,

View File

@ -44,10 +44,12 @@ func defaultServiceChecklist() []ServiceChecklistCheck {
TimeoutSeconds: 12,
},
{
Name: "auth-gateway-redirect",
Name: "auth-gateway-user-session",
URL: "https://auth.bstein.dev/",
AcceptedStatuses: []int{302},
LocationContains: "https://sso.bstein.dev/realms/atlas/",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
BodyContains: "Authenticated",
TimeoutSeconds: 12,
},
{
@ -121,17 +123,32 @@ func defaultServiceChecklist() []ServiceChecklistCheck {
TimeoutSeconds: 12,
},
{
Name: "logging-oidc-redirect",
Name: "logging-ui-user-session",
URL: "https://logs.bstein.dev/",
AcceptedStatuses: []int{302},
LocationContains: "client_id=logs",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
FinalURLNotContains: "/protocol/openid-connect/auth",
BodyContains: "OpenSearch Dashboards",
TimeoutSeconds: 12,
},
{
Name: "longhorn-oidc-redirect",
URL: "https://longhorn.bstein.dev/",
AcceptedStatuses: []int{302},
LocationContains: "https://sso.bstein.dev/realms/atlas/",
Name: "logging-api-user-session",
URL: "https://logs.bstein.dev/api/status",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
BodyContains: "\"state\":\"green\"",
TimeoutSeconds: 12,
},
{
Name: "longhorn-api-user-session",
URL: "https://longhorn.bstein.dev/v1",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
FinalURLNotContains: "/protocol/openid-connect/auth",
BodyContains: "\"id\":\"v1\"",
TimeoutSeconds: 12,
},
{
@ -190,17 +207,24 @@ func defaultServiceChecklist() []ServiceChecklistCheck {
TimeoutSeconds: 12,
},
{
Name: "sentinel-oidc-redirect",
URL: "https://sentinel.bstein.dev/",
AcceptedStatuses: []int{302},
LocationContains: "client_id=metis",
Name: "sentinel-user-session",
URL: "https://sentinel.bstein.dev/healthz",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
FinalURLNotContains: "/protocol/openid-connect/auth",
BodyContains: "ok",
TimeoutSeconds: 12,
},
{
Name: "keycloak-admin-redirect",
URL: "https://sso.bstein.dev/",
AcceptedStatuses: []int{302},
LocationContains: "https://sso.bstein.dev/admin/",
Name: "keycloak-admin-user-session",
URL: "https://sso.bstein.dev/admin/",
AcceptedStatuses: []int{200},
RequireRobotAuth: true,
FollowRedirects: true,
FinalURLContains: "/admin/master/console/",
FinalURLNotContains: "/login-actions/authenticate",
BodyContains: "Keycloak Administration Console",
TimeoutSeconds: 12,
},
{
@ -253,23 +277,23 @@ func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) [
return out
}
byName := map[string]struct{}{}
for _, check := range existing {
name := strings.TrimSpace(check.Name)
if name == "" {
continue
}
byName[name] = struct{}{}
}
out := make([]ServiceChecklistCheck, 0, len(existing)+len(defaults))
out = append(out, existing...)
defaultByName := map[string]struct{}{}
for _, check := range defaults {
name := strings.TrimSpace(check.Name)
if name == "" {
continue
}
if _, exists := byName[name]; exists {
defaultByName[name] = struct{}{}
}
out := make([]ServiceChecklistCheck, 0, len(defaults)+len(existing))
out = append(out, defaults...)
for _, check := range existing {
name := strings.TrimSpace(check.Name)
if name == "" {
continue
}
if _, exists := defaultByName[name]; exists {
continue
}
out = append(out, check)

View File

@ -0,0 +1,33 @@
package config
// TestHookDefaultServiceChecklist returns a newly allocated slice holding the entries
// produced by a single call to defaultServiceChecklist.
// Signature: TestHookDefaultServiceChecklist() []ServiceChecklistCheck.
// Why: exposes default service checklist catalog to top-level tests.
func TestHookDefaultServiceChecklist() []ServiceChecklistCheck {
	// Build the catalog once and reuse it for both sizing and copying, rather
	// than constructing the whole list a second time just to read its length.
	catalog := defaultServiceChecklist()
	out := make([]ServiceChecklistCheck, 0, len(catalog))
	out = append(out, catalog...)
	return out
}
// TestHookDefaultCriticalServiceEndpoints returns a newly allocated slice holding the
// entries produced by a single call to defaultCriticalServiceEndpoints.
// Signature: TestHookDefaultCriticalServiceEndpoints() []string.
// Why: exposes default critical endpoint catalog to top-level tests.
func TestHookDefaultCriticalServiceEndpoints() []string {
	// Build the endpoint list once and reuse it for both sizing and copying,
	// rather than constructing it a second time just to read its length.
	endpoints := defaultCriticalServiceEndpoints()
	out := make([]string, 0, len(endpoints))
	out = append(out, endpoints...)
	return out
}
// TestHookMergeServiceChecklistDefaults is a test seam: it forwards existing and defaults
// unchanged to the unexported mergeServiceChecklistDefaults and returns its result.
// Signature: TestHookMergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck.
// Why: exposes checklist merge helper to top-level tests.
func TestHookMergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck {
return mergeServiceChecklistDefaults(existing, defaults)
}
// TestHookMergeStringDefaults is a test seam: it forwards existing and defaults unchanged
// to the unexported mergeStringDefaults and returns its result.
// Signature: TestHookMergeStringDefaults(existing, defaults []string) []string.
// Why: exposes string merge helper to top-level tests.
func TestHookMergeStringDefaults(existing, defaults []string) []string {
return mergeStringDefaults(existing, defaults)
}

View File

@ -56,6 +56,7 @@ type Startup struct {
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
@ -91,14 +92,29 @@ type ServiceChecklistCheck struct {
Name string `yaml:"name"`
URL string `yaml:"url"`
AcceptedStatuses []int `yaml:"accepted_statuses"`
RequireRobotAuth bool `yaml:"require_robot_auth"`
FollowRedirects bool `yaml:"follow_redirects"`
LocationContains string `yaml:"location_contains"`
LocationNotContains string `yaml:"location_not_contains"`
FinalURLContains string `yaml:"final_url_contains"`
FinalURLNotContains string `yaml:"final_url_not_contains"`
BodyContains string `yaml:"body_contains"`
BodyNotContains string `yaml:"body_not_contains"`
TimeoutSeconds int `yaml:"timeout_seconds"`
InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
}
// ServiceChecklistAuthSettings is the startup.service_checklist_auth section of the config.
// Validate accepts Mode "none" or "keycloak_robotuser"; in keycloak_robotuser mode every
// other field must be non-blank and KeycloakBaseURL must parse with a scheme and a host.
// applyDefaults fills any blank field with its built-in default.
type ServiceChecklistAuthSettings struct {
// Mode selects the checklist auth strategy: "none" or "keycloak_robotuser".
Mode string `yaml:"mode"`
// KeycloakBaseURL is the Keycloak base URL (default "https://sso.bstein.dev").
KeycloakBaseURL string `yaml:"keycloak_base_url"`
// Realm defaults to "atlas"; presumably the Keycloak realm used for the robot session (TODO confirm).
Realm string `yaml:"realm"`
// RobotUsername defaults to "robotuser"; presumably the account checks authenticate as (TODO confirm).
RobotUsername string `yaml:"robot_username"`
// AdminSecretNamespace defaults to "sso"; presumably the namespace of the admin credentials secret (TODO confirm).
AdminSecretNamespace string `yaml:"admin_secret_namespace"`
// AdminSecretName defaults to "keycloak-admin"; presumably the name of that secret (TODO confirm).
AdminSecretName string `yaml:"admin_secret_name"`
// AdminSecretUsernameKey defaults to "username"; presumably the secret key holding the admin user (TODO confirm).
AdminSecretUsernameKey string `yaml:"admin_secret_username_key"`
// AdminSecretPasswordKey defaults to "password"; presumably the secret key holding the admin password (TODO confirm).
AdminSecretPasswordKey string `yaml:"admin_secret_password_key"`
}
type Shutdown struct {
DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
HistoryMinSamples int `yaml:"history_min_samples"`

View File

@ -136,6 +136,35 @@ func (c Config) Validate() error {
if c.Startup.RequireServiceChecklist && len(c.Startup.ServiceChecklist) == 0 {
return fmt.Errorf("config.startup.service_checklist must not be empty when require_service_checklist is true")
}
authMode := strings.TrimSpace(c.Startup.ServiceChecklistAuth.Mode)
if authMode != "none" && authMode != "keycloak_robotuser" {
return fmt.Errorf("config.startup.service_checklist_auth.mode must be none or keycloak_robotuser")
}
if authMode == "keycloak_robotuser" {
baseURL := strings.TrimSpace(c.Startup.ServiceChecklistAuth.KeycloakBaseURL)
parsed, err := neturl.Parse(baseURL)
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
return fmt.Errorf("config.startup.service_checklist_auth.keycloak_base_url is invalid: %q", baseURL)
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.Realm) == "" {
return fmt.Errorf("config.startup.service_checklist_auth.realm must not be empty")
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.RobotUsername) == "" {
return fmt.Errorf("config.startup.service_checklist_auth.robot_username must not be empty")
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretNamespace) == "" {
return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_namespace must not be empty")
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretName) == "" {
return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_name must not be empty")
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretUsernameKey) == "" {
return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_username_key must not be empty")
}
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
return fmt.Errorf("config.startup.service_checklist_auth.admin_secret_password_key must not be empty")
}
}
for i, check := range c.Startup.ServiceChecklist {
if strings.TrimSpace(check.Name) == "" {
return fmt.Errorf("config.startup.service_checklist[%d].name must not be empty", i)
@ -151,6 +180,13 @@ func (c Config) Validate() error {
if check.TimeoutSeconds <= 0 {
return fmt.Errorf("config.startup.service_checklist[%d].timeout_seconds must be > 0", i)
}
if check.RequireRobotAuth && authMode == "none" {
return fmt.Errorf("config.startup.service_checklist[%d] requires robot auth but service_checklist_auth.mode is none", i)
}
if (strings.TrimSpace(check.FinalURLContains) != "" || strings.TrimSpace(check.FinalURLNotContains) != "") &&
!(check.FollowRedirects || check.RequireRobotAuth) {
return fmt.Errorf("config.startup.service_checklist[%d] uses final_url_* markers without redirects enabled", i)
}
for _, code := range check.AcceptedStatuses {
if code < 100 || code > 599 {
return fmt.Errorf("config.startup.service_checklist[%d].accepted_statuses contains invalid HTTP code %d", i, code)

View File

@ -15,6 +15,9 @@ type Runner struct {
Logger *log.Logger
}
// Run runs one orchestration or CLI step.
// Signature: (r *Runner) Run(ctx context.Context, name string, args ...string) (string, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (r *Runner) Run(ctx context.Context, name string, args ...string) (string, error) {
if r.DryRun {
r.logf("DRY-RUN: %s %s", name, strings.Join(args, " "))
@ -37,11 +40,17 @@ func (r *Runner) Run(ctx context.Context, name string, args ...string) (string,
return trimmed, nil
}
// CommandExists reports whether exec.LookPath resolves name without an error.
// Signature: (r *Runner) CommandExists(name string) bool.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (r *Runner) CommandExists(name string) bool {
	// Only the lookup outcome matters; the resolved path is discarded.
	_, lookupErr := exec.LookPath(name)
	found := lookupErr == nil
	return found
}
// logf writes a formatted message to r.Logger when a logger is configured.
// Signature: (r *Runner) logf(format string, args ...any).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (r *Runner) logf(format string, args ...any) {
if r.Logger != nil {
r.Logger.Printf(format, args...)

View File

@ -0,0 +1,53 @@
package execx
import (
"context"
"strings"
"testing"
)
// TestRunnerRunFailureWithoutOutput runs `sh -c "exit 3"` through a zero-value Runner and
// expects a non-nil error together with an empty output string.
// Signature: TestRunnerRunFailureWithoutOutput(t *testing.T).
// Why: covers error branch where command fails without producing output.
func TestRunnerRunFailureWithoutOutput(t *testing.T) {
	runner := new(Runner)
	stdout, runErr := runner.Run(context.Background(), "sh", "-c", "exit 3")
	switch {
	case runErr == nil:
		t.Fatalf("expected failure")
	case stdout != "":
		t.Fatalf("expected empty output, got %q", stdout)
	}
}
// TestRunnerLogfNoLogger calls logf on a Runner that has no Logger set; the test
// passes as long as the call returns without panicking.
// Signature: TestRunnerLogfNoLogger(t *testing.T).
// Why: covers no-op logging path.
func TestRunnerLogfNoLogger(t *testing.T) {
	(&Runner{}).logf("hello %s", "world")
}
// TestRunnerCommandMissing expects CommandExists to report false for a command name
// that should not be installed anywhere.
// Signature: TestRunnerCommandMissing(t *testing.T).
// Why: covers false branch of command existence checks.
func TestRunnerCommandMissing(t *testing.T) {
	const bogusCommand = "definitely-not-a-real-command-ananke"
	if found := new(Runner).CommandExists(bogusCommand); found {
		t.Fatalf("expected missing command to be false")
	}
}
// TestRunnerInjectsKubeconfigEnv runs a shell command that prints $KUBECONFIG through a
// Runner with Kubeconfig set and expects the configured path to come back.
// Signature: TestRunnerInjectsKubeconfigEnv(t *testing.T).
// Why: covers kubeconfig environment injection branch in command runner.
func TestRunnerInjectsKubeconfigEnv(t *testing.T) {
	const kubeconfigPath = "/tmp/test-kubeconfig"

	runner := &Runner{Kubeconfig: kubeconfigPath}
	stdout, runErr := runner.Run(context.Background(), "sh", "-c", "printf %s \"$KUBECONFIG\"")
	if runErr != nil {
		t.Fatalf("runner command failed: %v", runErr)
	}
	if seen := strings.TrimSpace(stdout); seen != kubeconfigPath {
		t.Fatalf("expected kubeconfig env to propagate, got %q", stdout)
	}
}

View File

@ -0,0 +1,68 @@
package execx
import (
"bytes"
"context"
"log"
"strings"
"testing"
)
// TestRunnerDryRun expects a DryRun Runner to return no error and empty output, and to
// log a "DRY-RUN: echo hello" entry instead.
// Signature: TestRunnerDryRun(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestRunnerDryRun(t *testing.T) {
	logSink := new(bytes.Buffer)
	runner := &Runner{
		Logger: log.New(logSink, "", 0),
		DryRun: true,
	}

	stdout, runErr := runner.Run(context.Background(), "echo", "hello")
	if runErr != nil {
		t.Fatalf("dry-run should not fail: %v", runErr)
	}
	if stdout != "" {
		t.Fatalf("expected empty dry-run output, got %q", stdout)
	}
	if logged := logSink.String(); !strings.Contains(logged, "DRY-RUN: echo hello") {
		t.Fatalf("expected dry-run log entry, got %q", logged)
	}
}
// TestRunnerRunSuccess runs `sh -c "printf ok"` through a zero-value Runner and expects
// a nil error with output exactly "ok".
// Signature: TestRunnerRunSuccess(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestRunnerRunSuccess(t *testing.T) {
	runner := new(Runner)
	stdout, runErr := runner.Run(context.Background(), "sh", "-c", "printf ok")
	switch {
	case runErr != nil:
		t.Fatalf("expected command success: %v", runErr)
	case stdout != "ok":
		t.Fatalf("expected output ok, got %q", stdout)
	}
}
// TestRunnerRunFailureIncludesOutput runs a command that writes "boom" to stderr and exits 1,
// and expects a non-nil error whose accompanying output trims to "boom".
// Signature: TestRunnerRunFailureIncludesOutput(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestRunnerRunFailureIncludesOutput(t *testing.T) {
	runner := new(Runner)
	captured, runErr := runner.Run(context.Background(), "sh", "-c", "echo boom >&2; exit 1")
	if runErr == nil {
		t.Fatalf("expected command failure")
	}
	if trimmed := strings.TrimSpace(captured); trimmed != "boom" {
		t.Fatalf("expected stderr to be preserved, got %q", captured)
	}
}
// TestRunnerCommandExists expects CommandExists to report true for "sh".
// Signature: TestRunnerCommandExists(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestRunnerCommandExists(t *testing.T) {
	if found := new(Runner).CommandExists("sh"); !found {
		t.Fatalf("expected shell command to exist")
	}
}

View File

@ -3,6 +3,7 @@ package metrics
import (
"fmt"
"net/http"
"os"
"sort"
"strings"
"sync"
@ -35,18 +36,27 @@ type Exporter struct {
samples map[string]Sample
}
// New returns a pointer to an Exporter whose samples map is allocated and empty.
// Signature: New() *Exporter.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func New() *Exporter {
return &Exporter{
samples: make(map[string]Sample),
}
}
// UpdateBudget stores seconds in e.shutdownBudgetSec while holding e.mu's write lock.
// Signature: (e *Exporter) UpdateBudget(seconds int).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (e *Exporter) UpdateBudget(seconds int) {
e.mu.Lock()
defer e.mu.Unlock()
e.shutdownBudgetSec = seconds
}
// UpdateSample runs one orchestration or CLI step.
// Signature: (e *Exporter) UpdateSample(s Sample).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (e *Exporter) UpdateSample(s Sample) {
e.mu.Lock()
defer e.mu.Unlock()
@ -56,6 +66,9 @@ func (e *Exporter) UpdateSample(s Sample) {
e.samples[s.Name] = s
}
// MarkShutdown runs one orchestration or CLI step.
// Signature: (e *Exporter) MarkShutdown(reason string).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (e *Exporter) MarkShutdown(reason string) {
e.mu.Lock()
defer e.mu.Unlock()
@ -64,6 +77,9 @@ func (e *Exporter) MarkShutdown(reason string) {
e.lastShutdownAt = time.Now().UTC()
}
// Handler runs one orchestration or CLI step.
// Signature: (e *Exporter) Handler(path string) http.Handler.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (e *Exporter) Handler(path string) http.Handler {
mux := http.NewServeMux()
metricsPath := path
@ -78,6 +94,9 @@ func (e *Exporter) Handler(path string) http.Handler {
return mux
}
// serveMetrics runs one orchestration or CLI step.
// Signature: (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
e.mu.RLock()
defer e.mu.RUnlock()
@ -145,10 +164,40 @@ func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
}
b.WriteString(fmt.Sprintf("ananke_ups_error%s %d\n", labels, boolNum(s.LastError != "")))
}
appendQualityGateMetrics(&b)
_, _ = w.Write([]byte(b.String()))
}
// appendQualityGateMetrics appends the trimmed contents of the quality-gate metrics file to dst.
// Signature: appendQualityGateMetrics(dst *strings.Builder).
// Why: quality-gate pass/fail telemetry should appear alongside UPS metrics so
// Grafana can track Ananke suite health over time.
//
// The file path is read from ANANKE_QUALITY_METRICS_FILE and falls back to
// /var/lib/ananke/quality-gate.prom when that variable is unset or blank. An
// unreadable or whitespace-only file leaves dst untouched.
func appendQualityGateMetrics(dst *strings.Builder) {
	path := strings.TrimSpace(os.Getenv("ANANKE_QUALITY_METRICS_FILE"))
	if path == "" {
		path = "/var/lib/ananke/quality-gate.prom"
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		// Best effort: a missing or unreadable file must not break the caller's output.
		return
	}
	text := strings.TrimSpace(string(raw))
	if text == "" {
		return
	}
	if dst.Len() > 0 {
		// Separate the appended block from whatever was rendered before it.
		dst.WriteString("\n")
	}
	dst.WriteString(text)
	// text was trimmed above, so it can never end in a newline; always
	// terminate the final line.
	dst.WriteString("\n")
}
// boolNum converts a bool to an int: 1 for true, 0 for false.
// Signature: boolNum(v bool) int.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func boolNum(v bool) int {
if v {
return 1
@ -156,6 +205,9 @@ func boolNum(v bool) int {
return 0
}
// safe escapes backslashes and then double quotes in a string by prefixing each with a backslash.
// Signature: safe(in string) string.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func safe(in string) string {
out := strings.ReplaceAll(in, "\\", "\\\\")
return strings.ReplaceAll(out, "\"", "\\\"")

View File

@ -0,0 +1,86 @@
package metrics
import (
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
)
// TestExporterHealthzAndEscaping requests /healthz and a custom metrics path from one
// handler, expecting an "ok" health body and escaped source/target labels plus an
// ananke_ups_error line in the metrics body.
// Signature: TestExporterHealthzAndEscaping(t *testing.T).
// Why: covers health endpoint and label escaping branches in metrics renderer.
func TestExporterHealthzAndEscaping(t *testing.T) {
	exporter := New()
	exporter.UpdateSample(Sample{
		Name:      `Sta"tera`,
		Target:    `statera\host`,
		Status:    `O"B`,
		LastError: "x",
	})
	handler := exporter.Handler("/custom")

	// fetch issues a GET against the shared handler and returns the recorded response.
	fetch := func(target string) *httptest.ResponseRecorder {
		req := httptest.NewRequest(http.MethodGet, target, nil)
		rec := httptest.NewRecorder()
		handler.ServeHTTP(rec, req)
		return rec
	}

	health := fetch("/healthz")
	healthy := health.Code == http.StatusOK && strings.TrimSpace(health.Body.String()) == "ok"
	if !healthy {
		t.Fatalf("unexpected health response: code=%d body=%q", health.Code, health.Body.String())
	}

	body := fetch("/custom").Body.String()
	if !strings.Contains(body, `source="Sta\\\"tera"`) {
		t.Fatalf("expected escaped source label, got:\n%s", body)
	}
	if !strings.Contains(body, `target="statera\\\\host"`) {
		t.Fatalf("expected escaped target label, got:\n%s", body)
	}
	if !strings.Contains(body, "ananke_ups_error") {
		t.Fatalf("expected error metric line in output")
	}
}
// TestBoolNumAndSafeHelpers expects boolNum to map true/false to 1/0 and safe to
// escape both the double quote and the backslash in `a"b\c`.
// Signature: TestBoolNumAndSafeHelpers(t *testing.T).
// Why: directly covers remaining helper branches.
func TestBoolNumAndSafeHelpers(t *testing.T) {
	if on, off := boolNum(true), boolNum(false); on != 1 || off != 0 {
		t.Fatalf("unexpected boolNum values")
	}

	escaped := safe(`a"b\c`)
	if escaped != `a\"b\\c` {
		t.Fatalf("unexpected escaped string: %q", escaped)
	}
}
// TestExporterAppendsQualityGateMetrics writes a quality-gate metrics file, points
// ANANKE_QUALITY_METRICS_FILE at it, and expects the "ok" counter line to appear in
// the exporter's /metrics response.
// Signature: TestExporterAppendsQualityGateMetrics(t *testing.T).
// Why: verifies quality-gate metrics are surfaced on /metrics for Grafana suite
// pass-rate tracking.
func TestExporterAppendsQualityGateMetrics(t *testing.T) {
	const okLine = `ananke_quality_gate_runs_total{suite="ananke",status="ok"} 10`

	fixture := "# HELP ananke_quality_gate_runs_total Total quality gate runs by status.\n" +
		"# TYPE ananke_quality_gate_runs_total counter\n" +
		okLine + "\n" +
		`ananke_quality_gate_runs_total{suite="ananke",status="failed"} 2` + "\n"

	metricsPath := filepath.Join(t.TempDir(), "quality-gate.prom")
	if writeErr := os.WriteFile(metricsPath, []byte(fixture), 0o600); writeErr != nil {
		t.Fatalf("write quality metrics file: %v", writeErr)
	}
	t.Setenv("ANANKE_QUALITY_METRICS_FILE", metricsPath)

	exporter := New()
	recorder := httptest.NewRecorder()
	exporter.Handler("/metrics").ServeHTTP(recorder, httptest.NewRequest(http.MethodGet, "/metrics", nil))

	if body := recorder.Body.String(); !strings.Contains(body, okLine) {
		t.Fatalf("expected quality gate metrics appended to exporter output, got:\n%s", body)
	}
}

View File

@ -7,6 +7,9 @@ import (
"time"
)
// TestExporterEmitsCoreMetrics runs one orchestration or CLI step.
// Signature: TestExporterEmitsCoreMetrics(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestExporterEmitsCoreMetrics(t *testing.T) {
e := New()
e.UpdateBudget(321)

View File

@ -34,6 +34,19 @@ type Daemon struct {
exporter *metrics.Exporter
}
// sshConfigCandidates lists fallback SSH config paths, in priority order.
// resolveSSHConfigFile walks this list when cfg.SSHConfigFile is blank and returns
// the first entry that exists and is not a directory.
var sshConfigCandidates = []string{
"/home/atlas/.ssh/config",
"/home/tethys/.ssh/config",
}
// sshIdentityCandidates lists fallback SSH identity (private key) paths, in priority order.
// resolveSSHIdentityFile walks this list when cfg.SSHIdentityFile is blank and returns
// the first entry that exists and is not a directory.
var sshIdentityCandidates = []string{
"/home/atlas/.ssh/id_ed25519",
"/home/tethys/.ssh/id_ed25519",
}
// NewDaemon runs one orchestration or CLI step.
// Signature: NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon {
return &Daemon{
cfg: cfg,
@ -44,6 +57,9 @@ func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target,
}
}
// Run runs one orchestration or CLI step.
// Signature: (d *Daemon) Run(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (d *Daemon) Run(ctx context.Context) error {
if !d.cfg.UPS.Enabled {
return fmt.Errorf("ups monitoring is disabled in config")
@ -152,6 +168,9 @@ func (d *Daemon) Run(ctx context.Context) error {
}
}
// triggerShutdown runs one orchestration or CLI step.
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
intent, err := state.ReadIntent(d.cfg.State.IntentPath)
if err == nil && intent.State == state.IntentShuttingDown {
@ -190,6 +209,9 @@ func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
return nil
}
// forwardShutdown runs one orchestration or CLI step.
// Signature: (d *Daemon) forwardShutdown(ctx context.Context, reason string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
if timeout <= 0 {
@ -280,15 +302,14 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
return nil
}
// resolveSSHConfigFile returns the trimmed configured SSH config path, else the first sshConfigCandidates entry that exists and is not a directory, else "".
// Signature: (d *Daemon) resolveSSHConfigFile() string.
// Why: explicit configuration wins, while well-known locations still work when nothing is configured.
func (d *Daemon) resolveSSHConfigFile() string {
if strings.TrimSpace(d.cfg.SSHConfigFile) != "" {
return strings.TrimSpace(d.cfg.SSHConfigFile)
}
candidates := []string{
"/home/atlas/.ssh/config",
"/home/tethys/.ssh/config",
}
for _, p := range candidates {
for _, p := range sshConfigCandidates {
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
return p
}
@ -296,15 +317,14 @@ func (d *Daemon) resolveSSHConfigFile() string {
return ""
}
// resolveSSHIdentityFile returns the trimmed configured SSH identity path, else the first sshIdentityCandidates entry that exists and is not a directory, else "".
// Signature: (d *Daemon) resolveSSHIdentityFile() string.
// Why: explicit configuration wins, while well-known key locations still work when nothing is configured.
func (d *Daemon) resolveSSHIdentityFile() string {
if strings.TrimSpace(d.cfg.SSHIdentityFile) != "" {
return strings.TrimSpace(d.cfg.SSHIdentityFile)
}
candidates := []string{
"/home/atlas/.ssh/id_ed25519",
"/home/tethys/.ssh/id_ed25519",
}
for _, p := range candidates {
for _, p := range sshIdentityCandidates {
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
return p
}
@ -312,6 +332,9 @@ func (d *Daemon) resolveSSHIdentityFile() string {
return ""
}
// targetList renders the daemon's targets as a single comma-separated string.
// Signature: (d *Daemon) targetList() string.
// Why: provides one compact, human-readable summary of the monitored targets.
func (d *Daemon) targetList() string {
names := make([]string, 0, len(d.targets))
for _, t := range d.targets {
@ -320,6 +343,9 @@ func (d *Daemon) targetList() string {
return strings.Join(names, ",")
}
// startMetricsServer starts the metrics endpoint and returns an error when metrics.bind_addr is empty.
// Signature: (d *Daemon) startMetricsServer() error.
// Why: an empty bind address is a configuration mistake that should surface before serving is attempted.
func (d *Daemon) startMetricsServer() error {
if d.cfg.Metrics.BindAddr == "" {
return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled")

View File

@ -0,0 +1,255 @@
package service
import (
"context"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/metrics"
"scm.bstein.dev/bstein/ananke/internal/state"
"scm.bstein.dev/bstein/ananke/internal/ups"
)
// daemonFakeProvider is a scripted UPS provider for daemon tests.
// Each Read call consumes the next scripted error or sample in order.
type daemonFakeProvider struct {
samples []ups.Sample // samples handed out by successive Read calls; the last one repeats once exhausted
errs []error // per-call errors; a non-nil entry at the current index is returned instead of a sample
idx int // index of the next scripted entry
}
// Read returns the next scripted result: a non-nil scripted error for the current index wins,
// then the sample at that index, then the final sample repeated, and finally
// context.DeadlineExceeded when no samples were scripted at all.
// Signature: (p *daemonFakeProvider) Read(ctx context.Context) (ups.Sample, error).
// Why: daemon tests need deterministic telemetry/error sequencing without real UPS I/O.
func (p *daemonFakeProvider) Read(_ context.Context) (ups.Sample, error) {
	cursor := p.idx
	switch {
	case cursor < len(p.errs) && p.errs[cursor] != nil:
		// A scripted failure consumes this slot.
		p.idx = cursor + 1
		return ups.Sample{}, p.errs[cursor]
	case cursor < len(p.samples):
		p.idx = cursor + 1
		return p.samples[cursor], nil
	case len(p.samples) > 0:
		// Script exhausted: keep reporting the last known sample without advancing.
		return p.samples[len(p.samples)-1], nil
	default:
		return ups.Sample{}, context.DeadlineExceeded
	}
}
// newDaemonTestOrchestrator builds a dry-run cluster.Orchestrator whose state files all live under stateDir.
// Signature: newDaemonTestOrchestrator(t *testing.T, stateDir string) *cluster.Orchestrator.
// Why: daemon tests share one minimal fixture instead of repeating orchestrator wiring.
func newDaemonTestOrchestrator(t *testing.T, stateDir string) *cluster.Orchestrator {
	t.Helper()

	// Each consumer gets its own silent logger.
	quiet := func() *log.Logger { return log.New(io.Discard, "", 0) }
	under := func(name string) string { return filepath.Join(stateDir, name) }

	orchCfg := config.Config{
		ControlPlanes:   []string{"titan-0a"},
		Workers:         []string{"titan-22"},
		SSHUser:         "atlas",
		SSHPort:         2277,
		SSHManagedNodes: []string{"titan-0a", "titan-22"},
		SSHNodeHosts: map[string]string{
			"titan-0a": "192.168.22.11",
			"titan-22": "192.168.22.22",
		},
		State: config.State{
			Dir:            stateDir,
			ReportsDir:     under("reports"),
			RunHistoryPath: under("runs.json"),
			LockPath:       under("ananke.lock"),
			IntentPath:     under("intent.json"),
		},
		Shutdown: config.Shutdown{
			EmergencySkipDrain: true,
			EmergencySkipEtcd:  true,
		},
	}

	// DryRun is set on the runner so the fixture is built in dry-run mode.
	runner := &execx.Runner{DryRun: true, Logger: quiet()}
	history := state.New(under("runs.json"))
	return cluster.New(orchCfg, runner, history, quiet())
}
// TestDaemonRunTriggersShutdownOnLowBattery verifies Run returns nil after a single on-battery, low-battery sample with a debounce count of one.
// Signature: TestDaemonRunTriggersShutdownOnLowBattery(t *testing.T).
// Why: covers the main daemon loop path that triggers shutdown once the debounce threshold is met.
func TestDaemonRunTriggersShutdownOnLowBattery(t *testing.T) {
	dir := t.TempDir()
	orchestrator := newDaemonTestOrchestrator(t, dir)

	lowBattery := ups.Sample{OnBattery: true, LowBattery: true, RuntimeSeconds: 30, RawStatus: "OB LB"}
	provider := &daemonFakeProvider{samples: []ups.Sample{lowBattery}}

	daemonCfg := config.Config{
		UPS: config.UPS{
			Enabled:             true,
			PollSeconds:         1,
			DebounceCount:       1,
			RuntimeSafetyFactor: 1.0,
		},
		State: config.State{
			IntentPath: filepath.Join(dir, "intent.json"),
		},
		Shutdown: config.Shutdown{
			EmergencySkipDrain: true,
			EmergencySkipEtcd:  true,
		},
	}
	daemon := &Daemon{
		cfg:  daemonCfg,
		orch: orchestrator,
		targets: []Target{
			{Name: "Pyrphoros", Target: "pyrphoros@localhost", Provider: provider},
		},
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	// The timeout only bounds the test; the expected exit is a completed shutdown.
	runCtx, stop := context.WithTimeout(context.Background(), 4*time.Second)
	defer stop()

	err := daemon.Run(runCtx)
	if err != nil {
		t.Fatalf("expected daemon to trigger and complete shutdown, got %v", err)
	}
}
// TestDaemonRunTriggersShutdownOnTelemetryTimeout verifies Run returns nil when one on-battery sample is followed by repeated read timeouts.
// Signature: TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T).
// Why: covers the telemetry-timeout trigger path while the UPS remains on battery.
func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
	dir := t.TempDir()
	orchestrator := newDaemonTestOrchestrator(t, dir)

	// First read succeeds (on battery, not low), the next three reads time out.
	provider := &daemonFakeProvider{
		samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
		errs:    []error{nil, context.DeadlineExceeded, context.DeadlineExceeded, context.DeadlineExceeded},
	}

	daemonCfg := config.Config{
		UPS: config.UPS{
			Enabled:                 true,
			PollSeconds:             1,
			DebounceCount:           3,
			RuntimeSafetyFactor:     1.0,
			TelemetryTimeoutSeconds: 1,
		},
		State: config.State{
			IntentPath: filepath.Join(dir, "intent.json"),
		},
		Shutdown: config.Shutdown{
			EmergencySkipDrain: true,
			EmergencySkipEtcd:  true,
		},
	}
	daemon := &Daemon{
		cfg:  daemonCfg,
		orch: orchestrator,
		targets: []Target{
			{Name: "Statera", Target: "statera@localhost", Provider: provider},
		},
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	runCtx, stop := context.WithTimeout(context.Background(), 6*time.Second)
	defer stop()

	err := daemon.Run(runCtx)
	if err != nil {
		t.Fatalf("expected telemetry-timeout shutdown path to complete, got %v", err)
	}
}
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
// Why: covers forward-shutdown SSH execution path.
func TestForwardShutdownSucceedsWithSSHShim(t *testing.T) {
tmp := t.TempDir()
sshPath := filepath.Join(tmp, "ssh")
script := `#!/usr/bin/env bash
set -euo pipefail
echo forwarded
`
if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
t.Fatalf("write fake ssh: %v", err)
}
t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
d := &Daemon{
cfg: config.Config{
SSHUser: "atlas",
SSHPort: 2277,
Coordination: config.Coordination{
ForwardShutdownHost: "titan-db",
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
CommandTimeoutSeconds: 5,
},
},
log: log.New(io.Discard, "", 0),
}
if err := d.forwardShutdown(context.Background(), "test-forward"); err != nil {
t.Fatalf("forwardShutdown failed: %v", err)
}
}
// TestForwardShutdownFailsWhenSSHFailsAndNoRecovery runs one orchestration or CLI step.
// Signature: TestForwardShutdownFailsWhenSSHFailsAndNoRecovery(t *testing.T).
// Why: covers forwarded shutdown error propagation branch.
func TestForwardShutdownFailsWhenSSHFailsAndNoRecovery(t *testing.T) {
tmp := t.TempDir()
sshPath := filepath.Join(tmp, "ssh")
script := `#!/usr/bin/env bash
set -euo pipefail
echo "permission denied" >&2
exit 255
`
if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
t.Fatalf("write fake ssh: %v", err)
}
t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))
d := &Daemon{
cfg: config.Config{
SSHUser: "atlas",
SSHPort: 2277,
Coordination: config.Coordination{
ForwardShutdownHost: "titan-db",
ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
CommandTimeoutSeconds: 5,
},
},
log: log.New(io.Discard, "", 0),
}
err := d.forwardShutdown(context.Background(), "test-fail")
if err == nil {
t.Fatalf("expected forwardShutdown error")
}
if !strings.Contains(strings.ToLower(err.Error()), "forward shutdown via ssh failed") {
t.Fatalf("unexpected error: %v", err)
}
}
// TestStartMetricsServerSuccess runs one orchestration or CLI step.
// Signature: TestStartMetricsServerSuccess(t *testing.T).
// Why: covers successful metrics server startup branch.
func TestStartMetricsServerSuccess(t *testing.T) {
d := &Daemon{
cfg: config.Config{
Metrics: config.Metrics{
Enabled: true,
BindAddr: "127.0.0.1:0",
Path: "/metrics",
},
},
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
if err := d.startMetricsServer(); err != nil {
t.Fatalf("startMetricsServer failed: %v", err)
}
}

View File

@ -0,0 +1,421 @@
package service
import (
"context"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/metrics"
"scm.bstein.dev/bstein/ananke/internal/state"
"scm.bstein.dev/bstein/ananke/internal/ups"
)
// TestNewDaemonInitializesExporter verifies NewDaemon returns a non-nil Daemon whose exporter is set.
// Signature: TestNewDaemonInitializesExporter(t *testing.T).
// Why: covers the constructor so daemon initialization contracts stay explicit.
func TestNewDaemonInitializesExporter(t *testing.T) {
	silent := log.New(io.Discard, "", 0)
	daemon := NewDaemon(config.Config{}, nil, nil, silent)

	initialized := daemon != nil && daemon.exporter != nil
	if !initialized {
		t.Fatalf("expected NewDaemon to initialize exporter")
	}
}
// TestTriggerShutdownForwardSuccessSetsForwardedIntent verifies a successful forward leaves an IntentShutdownComplete intent with source "daemon-forwarded".
// Signature: TestTriggerShutdownForwardSuccessSetsForwardedIntent(t *testing.T).
// Why: covers the forwarded shutdown happy path and its completion intent semantics.
func TestTriggerShutdownForwardSuccessSetsForwardedIntent(t *testing.T) {
	dir := t.TempDir()
	shim := []byte("#!/usr/bin/env bash\nset -euo pipefail\necho forwarded\n")
	if err := os.WriteFile(filepath.Join(dir, "ssh"), shim, 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	t.Setenv("PATH", dir+":"+os.Getenv("PATH"))

	intentFile := filepath.Join(dir, "intent.json")
	daemon := &Daemon{
		cfg: config.Config{
			SSHUser: "atlas",
			SSHPort: 2277,
			State:   config.State{IntentPath: intentFile},
			Coordination: config.Coordination{
				ForwardShutdownHost:   "titan-db",
				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
				CommandTimeoutSeconds: 3,
			},
		},
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	if err := daemon.triggerShutdown(context.Background(), "test-forward-success"); err != nil {
		t.Fatalf("triggerShutdown forward success failed: %v", err)
	}

	recorded, err := state.ReadIntent(intentFile)
	if err != nil {
		t.Fatalf("read forward completion intent: %v", err)
	}
	forwardedComplete := recorded.State == state.IntentShutdownComplete && recorded.Source == "daemon-forwarded"
	if !forwardedComplete {
		t.Fatalf("unexpected forward completion intent: %+v", recorded)
	}
}
// TestTriggerShutdownForwardFailureWithoutFallback verifies triggerShutdown returns a "forward shutdown failed" error when ssh fails and local fallback is off.
// Signature: TestTriggerShutdownForwardFailureWithoutFallback(t *testing.T).
// Why: covers the explicit failure branch when forwarding is required and local fallback is disabled.
func TestTriggerShutdownForwardFailureWithoutFallback(t *testing.T) {
	dir := t.TempDir()
	shim := []byte("#!/usr/bin/env bash\nset -euo pipefail\necho denied >&2\nexit 255\n")
	if err := os.WriteFile(filepath.Join(dir, "ssh"), shim, 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	t.Setenv("PATH", dir+":"+os.Getenv("PATH"))

	daemon := &Daemon{
		cfg: config.Config{
			SSHUser: "atlas",
			SSHPort: 2277,
			State:   config.State{IntentPath: filepath.Join(dir, "intent.json")},
			Coordination: config.Coordination{
				ForwardShutdownHost:   "titan-db",
				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
				FallbackLocalShutdown: false,
				CommandTimeoutSeconds: 3,
			},
		},
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	err := daemon.triggerShutdown(context.Background(), "test-forward-fail")
	failedAsExpected := err != nil && strings.Contains(err.Error(), "forward shutdown failed")
	if !failedAsExpected {
		t.Fatalf("expected forward failure without fallback, got %v", err)
	}
}
// TestTriggerShutdownForwardFailureFallsBackToLocal verifies a failed forward with fallback enabled ends with an IntentShutdownComplete intent from source "daemon-local".
// Signature: TestTriggerShutdownForwardFailureFallsBackToLocal(t *testing.T).
// Why: covers the fallback branch where local shutdown is used after forwarding fails.
func TestTriggerShutdownForwardFailureFallsBackToLocal(t *testing.T) {
	dir := t.TempDir()
	shim := []byte("#!/usr/bin/env bash\nset -euo pipefail\necho denied >&2\nexit 255\n")
	if err := os.WriteFile(filepath.Join(dir, "ssh"), shim, 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	t.Setenv("PATH", dir+":"+os.Getenv("PATH"))

	orchestrator := newDaemonTestOrchestrator(t, dir)
	intentFile := filepath.Join(dir, "intent.json")
	daemon := &Daemon{
		cfg: config.Config{
			SSHUser: "atlas",
			SSHPort: 2277,
			State:   config.State{IntentPath: intentFile},
			Shutdown: config.Shutdown{
				EmergencySkipDrain: true,
				EmergencySkipEtcd:  true,
			},
			Coordination: config.Coordination{
				ForwardShutdownHost:   "titan-db",
				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
				FallbackLocalShutdown: true,
				CommandTimeoutSeconds: 3,
			},
		},
		orch:     orchestrator,
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	if err := daemon.triggerShutdown(context.Background(), "test-forward-fallback"); err != nil {
		t.Fatalf("triggerShutdown fallback local failed: %v", err)
	}

	recorded, err := state.ReadIntent(intentFile)
	if err != nil {
		t.Fatalf("read local completion intent: %v", err)
	}
	localComplete := recorded.State == state.IntentShutdownComplete && recorded.Source == "daemon-local"
	if !localComplete {
		t.Fatalf("unexpected local completion intent: %+v", recorded)
	}
}
// TestForwardShutdownBuildsJumpArgs verifies the ssh invocation carries the config file, identity file, jump host, port, and per-node user@host overrides.
// Signature: TestForwardShutdownBuildsJumpArgs(t *testing.T).
// Why: covers jump-host argument construction branches in forward shutdown transport.
func TestForwardShutdownBuildsJumpArgs(t *testing.T) {
	tmp := t.TempDir()
	argsOut := filepath.Join(tmp, "args.txt")
	sshPath := filepath.Join(tmp, "ssh")
	// The fake ssh records its arguments on one line. The redirect target is
	// quoted so a temp directory containing spaces does not break the shim.
	script := "#!/usr/bin/env bash\nset -euo pipefail\nprintf '%s\n' \"$*\" > \"" + argsOut + "\"\n"
	if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	t.Setenv("PATH", tmp+":"+os.Getenv("PATH"))

	d := &Daemon{
		cfg: config.Config{
			SSHUser:         "atlas",
			SSHPort:         2277,
			SSHConfigFile:   "/tmp/custom-config",
			SSHIdentityFile: "/tmp/custom-key",
			SSHJumpHost:     "titan-jh",
			SSHJumpUser:     "jump",
			SSHNodeHosts: map[string]string{
				"titan-db": "10.0.0.5",
			},
			SSHNodeUsers: map[string]string{
				"titan-db": "dbadmin",
			},
			Coordination: config.Coordination{
				ForwardShutdownHost:   "titan-db",
				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
				CommandTimeoutSeconds: 3,
			},
		},
		log: log.New(io.Discard, "", 0),
	}
	if err := d.forwardShutdown(context.Background(), "args-check"); err != nil {
		t.Fatalf("forwardShutdown with jump args failed: %v", err)
	}

	raw, err := os.ReadFile(argsOut)
	if err != nil {
		t.Fatalf("read ssh args output: %v", err)
	}
	out := string(raw)
	// Each fragment must appear somewhere in the recorded argument line.
	for _, want := range []string{"-F /tmp/custom-config", "-i /tmp/custom-key", "-J jump@titan-jh:2277", "-p 2277", "dbadmin@10.0.0.5"} {
		if !strings.Contains(out, want) {
			t.Fatalf("expected ssh args to include %q, got %q", want, out)
		}
	}
}
// TestStartMetricsServerInvalidBindLogsErrorPath runs one orchestration or CLI step.
// Signature: TestStartMetricsServerInvalidBindLogsErrorPath(t *testing.T).
// Why: exercises goroutine listen failure branch so metrics startup diagnostics remain covered.
func TestStartMetricsServerInvalidBindLogsErrorPath(t *testing.T) {
d := &Daemon{
cfg: config.Config{
Metrics: config.Metrics{
Enabled: true,
BindAddr: "127.0.0.1:not-a-port",
Path: "/metrics",
},
},
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
if err := d.startMetricsServer(); err != nil {
t.Fatalf("startMetricsServer should return nil after goroutine spawn, got %v", err)
}
time.Sleep(25 * time.Millisecond)
}
// TestResolveSSHPathCandidatesFromOverrides verifies both resolvers return the candidate path when nothing is configured and the candidate file exists.
// Signature: TestResolveSSHPathCandidatesFromOverrides(t *testing.T).
// Why: covers candidate-path discovery branches without requiring writes under /home.
func TestResolveSSHPathCandidatesFromOverrides(t *testing.T) {
	dir := t.TempDir()
	fakeConfig := filepath.Join(dir, "config")
	fakeKey := filepath.Join(dir, "id_ed25519")

	if err := os.WriteFile(fakeConfig, []byte("Host *\n"), 0o600); err != nil {
		t.Fatalf("write fake config candidate: %v", err)
	}
	if err := os.WriteFile(fakeKey, []byte("fake-key"), 0o600); err != nil {
		t.Fatalf("write fake key candidate: %v", err)
	}

	// Swap the package-level candidate lists and put them back afterwards.
	savedConfigs, savedKeys := sshConfigCandidates, sshIdentityCandidates
	t.Cleanup(func() {
		sshConfigCandidates = savedConfigs
		sshIdentityCandidates = savedKeys
	})
	sshConfigCandidates = []string{fakeConfig}
	sshIdentityCandidates = []string{fakeKey}

	daemon := &Daemon{cfg: config.Config{}}

	resolvedConfig := daemon.resolveSSHConfigFile()
	if resolvedConfig != fakeConfig {
		t.Fatalf("expected config candidate path %q, got %q", fakeConfig, resolvedConfig)
	}
	resolvedKey := daemon.resolveSSHIdentityFile()
	if resolvedKey != fakeKey {
		t.Fatalf("expected key candidate path %q, got %q", fakeKey, resolvedKey)
	}
}
// TestForwardShutdownKnownHostsRepairRetry verifies forwardShutdown succeeds when ssh fails once with a host-identification error and succeeds on the next call.
// Signature: TestForwardShutdownKnownHostsRepairRetry(t *testing.T).
// Why: covers the known-hosts-repair retry branch in forwarded shutdown transport.
func TestForwardShutdownKnownHostsRepairRetry(t *testing.T) {
	dir := t.TempDir()
	attemptMarker := filepath.Join(dir, "attempt")

	// The fake ssh fails on its first invocation only; the marker file records that attempt.
	sshShim := `#!/usr/bin/env bash
set -euo pipefail
marker="` + attemptMarker + `"
if [[ ! -f "$marker" ]]; then
echo "REMOTE HOST IDENTIFICATION HAS CHANGED!" >&2
touch "$marker"
exit 255
fi
echo "forwarded"
`
	if err := os.WriteFile(filepath.Join(dir, "ssh"), []byte(sshShim), 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	keygenShim := []byte("#!/usr/bin/env bash\nset -euo pipefail\nexit 0\n")
	if err := os.WriteFile(filepath.Join(dir, "ssh-keygen"), keygenShim, 0o755); err != nil {
		t.Fatalf("write fake ssh-keygen: %v", err)
	}
	keyscanShim := []byte("#!/usr/bin/env bash\nset -euo pipefail\necho fake-key\n")
	if err := os.WriteFile(filepath.Join(dir, "ssh-keyscan"), keyscanShim, 0o755); err != nil {
		t.Fatalf("write fake ssh-keyscan: %v", err)
	}
	t.Setenv("PATH", dir+":"+os.Getenv("PATH"))

	knownHostsFile := filepath.Join(dir, "known_hosts")
	if err := os.WriteFile(knownHostsFile, []byte{}, 0o600); err != nil {
		t.Fatalf("write known_hosts file: %v", err)
	}

	daemon := &Daemon{
		cfg: config.Config{
			SSHConfigFile: knownHostsFile, // used only to derive known-hosts search path
			SSHUser:       "atlas",
			SSHPort:       2277,
			Coordination: config.Coordination{
				ForwardShutdownHost:   "titan-db",
				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
				CommandTimeoutSeconds: 3,
			},
		},
		log: log.New(io.Discard, "", 0),
	}

	err := daemon.forwardShutdown(context.Background(), "repair-retry")
	if err != nil {
		t.Fatalf("forwardShutdown known-hosts repair retry failed: %v", err)
	}
}
// TestTriggerShutdownReturnsLocalShutdownError runs one orchestration or CLI step.
// Signature: TestTriggerShutdownReturnsLocalShutdownError(t *testing.T).
// Why: covers local shutdown error propagation branch from triggerShutdown.
func TestTriggerShutdownReturnsLocalShutdownError(t *testing.T) {
tmp := t.TempDir()
intentPath := filepath.Join(tmp, "intent-dir")
if err := os.MkdirAll(intentPath, 0o755); err != nil {
t.Fatalf("mkdir intent dir: %v", err)
}
orchCfg := config.Config{
ControlPlanes: []string{"titan-db"},
Workers: []string{"titan-23"},
State: config.State{
Dir: filepath.Join(tmp, "state"),
ReportsDir: filepath.Join(tmp, "reports"),
RunHistoryPath: filepath.Join(tmp, "runs.json"),
LockPath: filepath.Join(tmp, "ananke.lock"),
IntentPath: intentPath, // directory path forces MustWriteIntent failure in Shutdown
},
}
orch := cluster.New(
orchCfg,
&execx.Runner{DryRun: false, Logger: log.New(io.Discard, "", 0)},
state.New(filepath.Join(tmp, "runs.json")),
log.New(io.Discard, "", 0),
)
d := &Daemon{
cfg: config.Config{
State: config.State{
IntentPath: intentPath,
},
Shutdown: config.Shutdown{
EmergencySkipDrain: true,
EmergencySkipEtcd: true,
},
},
orch: orch,
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
err := d.triggerShutdown(context.Background(), "local-shutdown-error")
if err == nil {
t.Fatalf("expected triggerShutdown to propagate local shutdown error")
}
}
// TestDaemonRunContextCancelNonTriggerPath verifies Run returns a non-nil error once the context expires while the UPS keeps reporting on-line power.
// Signature: TestDaemonRunContextCancelNonTriggerPath(t *testing.T).
// Why: covers steady-state non-trigger loop branches in Run until context cancellation.
func TestDaemonRunContextCancelNonTriggerPath(t *testing.T) {
	dir := t.TempDir()
	orchestrator := newDaemonTestOrchestrator(t, dir)

	onLine := ups.Sample{OnBattery: false, LowBattery: false, RuntimeSeconds: 7200, RawStatus: "OL"}
	provider := &daemonFakeProvider{samples: []ups.Sample{onLine}}

	daemon := &Daemon{
		cfg: config.Config{
			UPS: config.UPS{
				Enabled:             true,
				PollSeconds:         0, // exercise default poll fallback
				DebounceCount:       0, // exercise default debounce fallback
				RuntimeSafetyFactor: 0.5,
			},
			State: config.State{
				IntentPath: filepath.Join(dir, "intent.json"),
			},
		},
		orch: orchestrator,
		targets: []Target{
			{Name: "Pyrphoros", Target: "pyrphoros@localhost", Provider: provider},
		},
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	runCtx, stop := context.WithTimeout(context.Background(), 1100*time.Millisecond)
	defer stop()

	err := daemon.Run(runCtx)
	if err == nil {
		t.Fatalf("expected context deadline/cancel in non-trigger loop")
	}
}
// TestForwardShutdownErrorWithoutOutput verifies forwardShutdown reports "forward shutdown via ssh failed" when ssh exits 255 without printing anything.
// Signature: TestForwardShutdownErrorWithoutOutput(t *testing.T).
// Why: covers the forwardShutdown branch where ssh fails without any stderr/stdout text.
func TestForwardShutdownErrorWithoutOutput(t *testing.T) {
	dir := t.TempDir()
	shim := []byte("#!/usr/bin/env bash\nset -euo pipefail\nexit 255\n")
	if err := os.WriteFile(filepath.Join(dir, "ssh"), shim, 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	t.Setenv("PATH", dir+":"+os.Getenv("PATH"))

	daemon := &Daemon{
		cfg: config.Config{
			SSHUser: "atlas",
			Coordination: config.Coordination{
				ForwardShutdownHost:   "titan-db",
				ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
				CommandTimeoutSeconds: 3,
			},
		},
		log: log.New(io.Discard, "", 0),
	}

	err := daemon.forwardShutdown(context.Background(), "no-output-fail")
	failedAsExpected := err != nil && strings.Contains(strings.ToLower(err.Error()), "forward shutdown via ssh failed")
	if !failedAsExpected {
		t.Fatalf("expected no-output forward ssh failure, got %v", err)
	}
}

View File

@ -1,7 +1,133 @@
package service
import "testing"
import (
"context"
"io"
"log"
"path/filepath"
"strings"
"testing"
func TestPlaceholder(t *testing.T) {
// Placeholder test keeps package-level test coverage active.
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/metrics"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestDaemonRunRejectsDisabledUPS runs one orchestration or CLI step.
// Signature: TestDaemonRunRejectsDisabledUPS(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestDaemonRunRejectsDisabledUPS(t *testing.T) {
d := &Daemon{
cfg: config.Config{
UPS: config.UPS{Enabled: false},
},
log: log.New(io.Discard, "", 0),
}
if err := d.Run(context.Background()); err == nil {
t.Fatalf("expected UPS-disabled run to fail")
}
}
// TestDaemonRunRejectsMissingTargets runs one orchestration or CLI step.
// Signature: TestDaemonRunRejectsMissingTargets(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestDaemonRunRejectsMissingTargets(t *testing.T) {
d := &Daemon{
cfg: config.Config{
UPS: config.UPS{Enabled: true},
},
log: log.New(io.Discard, "", 0),
}
if err := d.Run(context.Background()); err == nil {
t.Fatalf("expected empty-target run to fail")
}
}
// TestDaemonTargetList verifies targetList output contains a "Name=Target" entry for every configured target.
// Signature: TestDaemonTargetList(t *testing.T).
// Why: keeps the rendered target summary stable for whoever reads it.
func TestDaemonTargetList(t *testing.T) {
	daemon := &Daemon{
		targets: []Target{
			{Name: "Pyrphoros", Target: "pyrphoros@localhost"},
			{Name: "Statera", Target: "statera@localhost"},
		},
	}

	rendered := daemon.targetList()
	for _, entry := range []string{"Pyrphoros=pyrphoros@localhost", "Statera=statera@localhost"} {
		if !strings.Contains(rendered, entry) {
			t.Fatalf("unexpected target list: %q", rendered)
		}
	}
}
// TestDaemonResolveSSHPathsPreferConfigured verifies both resolvers return the explicitly configured paths unchanged.
// Signature: TestDaemonResolveSSHPathsPreferConfigured(t *testing.T).
// Why: configured SSH paths must take precedence over candidate discovery.
func TestDaemonResolveSSHPathsPreferConfigured(t *testing.T) {
	daemon := &Daemon{
		cfg: config.Config{
			SSHConfigFile:   "/tmp/custom-ssh-config",
			SSHIdentityFile: "/tmp/custom-ssh-key",
		},
	}

	configPath := daemon.resolveSSHConfigFile()
	if configPath != "/tmp/custom-ssh-config" {
		t.Fatalf("unexpected config path: %q", configPath)
	}
	identityPath := daemon.resolveSSHIdentityFile()
	if identityPath != "/tmp/custom-ssh-key" {
		t.Fatalf("unexpected identity path: %q", identityPath)
	}
}
// TestStartMetricsServerRequiresBindAddress runs one orchestration or CLI step.
// Signature: TestStartMetricsServerRequiresBindAddress(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStartMetricsServerRequiresBindAddress(t *testing.T) {
d := &Daemon{
cfg: config.Config{
Metrics: config.Metrics{
Enabled: true,
BindAddr: "",
Path: "/metrics",
},
},
log: log.New(io.Discard, "", 0),
exporter: nil,
}
d.exporter = d.ensureExporterForTest()
if err := d.startMetricsServer(); err == nil {
t.Fatalf("expected missing bind address error")
}
}
// TestTriggerShutdownSkipsDuplicateWhenIntentActive verifies triggerShutdown returns nil when the stored intent is already IntentShuttingDown.
// Signature: TestTriggerShutdownSkipsDuplicateWhenIntentActive(t *testing.T).
// Why: a second trigger during an active shutdown must be ignored rather than start another one.
func TestTriggerShutdownSkipsDuplicateWhenIntentActive(t *testing.T) {
	intentFile := filepath.Join(t.TempDir(), "intent.json")

	// Seed an in-progress shutdown intent before triggering again.
	if err := state.MustWriteIntent(intentFile, state.IntentShuttingDown, "already-running", "test"); err != nil {
		t.Fatalf("seed intent: %v", err)
	}

	daemon := &Daemon{
		cfg: config.Config{
			State: config.State{IntentPath: intentFile},
		},
		log: log.New(io.Discard, "", 0),
	}
	daemon.exporter = daemon.ensureExporterForTest()

	err := daemon.triggerShutdown(context.Background(), "duplicate-check")
	if err != nil {
		t.Fatalf("expected duplicate shutdown trigger to be ignored: %v", err)
	}
}
// ensureExporterForTest returns the daemon's exporter, creating one with metrics.New first when it is nil.
// Signature: (d *Daemon) ensureExporterForTest() *metrics.Exporter.
// Why: local helper keeps test setup concise while leaving an existing exporter untouched.
func (d *Daemon) ensureExporterForTest() *metrics.Exporter {
	if d.exporter != nil {
		return d.exporter
	}
	d.exporter = metrics.New()
	return d.exporter
}

View File

@ -0,0 +1,131 @@
package sshutil
import (
"context"
"errors"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
)
// TestShouldAttemptKnownHostsRepairFalseWithoutError verifies ShouldAttemptKnownHostsRepair is false when err is nil.
// Signature: TestShouldAttemptKnownHostsRepairFalseWithoutError(t *testing.T).
// Why: ensures repair logic does not trigger when the command succeeded.
func TestShouldAttemptKnownHostsRepairFalseWithoutError(t *testing.T) {
	attempt := ShouldAttemptKnownHostsRepair("ok", nil)
	if attempt {
		t.Fatalf("expected false when no error exists")
	}
}
// TestIsHostKeyErrorRequiresErr verifies IsHostKeyError is false for host-key marker text when err is nil.
// Signature: TestIsHostKeyErrorRequiresErr(t *testing.T).
// Why: covers the guard branch that skips marker parsing when err is nil.
func TestIsHostKeyErrorRequiresErr(t *testing.T) {
	flagged := IsHostKeyError("REMOTE HOST IDENTIFICATION HAS CHANGED", nil)
	if flagged {
		t.Fatalf("expected false when err is nil")
	}
}
// TestRepairKnownHostsRemovesEntries verifies RepairKnownHosts drops every titan-0a entry from a known_hosts file and keeps the titan-0b entry.
// Signature: TestRepairKnownHostsRemovesEntries(t *testing.T).
// Why: validates that the known_hosts repair path actually removes target entries.
// NOTE(review): presumably needs a working ssh-keygen on PATH — confirm for minimal CI images.
func TestRepairKnownHostsRemovesEntries(t *testing.T) {
	knownHostsFile := filepath.Join(t.TempDir(), "known_hosts")
	entries := []string{
		"titan-0a ssh-ed25519 AAAATESTKEYONE",
		"[titan-0a]:2277 ssh-ed25519 AAAATESTKEYTWO",
		"titan-0b ssh-ed25519 AAAATESTKEYTHREE",
		"",
	}
	if err := os.WriteFile(knownHostsFile, []byte(strings.Join(entries, "\n")), 0o600); err != nil {
		t.Fatalf("write known_hosts: %v", err)
	}

	// The host list deliberately includes a duplicate and an empty name.
	hosts := []string{"titan-0a", "titan-0a", ""}
	RepairKnownHosts(context.Background(), log.New(io.Discard, "", 0), []string{knownHostsFile}, hosts, 2277)

	raw, err := os.ReadFile(knownHostsFile)
	if err != nil {
		t.Fatalf("read known_hosts: %v", err)
	}
	remaining := string(raw)
	if strings.Contains(remaining, "titan-0a") {
		t.Fatalf("expected titan-0a entries removed, got:\n%s", remaining)
	}
	if !strings.Contains(remaining, "titan-0b") {
		t.Fatalf("expected unrelated host to remain, got:\n%s", remaining)
	}
}
// TestRepairKnownHostsNoSshKeygen calls RepairKnownHosts with PATH pointing at an empty directory and expects it to return without panicking.
// Signature: TestRepairKnownHostsNoSshKeygen(t *testing.T).
// Why: covers the early-return branch when ssh-keygen is unavailable.
func TestRepairKnownHostsNoSshKeygen(t *testing.T) {
	emptyBin := t.TempDir()
	// With PATH reduced to an empty directory, no ssh-keygen can be found.
	t.Setenv("PATH", emptyBin)

	silent := log.New(io.Discard, "", 0)
	RepairKnownHosts(context.Background(), silent, []string{"/tmp/does-not-matter"}, []string{"titan-0a"}, 2277)
}
// TestRestoreOwnershipNoopOnMissing calls restoreOwnership on a path that does not exist and expects it to return without panicking.
// Signature: TestRestoreOwnershipNoopOnMissing(t *testing.T).
// Why: covers the missing-file branch in the ownership restoration helper.
func TestRestoreOwnershipNoopOnMissing(t *testing.T) {
	missing := filepath.Join(t.TempDir(), "missing")
	restoreOwnership(missing, "", -1, -1, 0)
}
// TestCaptureOwnershipMissingFile verifies captureOwnership returns uid -1, gid -1, and mode 0 for a path that does not exist.
// Signature: TestCaptureOwnershipMissingFile(t *testing.T).
// Why: covers the missing-path branch in the ownership capture helper.
func TestCaptureOwnershipMissingFile(t *testing.T) {
	missing := filepath.Join(t.TempDir(), "missing")

	owner, group, perm := captureOwnership(missing)
	sentinel := owner == -1 && group == -1 && perm == 0
	if !sentinel {
		t.Fatalf("unexpected ownership for missing file uid=%d gid=%d mode=%v", owner, group, perm)
	}
}
// TestRemoveKnownHostEntryAbsentDoesNotFail verifies that removing a host with no entry in the file leaves the existing titan-0b entry in place.
// Signature: TestRemoveKnownHostEntryAbsentDoesNotFail(t *testing.T).
// Why: covers the ssh-keygen "not found in" handling branch.
func TestRemoveKnownHostEntryAbsentDoesNotFail(t *testing.T) {
	knownHostsFile := filepath.Join(t.TempDir(), "known_hosts")
	if err := os.WriteFile(knownHostsFile, []byte("titan-0b ssh-ed25519 AAAA\n"), 0o600); err != nil {
		t.Fatalf("write known_hosts: %v", err)
	}

	silent := log.New(io.Discard, "", 0)
	removeKnownHostEntry(context.Background(), silent, knownHostsFile, "titan-0a")

	raw, err := os.ReadFile(knownHostsFile)
	if err != nil {
		t.Fatalf("read known_hosts after remove: %v", err)
	}
	remaining := string(raw)
	if !strings.Contains(remaining, "titan-0b") {
		t.Fatalf("expected file content to remain for unrelated hosts")
	}
}
// TestCaptureAndRestoreOwnershipRoundTrip captures a file's uid, gid, and mode, restores them, and verifies the permission bits equal the captured mode.
// Signature: TestCaptureAndRestoreOwnershipRoundTrip(t *testing.T).
// Why: covers the successful ownership/mode capture and restore path.
func TestCaptureAndRestoreOwnershipRoundTrip(t *testing.T) {
	target := filepath.Join(t.TempDir(), "known_hosts")
	if err := os.WriteFile(target, []byte("titan-0b ssh-ed25519 AAAA\n"), 0o600); err != nil {
		t.Fatalf("write file: %v", err)
	}

	owner, group, perm := captureOwnership(target)
	restoreOwnership(target, "", owner, group, perm)

	stat, err := os.Stat(target)
	if err != nil {
		t.Fatalf("stat restored file: %v", err)
	}
	restored := stat.Mode().Perm()
	if restored != perm {
		t.Fatalf("expected mode %v, got %v", perm, restored)
	}
}
// TestLogfNoLoggerDoesNotPanic calls logf with a nil logger.
// Signature: TestLogfNoLoggerDoesNotPanic(t *testing.T).
// Why: covers no-op logger branch.
func TestLogfNoLoggerDoesNotPanic(t *testing.T) {
	// Passing the test is simply returning without a panic.
	cause := errors.New("x")
	logf(nil, "message %v", cause)
}

View File

@ -19,6 +19,9 @@ var hostKeyErrorMarkers = []string{
"possible dns spoofing detected",
}
// IsHostKeyError runs one orchestration or CLI step.
// Signature: IsHostKeyError(output string, err error) bool.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func IsHostKeyError(output string, err error) bool {
if err == nil {
return false
@ -35,6 +38,9 @@ func IsHostKeyError(output string, err error) bool {
return false
}
// ShouldAttemptKnownHostsRepair runs one orchestration or CLI step.
// Signature: ShouldAttemptKnownHostsRepair(output string, err error) bool.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func ShouldAttemptKnownHostsRepair(output string, err error) bool {
if IsHostKeyError(output, err) {
return true
@ -50,6 +56,9 @@ func ShouldAttemptKnownHostsRepair(output string, err error) bool {
return false
}
// KnownHostsFiles runs one orchestration or CLI step.
// Signature: KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string {
seen := map[string]struct{}{}
add := func(path string) {
@ -86,6 +95,9 @@ func KnownHostsFiles(sshConfigFile, sshIdentityFile string) []string {
return out
}
// RepairKnownHosts runs one orchestration or CLI step.
// Signature: RepairKnownHosts(ctx context.Context, logger *log.Logger, knownHostsFiles []string, hosts []string, port int).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func RepairKnownHosts(ctx context.Context, logger *log.Logger, knownHostsFiles []string, hosts []string, port int) {
if _, err := exec.LookPath("ssh-keygen"); err != nil {
logf(logger, "warning: cannot repair known_hosts (ssh-keygen missing): %v", err)
@ -134,6 +146,9 @@ func RepairKnownHosts(ctx context.Context, logger *log.Logger, knownHostsFiles [
}
}
// removeKnownHostEntry runs one orchestration or CLI step.
// Signature: removeKnownHostEntry(ctx context.Context, logger *log.Logger, file string, entry string).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func removeKnownHostEntry(ctx context.Context, logger *log.Logger, file string, entry string) {
uid, gid, mode := captureOwnership(file)
@ -155,6 +170,9 @@ func removeKnownHostEntry(ctx context.Context, logger *log.Logger, file string,
logf(logger, "warning: known_hosts cleanup failed for %s in %s: %v: %s", entry, file, err, strings.TrimSpace(string(out)))
}
// captureOwnership stats path and reports its owner uid, gid, and permission bits.
// Signature: captureOwnership(path string) (int, int, os.FileMode).
// Why: lets callers put ownership and mode back after a tool rewrites the file.
func captureOwnership(path string) (int, int, os.FileMode) {
info, err := os.Stat(path)
if err != nil {
@ -167,6 +185,9 @@ func captureOwnership(path string) (int, int, os.FileMode) {
return int(st.Uid), int(st.Gid), info.Mode().Perm()
}
// restoreOwnership runs one orchestration or CLI step.
// Signature: restoreOwnership(path string, backupPath string, uid int, gid int, mode os.FileMode).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func restoreOwnership(path string, backupPath string, uid int, gid int, mode os.FileMode) {
if uid < 0 || gid < 0 {
return
@ -185,6 +206,9 @@ func restoreOwnership(path string, backupPath string, uid int, gid int, mode os.
}
}
// logf forwards a formatted message to logger when logger is non-nil.
// Signature: logf(logger *log.Logger, format string, args ...any).
// Why: lets callers pass a nil logger without guarding every log call.
func logf(logger *log.Logger, format string, args ...any) {
if logger != nil {
logger.Printf(format, args...)

View File

@ -6,6 +6,9 @@ import (
"testing"
)
// TestIsHostKeyErrorDetectsMismatch runs one orchestration or CLI step.
// Signature: TestIsHostKeyErrorDetectsMismatch(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestIsHostKeyErrorDetectsMismatch(t *testing.T) {
out := "WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!"
if !IsHostKeyError(out, errors.New("ssh failed")) {
@ -13,6 +16,9 @@ func TestIsHostKeyErrorDetectsMismatch(t *testing.T) {
}
}
// TestIsHostKeyErrorIgnoresGenericFailures runs one orchestration or CLI step.
// Signature: TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T) {
out := "connection timed out"
if IsHostKeyError(out, errors.New("ssh failed")) {
@ -20,12 +26,18 @@ func TestIsHostKeyErrorIgnoresGenericFailures(t *testing.T) {
}
}
// TestShouldAttemptKnownHostsRepairOnSilent255 runs one orchestration or CLI step.
// Signature: TestShouldAttemptKnownHostsRepairOnSilent255(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestShouldAttemptKnownHostsRepairOnSilent255(t *testing.T) {
if !ShouldAttemptKnownHostsRepair("", errors.New("ssh ...: exit status 255")) {
t.Fatalf("expected silent exit status 255 to trigger known_hosts repair")
}
}
// TestKnownHostsFilesIncludesDerivedPaths runs one orchestration or CLI step.
// Signature: TestKnownHostsFilesIncludesDerivedPaths(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestKnownHostsFilesIncludesDerivedPaths(t *testing.T) {
configFile := "/home/atlas/.ssh/config"
identityFile := "/home/tethys/.ssh/id_ed25519"

View File

@ -7,6 +7,9 @@ import (
"time"
)
// quarantineCorruptFile runs one orchestration or CLI step.
// Signature: quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func quarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
return err

View File

@ -0,0 +1,46 @@
package state
import (
"os"
"path/filepath"
"testing"
)
// TestQuarantineCorruptFileWritesBackupAndReplacement quarantines a corrupt payload and reads the replacement back.
// Signature: TestQuarantineCorruptFileWritesBackupAndReplacement(t *testing.T).
// Why: covers successful corruption quarantine flow.
// NOTE(review): only the replacement content is asserted; the backup file is not inspected.
func TestQuarantineCorruptFileWritesBackupAndReplacement(t *testing.T) {
	const want = "{}\n"

	target := filepath.Join(t.TempDir(), "intent.json")
	if err := quarantineCorruptFile(target, []byte("{bad"), []byte(want), 0o640); err != nil {
		t.Fatalf("quarantine failed: %v", err)
	}

	got, err := os.ReadFile(target)
	if err != nil {
		t.Fatalf("read replacement: %v", err)
	}
	if string(got) != want {
		t.Fatalf("unexpected replacement payload: %q", string(got))
	}
}
// TestQuarantineCorruptFileFailsOnEmptyPath expects an error when the destination path is empty.
// Signature: TestQuarantineCorruptFileFailsOnEmptyPath(t *testing.T).
// Why: an empty destination must be rejected rather than silently accepted.
// NOTE(review): filepath.Dir("") is ".", so the directory-creation step probably succeeds and the
// error comes from a later write — confirm which branch this actually covers.
func TestQuarantineCorruptFileFailsOnEmptyPath(t *testing.T) {
	err := quarantineCorruptFile("", []byte("x"), []byte("y"), 0o640)
	if err == nil {
		t.Fatalf("expected failure for empty path")
	}
}
// TestQuarantineCorruptFileFailsWhenReplacementIsDirectory points the destination at an existing directory.
// Signature: TestQuarantineCorruptFileFailsWhenReplacementIsDirectory(t *testing.T).
// Why: covers replacement-write error branch after backup succeeds.
func TestQuarantineCorruptFileFailsWhenReplacementIsDirectory(t *testing.T) {
	// A directory occupying the destination cannot be overwritten as a regular file.
	occupied := filepath.Join(t.TempDir(), "intent-dir")
	if err := os.MkdirAll(occupied, 0o755); err != nil {
		t.Fatalf("mkdir replacement dir: %v", err)
	}

	err := quarantineCorruptFile(occupied, []byte("{bad"), []byte("{}\n"), 0o640)
	if err == nil {
		t.Fatalf("expected write replacement failure when path is a directory")
	}
}

View File

@ -22,6 +22,9 @@ type Intent struct {
UpdatedAt time.Time `json:"updated_at"`
}
// ReadIntent runs one orchestration or CLI step.
// Signature: ReadIntent(path string) (Intent, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func ReadIntent(path string) (Intent, error) {
b, err := os.ReadFile(path)
if err != nil {
@ -43,6 +46,9 @@ func ReadIntent(path string) (Intent, error) {
return in, nil
}
// WriteIntent runs one orchestration or CLI step.
// Signature: WriteIntent(path string, in Intent) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func WriteIntent(path string, in Intent) error {
if in.UpdatedAt.IsZero() {
in.UpdatedAt = time.Now().UTC()
@ -50,13 +56,13 @@ func WriteIntent(path string, in Intent) error {
if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
return err
}
b, err := json.MarshalIndent(in, "", " ")
if err != nil {
return err
}
b, _ := json.MarshalIndent(in, "", " ")
return os.WriteFile(path, b, 0o640)
}
// MustWriteIntent runs one orchestration or CLI step.
// Signature: MustWriteIntent(path string, state string, reason string, source string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func MustWriteIntent(path string, state string, reason string, source string) error {
switch state {
case IntentNormal, IntentStartupInProgress, IntentShuttingDown, IntentShutdownComplete:

View File

@ -0,0 +1,135 @@
package state
import (
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// TestReadIntentHandlesMissingAndEmpty reads an intent file that is first absent and then zero-length.
// Signature: TestReadIntentHandlesMissingAndEmpty(t *testing.T).
// Why: covers nil-state branches for missing and empty intent files.
func TestReadIntentHandlesMissingAndEmpty(t *testing.T) {
	intentFile := filepath.Join(t.TempDir(), "intent.json")

	// Phase 1: the file has never been written.
	got, err := ReadIntent(intentFile)
	if err != nil {
		t.Fatalf("read missing intent: %v", err)
	}
	if got.State != "" {
		t.Fatalf("expected empty state for missing file, got %q", got.State)
	}

	// Phase 2: the file exists but holds no bytes.
	if err := os.WriteFile(intentFile, nil, 0o640); err != nil {
		t.Fatalf("write empty intent file: %v", err)
	}
	got, err = ReadIntent(intentFile)
	if err != nil {
		t.Fatalf("read empty intent file: %v", err)
	}
	if got.State != "" {
		t.Fatalf("expected empty state for empty file, got %q", got.State)
	}
}
// TestWriteIntentSetsUpdatedAtWhenZero runs one orchestration or CLI step.
// Signature: TestWriteIntentSetsUpdatedAtWhenZero(t *testing.T).
// Why: verifies write helper auto-populates timestamp for callers.
func TestWriteIntentSetsUpdatedAtWhenZero(t *testing.T) {
path := filepath.Join(t.TempDir(), "intent.json")
if err := WriteIntent(path, Intent{State: IntentNormal, Reason: "unit", Source: "test"}); err != nil {
t.Fatalf("write intent: %v", err)
}
in, err := ReadIntent(path)
if err != nil {
t.Fatalf("read intent: %v", err)
}
if in.UpdatedAt.IsZero() {
t.Fatalf("expected non-zero updated_at")
}
}
// TestParseIntentOutputErrorsOnBadUpdatedAt parses a line whose updated_at value is not a timestamp.
// Signature: TestParseIntentOutputErrorsOnBadUpdatedAt(t *testing.T).
// Why: covers parser error branch for malformed timestamp values.
func TestParseIntentOutputErrorsOnBadUpdatedAt(t *testing.T) {
	const malformed = `intent=normal reason="x" source=y updated_at=not-a-time`
	_, err := ParseIntentOutput(malformed)
	if err == nil {
		t.Fatalf("expected updated_at parse error")
	}
}
// TestParseIntentOutputErrorsWhenMissingToken parses text that contains no intent= token.
// Signature: TestParseIntentOutputErrorsWhenMissingToken(t *testing.T).
// Why: covers parser terminal error when intent token is absent.
func TestParseIntentOutputErrorsWhenMissingToken(t *testing.T) {
	_, err := ParseIntentOutput("no intent line here")
	if err == nil {
		t.Fatalf("expected parse failure without intent token")
	}
}
// TestParseIntentOutputWithoutReasonOrSource parses a bare intent=<state> line.
// Signature: TestParseIntentOutputWithoutReasonOrSource(t *testing.T).
// Why: covers parser branch where optional fields are omitted.
func TestParseIntentOutputWithoutReasonOrSource(t *testing.T) {
	parsed, err := ParseIntentOutput("intent=shutdown_complete")
	if err != nil {
		t.Fatalf("parse intent output: %v", err)
	}
	if got := parsed.State; got != IntentShutdownComplete {
		t.Fatalf("expected shutdown_complete, got %q", got)
	}
}
// TestMustWriteIntentPersistsProvidedTimestampType writes an intent and checks the stored timestamp is recent.
// Signature: TestMustWriteIntentPersistsProvidedTimestampType(t *testing.T).
// Why: sanity check that written timestamps round-trip RFC3339 parsing.
func TestMustWriteIntentPersistsProvidedTimestampType(t *testing.T) {
	intentFile := filepath.Join(t.TempDir(), "intent.json")
	if err := MustWriteIntent(intentFile, IntentNormal, "ok", "test"); err != nil {
		t.Fatalf("must write intent: %v", err)
	}

	stored, err := ReadIntent(intentFile)
	if err != nil {
		t.Fatalf("read intent: %v", err)
	}
	// A timestamp older than a minute would mean it was not set at write time.
	if age := time.Since(stored.UpdatedAt); age > time.Minute {
		t.Fatalf("expected recent timestamp, got %s", stored.UpdatedAt)
	}
}
// TestWriteIntentFailsWhenParentIsFile writes an intent beneath a path that is a regular file.
// Signature: TestWriteIntentFailsWhenParentIsFile(t *testing.T).
// Why: covers mkdir failure branch when parent path is not a directory.
func TestWriteIntentFailsWhenParentIsFile(t *testing.T) {
	// Create a plain file where WriteIntent will need a directory.
	blocker := filepath.Join(t.TempDir(), "not-a-dir")
	if err := os.WriteFile(blocker, []byte("x"), 0o600); err != nil {
		t.Fatalf("write parent file: %v", err)
	}

	target := filepath.Join(blocker, "intent.json")
	if err := WriteIntent(target, Intent{State: IntentNormal}); err == nil {
		t.Fatalf("expected write failure for non-directory parent")
	}
}
// TestReadIntentFailsOnPermissionError verifies that an unreadable intent file surfaces an error.
// Signature: TestReadIntentFailsOnPermissionError(t *testing.T).
// Why: covers read error branch distinct from not-exist and empty-file handling.
func TestReadIntentFailsOnPermissionError(t *testing.T) {
	// Root bypasses file permission bits, so chmod 0o000 cannot provoke a read
	// failure there. Skip instead of failing spuriously (common in CI containers).
	if os.Geteuid() == 0 {
		t.Skip("permission bits are not enforced for root")
	}

	path := filepath.Join(t.TempDir(), "intent.json")
	if err := os.WriteFile(path, []byte(`{"state":"normal"}`), 0o640); err != nil {
		t.Fatalf("write intent file: %v", err)
	}
	if err := os.Chmod(path, 0o000); err != nil {
		t.Fatalf("chmod intent file: %v", err)
	}
	// Best-effort restore of access after the test; the chmod error is deliberately ignored.
	t.Cleanup(func() { _ = os.Chmod(path, 0o640) })

	_, err := ReadIntent(path)
	if err == nil {
		t.Fatalf("expected permission error")
	}
	// The failure must not be reported as a missing file.
	if strings.Contains(strings.ToLower(err.Error()), "not exist") {
		t.Fatalf("expected permission-related error, got: %v", err)
	}
}

View File

@ -7,6 +7,10 @@ import (
)
// ParseIntentOutput parses `ananke intent` CLI output from local/remote commands.
// Signature: ParseIntentOutput(raw string) (Intent, error)
// Why: Startup/shutdown coordination depends on intent state being interpreted
// consistently from command output so remote peers and local orchestration can
// share one durable control-plane signal.
func ParseIntentOutput(raw string) (Intent, error) {
for _, line := range strings.Split(raw, "\n") {
line = strings.TrimSpace(line)
@ -19,9 +23,6 @@ func ParseIntentOutput(raw string) (Intent, error) {
}
payload := strings.TrimSpace(line[idx:])
fields := strings.Fields(payload)
if len(fields) == 0 || !strings.HasPrefix(fields[0], "intent=") {
continue
}
stateValue := strings.TrimSpace(strings.TrimPrefix(fields[0], "intent="))
if stateValue == "" || stateValue == "none" {
return Intent{}, nil
@ -29,12 +30,10 @@ func ParseIntentOutput(raw string) (Intent, error) {
in := Intent{State: stateValue}
if strings.Contains(payload, `reason="`) {
parts := strings.SplitN(payload, `reason="`, 2)
if len(parts) == 2 {
if end := strings.Index(parts[1], `"`); end >= 0 {
in.Reason = parts[1][:end]
}
}
}
for _, field := range fields[1:] {
if strings.HasPrefix(field, "source=") {
in.Source = strings.TrimSpace(strings.TrimPrefix(field, "source="))

View File

@ -6,6 +6,9 @@ import (
"testing"
)
// TestWriteReadIntentRoundTrip runs one orchestration or CLI step.
// Signature: TestWriteReadIntentRoundTrip(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestWriteReadIntentRoundTrip(t *testing.T) {
p := filepath.Join(t.TempDir(), "intent.json")
if err := MustWriteIntent(p, IntentShuttingDown, "ups-threshold", "daemon"); err != nil {
@ -23,6 +26,9 @@ func TestWriteReadIntentRoundTrip(t *testing.T) {
}
}
// TestMustWriteIntentRejectsUnknownState runs one orchestration or CLI step.
// Signature: TestMustWriteIntentRejectsUnknownState(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestMustWriteIntentRejectsUnknownState(t *testing.T) {
p := filepath.Join(t.TempDir(), "intent.json")
if err := MustWriteIntent(p, "weird", "x", "y"); err == nil {
@ -30,6 +36,9 @@ func TestMustWriteIntentRejectsUnknownState(t *testing.T) {
}
}
// TestReadIntentAutoHealsCorruptJSON runs one orchestration or CLI step.
// Signature: TestReadIntentAutoHealsCorruptJSON(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
dir := t.TempDir()
p := filepath.Join(dir, "intent.json")
@ -60,6 +69,9 @@ func TestReadIntentAutoHealsCorruptJSON(t *testing.T) {
}
}
// TestParseIntentOutputParsesStructuredLine runs one orchestration or CLI step.
// Signature: TestParseIntentOutputParsesStructuredLine(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
raw := `[ananke] 2026/04/05 11:24:49 intent=normal reason="guard-test-clear-2" source=drill updated_at=2026-04-05T16:24:33Z`
in, err := ParseIntentOutput(raw)
@ -80,6 +92,9 @@ func TestParseIntentOutputParsesStructuredLine(t *testing.T) {
}
}
// TestParseIntentOutputHandlesNone runs one orchestration or CLI step.
// Signature: TestParseIntentOutputHandlesNone(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseIntentOutputHandlesNone(t *testing.T) {
in, err := ParseIntentOutput(`[ananke] 2026/04/05 11:24:49 intent=none`)
if err != nil {

View File

@ -32,10 +32,16 @@ type Store struct {
mu sync.Mutex
}
// New returns a Store whose records file is located at path.
// Signature: New(path string) *Store.
// Why: gives callers one constructor so the Store's unexported fields stay private.
func New(path string) *Store {
	store := new(Store)
	store.path = path
	return store
}
// EnsureDir creates dir and any missing parents with mode 0750, rejecting an empty path.
// Signature: EnsureDir(dir string) error.
// Why: fails fast on an unset state directory instead of creating files relative to the working directory.
func EnsureDir(dir string) error {
if dir == "" {
return fmt.Errorf("state dir must not be empty")
@ -43,6 +49,9 @@ func EnsureDir(dir string) error {
return os.MkdirAll(dir, 0o750)
}
// AcquireLock runs one orchestration or CLI step.
// Signature: AcquireLock(path string) (func(), error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func AcquireLock(path string) (func(), error) {
if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
return nil, err
@ -85,6 +94,9 @@ func AcquireLock(path string) (func(), error) {
return unlock, nil
}
// staleLock runs one orchestration or CLI step.
// Signature: staleLock(path string) (bool, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func staleLock(path string) (bool, error) {
b, err := os.ReadFile(path)
if err != nil {
@ -99,6 +111,9 @@ func staleLock(path string) (bool, error) {
line = strings.TrimSpace(line)
if strings.HasPrefix(line, "pid=") {
v := strings.TrimPrefix(line, "pid=")
if fields := strings.Fields(v); len(fields) > 0 {
v = fields[0]
}
parsed, parseErr := strconv.Atoi(v)
if parseErr != nil {
return true, nil
@ -118,6 +133,9 @@ func staleLock(path string) (bool, error) {
return false, nil
}
// Append runs one orchestration or CLI step.
// Signature: (s *Store) Append(record RunRecord) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (s *Store) Append(record RunRecord) error {
s.mu.Lock()
defer s.mu.Unlock()
@ -133,19 +151,22 @@ func (s *Store) Append(record RunRecord) error {
if err := os.MkdirAll(filepath.Dir(s.path), 0o750); err != nil {
return err
}
b, err := json.MarshalIndent(records, "", " ")
if err != nil {
return err
}
b, _ := json.MarshalIndent(records, "", " ")
return os.WriteFile(s.path, b, 0o640)
}
// Load returns the stored run records while holding the Store's mutex.
// Signature: (s *Store) Load() ([]RunRecord, error).
// Why: wraps loadUnlocked so external callers never read the file without the lock.
func (s *Store) Load() ([]RunRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	records, err := s.loadUnlocked()
	return records, err
}
// loadUnlocked runs one orchestration or CLI step.
// Signature: (s *Store) loadUnlocked() ([]RunRecord, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (s *Store) loadUnlocked() ([]RunRecord, error) {
b, err := os.ReadFile(s.path)
if err != nil {
@ -167,18 +188,30 @@ func (s *Store) loadUnlocked() ([]RunRecord, error) {
return records, nil
}
// ShutdownP95 delegates to shutdownP95 with a minimum of one sample and no reason prefixes.
// Signature: (s *Store) ShutdownP95(defaultSeconds int) int.
// Why: simplest entry point for callers that only supply a fallback duration.
func (s *Store) ShutdownP95(defaultSeconds int) int {
	const minSamples = 1
	var reasonPrefixes []string // nil, exactly as the shared helper receives it from this wrapper
	return s.shutdownP95(defaultSeconds, minSamples, reasonPrefixes)
}
// ShutdownP95WithMinSamples delegates to shutdownP95 with the caller's minSamples and no reason prefixes.
// Signature: (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int.
// Why: lets callers demand more history before trusting the computed value over defaultSeconds.
func (s *Store) ShutdownP95WithMinSamples(defaultSeconds int, minSamples int) int {
	var reasonPrefixes []string // nil, exactly as the shared helper receives it from this wrapper
	return s.shutdownP95(defaultSeconds, minSamples, reasonPrefixes)
}
// ShutdownP95ByReasonPrefix delegates to shutdownP95, forwarding all three arguments unchanged.
// Signature: (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int.
// Why: exported form of the shared helper for callers that want to restrict samples by reason prefix.
func (s *Store) ShutdownP95ByReasonPrefix(defaultSeconds int, minSamples int, reasonPrefixes []string) int {
	seconds := s.shutdownP95(defaultSeconds, minSamples, reasonPrefixes)
	return seconds
}
// shutdownP95 runs one orchestration or CLI step.
// Signature: (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes []string) int {
if minSamples <= 0 {
minSamples = 1
@ -217,14 +250,5 @@ func (s *Store) shutdownP95(defaultSeconds int, minSamples int, reasonPrefixes [
}
sort.Ints(d)
idx := int(math.Ceil(0.95*float64(len(d)))) - 1
if idx < 0 {
idx = 0
}
if idx >= len(d) {
idx = len(d) - 1
}
if d[idx] <= 0 {
return defaultSeconds
}
return d[idx]
}

View File

@ -0,0 +1,156 @@
package state
import (
"encoding/json"
"os"
"path/filepath"
"strconv"
"testing"
"time"
)
// TestEnsureDirRejectsEmpty passes an empty directory name to EnsureDir.
// Signature: TestEnsureDirRejectsEmpty(t *testing.T).
// Why: covers explicit guard branch for empty state directory inputs.
func TestEnsureDirRejectsEmpty(t *testing.T) {
	err := EnsureDir("")
	if err == nil {
		t.Fatalf("expected empty directory error")
	}
}
// TestStoreAppendTrimToMaxRecords appends 205 records and expects 200 to remain.
// Signature: TestStoreAppendTrimToMaxRecords(t *testing.T).
// Why: covers retention branch that trims run history to the 200-record cap.
func TestStoreAppendTrimToMaxRecords(t *testing.T) {
	store := New(filepath.Join(t.TempDir(), "runs.json"))
	stamp := time.Now().UTC()

	// Five more appends than the expected cap.
	for i := 0; i < 205; i++ {
		rec := RunRecord{
			ID:              "r-" + strconv.Itoa(i),
			Action:          "shutdown",
			StartedAt:       stamp,
			EndedAt:         stamp,
			DurationSeconds: i + 1,
			Success:         true,
		}
		if err := store.Append(rec); err != nil {
			t.Fatalf("append %d failed: %v", i, err)
		}
	}

	kept, err := store.Load()
	if err != nil {
		t.Fatalf("load failed: %v", err)
	}
	if n := len(kept); n != 200 {
		t.Fatalf("expected trim to 200 records, got %d", n)
	}
}
// TestStoreLoadHandlesEmptyFile loads a run-history file that exists but is zero-length.
// Signature: TestStoreLoadHandlesEmptyFile(t *testing.T).
// Why: covers load branch for empty existing run-history file.
func TestStoreLoadHandlesEmptyFile(t *testing.T) {
	historyFile := filepath.Join(t.TempDir(), "runs.json")
	if err := os.WriteFile(historyFile, nil, 0o640); err != nil {
		t.Fatalf("write empty file: %v", err)
	}

	store := New(historyFile)
	loaded, err := store.Load()
	if err != nil {
		t.Fatalf("load empty file: %v", err)
	}
	if n := len(loaded); n != 0 {
		t.Fatalf("expected no records, got %d", n)
	}
}
// TestStoreLoadReturnsErrorOnUnhealableDecode loads corrupt JSON from a directory that cannot be written.
// Signature: TestStoreLoadReturnsErrorOnUnhealableDecode(t *testing.T).
// Why: covers decode failure path where replacement write itself can fail.
func TestStoreLoadReturnsErrorOnUnhealableDecode(t *testing.T) {
	// Root can write into a 0o500 directory, so the read-only setup below would
	// not block the replacement write and the test would fail spuriously.
	if os.Geteuid() == 0 {
		t.Skip("permission bits are not enforced for root")
	}

	dir := t.TempDir()
	path := filepath.Join(dir, "runs.json")
	if err := os.WriteFile(path, []byte("{bad-json"), 0o640); err != nil {
		t.Fatalf("write invalid file: %v", err)
	}
	// Make directory readonly so quarantine replacement cannot be written.
	if err := os.Chmod(dir, 0o500); err != nil {
		t.Fatalf("chmod dir readonly: %v", err)
	}
	// Cleanups run last-in-first-out, so this restores write access before the
	// TempDir removal runs. The chmod error is deliberately ignored (best effort).
	t.Cleanup(func() { _ = os.Chmod(dir, 0o700) })

	if _, err := New(path).Load(); err == nil {
		t.Fatalf("expected load failure when auto-heal cannot write replacement")
	}
}
// TestShutdownP95FallsBackOnLoadError expects the default when the history file cannot be read.
// Signature: TestShutdownP95FallsBackOnLoadError(t *testing.T).
// Why: covers load-error fallback branch in percentile helper.
func TestShutdownP95FallsBackOnLoadError(t *testing.T) {
	// Root can still read a 0o000 file, so the read failure this test relies on
	// would never happen and a different code path would be exercised instead.
	if os.Geteuid() == 0 {
		t.Skip("permission bits are not enforced for root")
	}

	path := filepath.Join(t.TempDir(), "runs.json")
	if err := os.WriteFile(path, []byte("{bad"), 0o640); err != nil {
		t.Fatalf("write invalid file: %v", err)
	}
	// Use impossible perms to force read failure.
	if err := os.Chmod(path, 0o000); err != nil {
		t.Fatalf("chmod file: %v", err)
	}
	// Best-effort restore of access after the test; the chmod error is deliberately ignored.
	t.Cleanup(func() { _ = os.Chmod(path, 0o640) })

	if got := New(path).ShutdownP95(321); got != 321 {
		t.Fatalf("expected fallback default 321, got %d", got)
	}
}
// TestShutdownP95ReturnsDefaultOnNonPositiveQuantile stores only zero and negative durations and expects the default.
// Signature: TestShutdownP95ReturnsDefaultOnNonPositiveQuantile(t *testing.T).
// Why: a history with no positive duration must not replace the caller's fallback value.
func TestShutdownP95ReturnsDefaultOnNonPositiveQuantile(t *testing.T) {
	historyFile := filepath.Join(t.TempDir(), "runs.json")
	stamp := time.Now().UTC()

	// Two successful shutdown samples, neither with a positive duration.
	encoded, err := json.Marshal([]RunRecord{
		{Action: "shutdown", StartedAt: stamp, EndedAt: stamp, DurationSeconds: 0, Success: true},
		{Action: "shutdown", StartedAt: stamp, EndedAt: stamp, DurationSeconds: -1, Success: true},
	})
	if err != nil {
		t.Fatalf("marshal records: %v", err)
	}
	if err := os.WriteFile(historyFile, encoded, 0o640); err != nil {
		t.Fatalf("write records: %v", err)
	}

	store := New(historyFile)
	if got := store.ShutdownP95WithMinSamples(777, 1); got != 777 {
		t.Fatalf("expected default 777, got %d", got)
	}
}
// TestStaleLockHelpers checks staleLock against a missing file, an unparsable pid, and this process's pid.
// Signature: TestStaleLockHelpers(t *testing.T).
// Why: covers stale-lock parser branches directly for reliability.
func TestStaleLockHelpers(t *testing.T) {
	dir := t.TempDir()

	// Case 1: no lock file at all.
	isStale, err := staleLock(filepath.Join(dir, "missing.lock"))
	if err != nil || !isStale {
		t.Fatalf("expected missing lock to be stale=true err=nil, got stale=%v err=%v", isStale, err)
	}

	// Case 2: the pid field is not a number.
	garbled := filepath.Join(dir, "invalid.lock")
	if err := os.WriteFile(garbled, []byte("pid=notanumber\n"), 0o600); err != nil {
		t.Fatalf("write invalid pid lock: %v", err)
	}
	isStale, err = staleLock(garbled)
	if err != nil || !isStale {
		t.Fatalf("expected invalid pid lock to be stale=true err=nil, got stale=%v err=%v", isStale, err)
	}

	// Case 3: the lock names the pid of the running test process.
	live := filepath.Join(dir, "active.lock")
	ownPID := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
	if err := os.WriteFile(live, []byte(ownPID), 0o600); err != nil {
		t.Fatalf("write active lock: %v", err)
	}
	isStale, err = staleLock(live)
	if err != nil {
		t.Fatalf("active staleLock error: %v", err)
	}
	if isStale {
		t.Fatalf("expected active lock to report stale=false")
	}
}

View File

@ -10,6 +10,9 @@ import (
"time"
)
// TestAcquireLockLifecycle runs one orchestration or CLI step.
// Signature: TestAcquireLockLifecycle(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestAcquireLockLifecycle(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
unlock, err := AcquireLock(lockPath)
@ -25,6 +28,9 @@ func TestAcquireLockLifecycle(t *testing.T) {
}
}
// TestAcquireLockReclaimsStaleLock runs one orchestration or CLI step.
// Signature: TestAcquireLockReclaimsStaleLock(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestAcquireLockReclaimsStaleLock(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
if err := os.WriteFile(lockPath, []byte("pid=999999\n"), 0o600); err != nil {
@ -46,6 +52,9 @@ func TestAcquireLockReclaimsStaleLock(t *testing.T) {
}
}
// TestAcquireLockRejectsActiveLock runs one orchestration or CLI step.
// Signature: TestAcquireLockRejectsActiveLock(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestAcquireLockRejectsActiveLock(t *testing.T) {
lockPath := filepath.Join(t.TempDir(), "ananke.lock")
active := "pid=" + strconv.Itoa(os.Getpid()) + "\n"
@ -58,6 +67,9 @@ func TestAcquireLockRejectsActiveLock(t *testing.T) {
}
}
// TestStoreLoadAutoHealsCorruptJSON runs one orchestration or CLI step.
// Signature: TestStoreLoadAutoHealsCorruptJSON(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStoreLoadAutoHealsCorruptJSON(t *testing.T) {
dir := t.TempDir()
p := filepath.Join(dir, "runs.json")
@ -88,6 +100,9 @@ func TestStoreLoadAutoHealsCorruptJSON(t *testing.T) {
}
}
// TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse runs one orchestration or CLI step.
// Signature: TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse(t *testing.T) {
p := filepath.Join(t.TempDir(), "runs.json")
records := []RunRecord{
@ -115,6 +130,9 @@ func TestShutdownP95WithMinSamplesFallsBackWhenHistorySparse(t *testing.T) {
}
}
// TestShutdownP95ByReasonPrefixFiltersSamples runs one orchestration or CLI step.
// Signature: TestShutdownP95ByReasonPrefixFiltersSamples(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestShutdownP95ByReasonPrefixFiltersSamples(t *testing.T) {
p := filepath.Join(t.TempDir(), "runs.json")
now := time.Now().UTC()
@ -161,6 +179,9 @@ func TestShutdownP95ByReasonPrefixFiltersSamples(t *testing.T) {
}
}
// TestShutdownP95IgnoresDryRunSamples runs one orchestration or CLI step.
// Signature: TestShutdownP95IgnoresDryRunSamples(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestShutdownP95IgnoresDryRunSamples(t *testing.T) {
p := filepath.Join(t.TempDir(), "runs.json")
now := time.Now().UTC()

View File

@ -0,0 +1,10 @@
package state
import "os"
// TestHookQuarantineCorruptFile forwards its arguments unchanged to the
// unexported quarantineCorruptFile and returns whatever error that call yields.
// Signature: TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error.
// Why: exposes corrupt-file healing internals to the top-level testing module without package-local tests.
func TestHookQuarantineCorruptFile(path string, payload []byte, replacement []byte, mode os.FileMode) error {
return quarantineCorruptFile(path, payload, replacement, mode)
}

View File

@ -28,10 +28,16 @@ type NUTProvider struct {
Target string
}
// NewNUTProvider returns a pointer to a new NUTProvider whose Target field is
// set to target. The value is stored as given; nothing is validated here (an
// empty target is rejected later, when Read is called).
// Signature: NewNUTProvider(target string) *NUTProvider.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func NewNUTProvider(target string) *NUTProvider {
return &NUTProvider{Target: target}
}
// Read runs one orchestration or CLI step.
// Signature: (p *NUTProvider) Read(ctx context.Context) (Sample, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func (p *NUTProvider) Read(ctx context.Context) (Sample, error) {
if p.Target == "" {
return Sample{}, fmt.Errorf("NUT target must not be empty")
@ -44,6 +50,9 @@ func (p *NUTProvider) Read(ctx context.Context) (Sample, error) {
return parseNUT(string(out))
}
// parseNUT runs one orchestration or CLI step.
// Signature: parseNUT(raw string) (Sample, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func parseNUT(raw string) (Sample, error) {
kv := map[string]string{}
s := bufio.NewScanner(strings.NewReader(raw))
@ -106,6 +115,9 @@ func parseNUT(raw string) (Sample, error) {
var parseNumberCleaner = regexp.MustCompile(`[^0-9.+-]`)
// parseNumber runs one orchestration or CLI step.
// Signature: parseNumber(raw string) (float64, bool).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func parseNumber(raw string) (float64, bool) {
cleaned := strings.TrimSpace(parseNumberCleaner.ReplaceAllString(raw, ""))
if cleaned == "" {

View File

@ -0,0 +1,108 @@
package ups
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
)
// TestParseNUTRejectsMissingStatus feeds parseNUT a payload that carries only a
// battery.charge line and fails unless an error comes back.
// Signature: TestParseNUTRejectsMissingStatus(t *testing.T).
// Why: covers parser error path when mandatory status line is absent.
func TestParseNUTRejectsMissingStatus(t *testing.T) {
	const payloadWithoutStatus = "battery.charge: 88"

	_, err := parseNUT(payloadWithoutStatus)
	if err != nil {
		// An error is the expected outcome for a payload without ups.status.
		return
	}
	t.Fatalf("expected missing status error")
}
// TestParseNUTParsesOptionalNumbers parses a payload with status, runtime,
// charge, load, and nominal-power lines and checks every resulting field.
// Signature: TestParseNUTParsesOptionalNumbers(t *testing.T).
// Why: covers numeric extraction branches for charge/load/nominal fields.
func TestParseNUTParsesOptionalNumbers(t *testing.T) {
	payloadLines := []string{
		"ups.status: OB LB",
		"battery.runtime: 1024",
		"battery.charge: 71.5 Percent",
		"ups.load: 12.0 Percent",
		"ups.realpower.nominal: 900 W",
		"", // trailing empty element yields a final newline after the join
	}

	sample, err := parseNUT(strings.Join(payloadLines, "\n"))
	if err != nil {
		t.Fatalf("parseNUT failed: %v", err)
	}

	statusMatches := sample.OnBattery && sample.LowBattery && sample.RuntimeSeconds == 1024
	if !statusMatches {
		t.Fatalf("unexpected status parse: %+v", sample)
	}

	numbersMatch := sample.BatteryCharge == 71.5 && sample.LoadPercent == 12 && sample.NominalPowerW == 900
	if !numbersMatch {
		t.Fatalf("unexpected numeric parse: %+v", sample)
	}
}
// TestNUTProviderReadViaPathShim writes an executable script named "upsc" into a
// temp directory, prepends that directory to PATH, and checks the sample that
// NUTProvider.Read returns for the script's output.
// Signature: TestNUTProviderReadViaPathShim(t *testing.T).
// Why: covers provider command execution success path deterministically.
func TestNUTProviderReadViaPathShim(t *testing.T) {
	const fakeUpsc = `#!/usr/bin/env bash
set -euo pipefail
echo "ups.status: OL"
echo "battery.runtime: 500"
`
	shimDir := t.TempDir()
	shimPath := filepath.Join(shimDir, "upsc")
	if writeErr := os.WriteFile(shimPath, []byte(fakeUpsc), 0o755); writeErr != nil {
		t.Fatalf("write fake upsc: %v", writeErr)
	}
	// Put the shim directory first so a PATH lookup for "upsc" finds the fake.
	t.Setenv("PATH", shimDir+":"+os.Getenv("PATH"))

	provider := NewNUTProvider("statera@localhost")
	got, err := provider.Read(context.Background())
	switch {
	case err != nil:
		t.Fatalf("provider read failed: %v", err)
	case got.OnBattery:
		t.Fatalf("expected OL to report not-on-battery")
	case got.RuntimeSeconds != 500:
		t.Fatalf("expected runtime 500, got %d", got.RuntimeSeconds)
	}
}
// TestNUTProviderReadRejectsEmptyTarget builds a provider with an empty target
// and fails unless Read reports an error.
// Signature: TestNUTProviderReadRejectsEmptyTarget(t *testing.T).
// Why: covers provider guard for empty NUT target values.
func TestNUTProviderReadRejectsEmptyTarget(t *testing.T) {
	provider := NewNUTProvider("")

	_, err := provider.Read(context.Background())
	if err != nil {
		return
	}
	t.Fatalf("expected empty-target read error")
}
// TestParseNumberRejectsInvalid passes a non-numeric string to parseNumber and
// fails if the second return value reports success.
// Signature: TestParseNumberRejectsInvalid(t *testing.T).
// Why: covers parseNumber false-return branch for invalid input.
func TestParseNumberRejectsInvalid(t *testing.T) {
	const notNumeric = "not-a-number"

	_, parsed := parseNumber(notNumeric)
	if !parsed {
		return
	}
	t.Fatalf("expected parseNumber to reject invalid input")
}
// TestNUTProviderReadCommandFailure installs a fake "upsc" script that writes to
// stderr and exits with status 2, then fails unless NUTProvider.Read returns an
// error.
// Signature: TestNUTProviderReadCommandFailure(t *testing.T).
// Why: covers provider error propagation when upsc exits non-zero.
func TestNUTProviderReadCommandFailure(t *testing.T) {
	const failingUpsc = `#!/usr/bin/env bash
set -euo pipefail
echo "upsc failed" >&2
exit 2
`
	shimDir := t.TempDir()
	shimPath := filepath.Join(shimDir, "upsc")
	if writeErr := os.WriteFile(shimPath, []byte(failingUpsc), 0o755); writeErr != nil {
		t.Fatalf("write fake upsc: %v", writeErr)
	}
	// Put the shim directory first so a PATH lookup for "upsc" finds the fake.
	t.Setenv("PATH", shimDir+":"+os.Getenv("PATH"))

	provider := NewNUTProvider("pyrphoros@localhost")
	_, err := provider.Read(context.Background())
	if err != nil {
		return
	}
	t.Fatalf("expected provider read error on upsc failure")
}

View File

@ -2,6 +2,9 @@ package ups
import "testing"
// TestParseNUT runs one orchestration or CLI step.
// Signature: TestParseNUT(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseNUT(t *testing.T) {
raw := `battery.runtime: 384
battery.charge: 72

View File

@ -9,7 +9,7 @@ ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-no-poweroff.yaml}"
SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-cluster-only.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
EXECUTE=0
@ -25,7 +25,7 @@ Drills:
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
startup-intent-guard Assert startup is blocked when shutdown intent is active.
controlled-cycle Run full shutdown->startup recovery cycle (uses no-poweroff config).
controlled-cycle Run full shutdown->startup recovery cycle (uses cluster-only shutdown config).
Notes:
- Drills are intentionally disruptive and are not part of regular `make test`.
@ -405,7 +405,7 @@ run_drill_controlled_cycle() {
run_coordinator_bash "[ -s '${SHUTDOWN_CONFIG}' ]" || die "shutdown drill config missing on coordinator: ${SHUTDOWN_CONFIG}"
fi
log "running controlled shutdown cycle (poweroff disabled config)"
log "running controlled shutdown cycle (cluster-only shutdown config)"
run_ananke_shutdown "drill-controlled-cycle-shutdown"
log "running startup recovery cycle"

View File

@ -9,6 +9,7 @@ fi
REPO_URL="${ANANKE_REPO_URL:-ssh://git@scm.bstein.dev:2242/bstein/ananke.git}"
BRANCH="${ANANKE_REPO_BRANCH:-main}"
REPO_DIR="${ANANKE_REPO_DIR:-/opt/ananke}"
HOST_SHORT="$(hostname -s 2>/dev/null || hostname)"
mkdir -p "$(dirname "${REPO_DIR}")"
if [[ ! -d "${REPO_DIR}/.git" ]]; then
@ -23,4 +24,16 @@ git checkout "${BRANCH}"
git reset --hard "origin/${BRANCH}"
echo "[self-update] running installer"
# Keep host configs aligned with tracked templates so startup/shutdown drills
# always use the latest checklist and safety logic.
# A non-empty ANANKE_FORCE_CONFIG_TEMPLATE already present in the environment is
# left untouched. Otherwise the template is picked from the short hostname;
# hosts that match none of the patterns below leave the variable unset.
if [[ -z "${ANANKE_FORCE_CONFIG_TEMPLATE:-}" ]]; then
case "${HOST_SHORT}" in
titan-db)
export ANANKE_FORCE_CONFIG_TEMPLATE="coordinator"
;;
titan-24)
export ANANKE_FORCE_CONFIG_TEMPLATE="peer"
;;
esac
fi
"${REPO_DIR}/scripts/install.sh"

View File

@ -22,6 +22,7 @@ NUT_PRODUCT_ID="${ANANKE_NUT_PRODUCT_ID:-0601}"
NUT_MONITOR_USER="${ANANKE_NUT_MONITOR_USER:-monuser}"
NUT_MONITOR_PASSWORD="${ANANKE_NUT_MONITOR_PASSWORD:-anankeupsmon}"
FORCE_CONFIG_TEMPLATE="${ANANKE_FORCE_CONFIG_TEMPLATE:-}"
ENFORCE_QUALITY_GATE="${ANANKE_ENFORCE_QUALITY_GATE:-1}"
while [[ $# -gt 0 ]]; do
case "$1" in
@ -228,6 +229,28 @@ migrate_ananke_config() {
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
@ -838,6 +861,13 @@ EOF
ensure_dependencies
migrate_legacy_hecate_install
# Run the repository quality gate before building. ENFORCE_QUALITY_GATE comes
# from ANANKE_ENFORCE_QUALITY_GATE and defaults to "1"; any other value skips
# the gate and logs the value that caused the skip.
if [[ "${ENFORCE_QUALITY_GATE}" == "1" ]]; then
echo "[install] running quality gate"
"${REPO_DIR}/scripts/quality_gate.sh"
else
echo "[install] skipping quality gate (ANANKE_ENFORCE_QUALITY_GATE=${ENFORCE_QUALITY_GATE})"
fi
echo "[install] building ananke"
cd "${REPO_DIR}"
mkdir -p dist
@ -855,6 +885,7 @@ install -m 0755 dist/ananke "${BIN_DIR}/ananke"
echo "[install] installing config + state dirs"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
install -d -m 0750 "${STATE_DIR}/reports"
install -d -m 0755 "${LIB_DIR}"
if [[ -n "${FORCE_CONFIG_TEMPLATE}" ]]; then

17
scripts/lint.sh Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Lint entry point: runs `go vet` and `staticcheck` over every package in the
# repository, installing staticcheck on demand when it is not already on PATH.
set -euo pipefail

REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_DIR}"

# `go install` writes binaries to GOBIN when it is set, and otherwise to the
# bin/ directory of the first GOPATH entry, so resolve the directory the same
# way before extending PATH. Assigning first and exporting afterwards keeps a
# failing `go env` visible to `set -e` (an inline `export VAR="$(cmd)"` would
# report the exit status of `export`, not of the command substitution).
go_bin_dir="$(go env GOBIN)"
if [[ -z "${go_bin_dir}" ]]; then
  go_path="$(go env GOPATH)"
  go_bin_dir="${go_path%%:*}/bin"
fi
export PATH="${go_bin_dir}:${PATH}"

if ! command -v staticcheck >/dev/null 2>&1; then
  echo "[lint] installing staticcheck"
  # NOTE(review): @latest is unpinned, so the gate can change behavior between
  # runs and needs network access on first use; consider pinning a version.
  go install honnef.co/go/tools/cmd/staticcheck@latest
fi

echo "[lint] go vet"
go vet ./...

echo "[lint] staticcheck (pedantic code-smell pass)"
staticcheck ./...

110
scripts/quality_gate.sh Executable file
View File

@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Quality gate: runs unit tests, the hygiene contract tests, lint, and the
# per-file coverage gate, and records the outcome in a metrics file on exit.
set -euo pipefail
# Repository root: the parent of the directory that contains this script.
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Metrics are written only when this is exactly "1" (the default).
QUALITY_METRICS_ENABLED="${ANANKE_QUALITY_METRICS_ENABLED:-1}"
# Destination of the generated metrics text file.
QUALITY_METRICS_FILE="${ANANKE_QUALITY_METRICS_FILE:-/var/lib/ananke/quality-gate.prom}"
# key=value file that carries the ok/failed run counters between invocations.
QUALITY_STATE_FILE="${ANANKE_QUALITY_STATE_FILE:-/var/lib/ananke/quality-gate.state}"
# read_quality_counter KEY
# Prints the counter stored for KEY in ${QUALITY_STATE_FILE}, a file of
# KEY=VALUE lines (when KEY appears more than once the last line wins).
# Prints 0 when the file is missing, cannot be read, has no such key, or the
# stored value is not a plain run of decimal digits. Always returns 0.
read_quality_counter() {
  local key="$1"
  if [[ ! -f "${QUALITY_STATE_FILE}" ]]; then
    echo 0
    return 0
  fi

  local value
  # A failing read is treated like a missing counter instead of aborting the
  # caller under `set -e` / `pipefail`.
  value="$(awk -F= -v key="${key}" '$1==key {print $2}' "${QUALITY_STATE_FILE}" | tail -n1)" || value=""
  if [[ ! "${value}" =~ ^[0-9]+$ ]]; then
    echo 0
    return 0
  fi

  # Force base 10: a zero-padded value such as "08" passes the digit check but
  # would be rejected as invalid octal by the caller's $(( )) arithmetic.
  echo "$((10#${value}))"
}
# write_quality_metrics EXIT_CODE
# Updates the running ok/failed counters with the outcome of this run
# (EXIT_CODE 0 counts as ok, anything else as failed) and publishes them to
# ${QUALITY_METRICS_FILE} (metrics text) and ${QUALITY_STATE_FILE} (key=value
# counters read back by read_quality_counter on the next run).
# Does nothing when QUALITY_METRICS_ENABLED is not "1" or the target
# directories cannot be created. Both files are written to temp files in their
# destination directories and renamed into place; if any write or rename
# fails, leftover temp files are removed and a non-zero status is returned.
write_quality_metrics() {
  local exit_code="$1"
  if [[ "${QUALITY_METRICS_ENABLED}" != "1" ]]; then
    return 0
  fi

  local metrics_dir state_dir
  metrics_dir="$(dirname "${QUALITY_METRICS_FILE}")"
  state_dir="$(dirname "${QUALITY_STATE_FILE}")"
  # Best effort: an unwritable location silently disables metrics.
  mkdir -p "${metrics_dir}" "${state_dir}" >/dev/null 2>&1 || return 0

  local ok failed total last_success now success_percent
  ok="$(read_quality_counter ok)"
  failed="$(read_quality_counter failed)"
  last_success=0
  if [[ "${exit_code}" -eq 0 ]]; then
    ok=$((ok + 1))
    last_success=1
  else
    failed=$((failed + 1))
  fi
  total=$((ok + failed))
  now="$(date +%s)"
  success_percent="$(awk -v ok="${ok}" -v total="${total}" 'BEGIN { if (total <= 0) { print "0.00" } else { printf "%.2f", (ok * 100.0) / total } }')"

  # Create both temp files up front and stop early if either cannot be made,
  # so nothing is ever written through an empty path.
  # NOTE(review): mktemp creates files readable only by their owner; confirm
  # that whatever consumes the .prom file can read it.
  local tmp_metrics tmp_state
  tmp_metrics="$(mktemp "${metrics_dir}/quality-gate.prom.XXXXXX")" || return 1
  if ! tmp_state="$(mktemp "${state_dir}/quality-gate.state.XXXXXX")"; then
    rm -f "${tmp_metrics}"
    return 1
  fi

  local written=1
  cat > "${tmp_metrics}" <<EOF || written=0
# HELP ananke_quality_gate_runs_total Total Ananke quality gate runs by status.
# TYPE ananke_quality_gate_runs_total counter
ananke_quality_gate_runs_total{suite="ananke",status="ok"} ${ok}
ananke_quality_gate_runs_total{suite="ananke",status="failed"} ${failed}
# HELP ananke_quality_gate_last_run_success Whether the latest quality gate run succeeded.
# TYPE ananke_quality_gate_last_run_success gauge
ananke_quality_gate_last_run_success{suite="ananke"} ${last_success}
# HELP ananke_quality_gate_last_run_timestamp_seconds Unix timestamp of the latest quality gate run.
# TYPE ananke_quality_gate_last_run_timestamp_seconds gauge
ananke_quality_gate_last_run_timestamp_seconds{suite="ananke"} ${now}
# HELP ananke_quality_gate_success_percent Running quality gate success percentage for Ananke.
# TYPE ananke_quality_gate_success_percent gauge
ananke_quality_gate_success_percent{suite="ananke"} ${success_percent}
EOF
  cat > "${tmp_state}" <<EOF || written=0
ok=${ok}
failed=${failed}
last_success=${last_success}
last_run=${now}
EOF

  # Publish only when both files were written completely, and keep the pair in
  # step: the state file is not replaced unless the metrics file was.
  if [[ "${written}" -eq 1 ]]; then
    if ! { mv -f "${tmp_metrics}" "${QUALITY_METRICS_FILE}" && mv -f "${tmp_state}" "${QUALITY_STATE_FILE}"; }; then
      written=0
    fi
  fi

  # After a successful rename these paths no longer exist, so this only
  # removes temp files left behind by a failure.
  rm -f "${tmp_metrics}" "${tmp_state}"
  [[ "${written}" -eq 1 ]]
}
# quality_gate_finalize EXIT_CODE
# Exit handler: records the run outcome via write_quality_metrics and then
# exits with the EXIT_CODE it was given, so the metrics step never changes the
# script's exit status.
quality_gate_finalize() {
local exit_code="$1"
# Errexit is switched off so a failure while writing metrics cannot abort the handler.
set +e
write_quality_metrics "${exit_code}" || true
exit "${exit_code}"
}
# Hand the script's exit status to the finalizer on every exit path.
trap 'quality_gate_finalize $?' EXIT
cd "${REPO_DIR}"
echo "[quality] unit tests"
go test ./...
# The hygiene suite is run from the testing/ directory, one subtest per contract.
echo "[quality] hygiene: doc contracts"
cd testing
go test ./hygiene -run TestHygieneContracts/doc_contract -count=1
echo "[quality] hygiene: naming contracts"
go test ./hygiene -run TestHygieneContracts/naming_contract -count=1
echo "[quality] hygiene: LOC limits"
go test ./hygiene -run TestHygieneContracts/loc_limit -count=1
# Back to the repository root: the lint script path below is relative to it.
cd "${REPO_DIR}"
echo "[quality] lint"
./scripts/lint.sh
echo "[quality] per-file coverage gate (95%)"
cd testing
# The two variables are passed to the coverage test through its environment.
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v

View File

@ -0,0 +1,238 @@
package config
import (
"os"
"path/filepath"
"strings"
"testing"
icfg "scm.bstein.dev/bstein/ananke/internal/config"
)
// loadBaselineConfig writes a minimal YAML file that only disables the UPS
// section into a fresh temp directory, loads it with icfg.Load, and returns the
// loaded value. A write or load failure aborts the calling test.
// Signature: loadBaselineConfig(t *testing.T) icfg.Config.
// Why: gives every validation case its own freshly loaded config to mutate.
func loadBaselineConfig(t *testing.T) icfg.Config {
	t.Helper()

	const minimalYAML = "ups:\n enabled: false\n"
	configPath := filepath.Join(t.TempDir(), "ananke.yaml")

	writeErr := os.WriteFile(configPath, []byte(minimalYAML), 0o600)
	if writeErr != nil {
		t.Fatalf("write baseline config: %v", writeErr)
	}

	loaded, loadErr := icfg.Load(configPath)
	if loadErr != nil {
		t.Fatalf("load baseline config: %v", loadErr)
	}
	return loaded
}
// TestHookServiceCatalogAndMergeContracts inspects the default service
// checklist and critical-endpoint list exposed by the config test hooks, then
// checks the entry count and contents produced by the two merge hooks.
// Signature: TestHookServiceCatalogAndMergeContracts(t *testing.T).
// Why: validates startup checklist defaults and merge semantics so host-level
// overrides cannot silently drop required service behavior checks.
func TestHookServiceCatalogAndMergeContracts(t *testing.T) {
	defaults := icfg.TestHookDefaultServiceChecklist()
	if count := len(defaults); count < 20 {
		t.Fatalf("expected substantial default checklist, got %d checks", count)
	}

	// Index the defaults by trimmed name; a later duplicate replaces an earlier one.
	byName := map[string]icfg.ServiceChecklistCheck{}
	for _, entry := range defaults {
		byName[strings.TrimSpace(entry.Name)] = entry
	}

	// hasHardAuth reports whether the named check exists, requires robot auth,
	// and carries a non-blank final-URL exclusion marker.
	hasHardAuth := func(name string) bool {
		entry, found := byName[name]
		return found && entry.RequireRobotAuth && strings.TrimSpace(entry.FinalURLNotContains) != ""
	}
	if !hasHardAuth("logging-ui-user-session") {
		t.Fatalf("expected logging-ui-user-session to require robot auth + final URL validation")
	}
	if !hasHardAuth("keycloak-admin-user-session") {
		t.Fatalf("expected keycloak-admin-user-session hard auth assertions")
	}

	endpoints := icfg.TestHookDefaultCriticalServiceEndpoints()
	if len(endpoints) == 0 {
		t.Fatalf("expected critical endpoint defaults")
	}
	grafanaListed := false
	for _, endpoint := range endpoints {
		grafanaListed = grafanaListed || endpoint == "monitoring/grafana"
	}
	if !grafanaListed {
		t.Fatalf("expected monitoring/grafana critical endpoint default")
	}

	// The two lists share the name "logging-ui-user-session", so three entries
	// are expected after the merge.
	firstList := []icfg.ServiceChecklistCheck{
		{Name: "custom", URL: "https://custom.bstein.dev/", TimeoutSeconds: 5},
		{Name: "logging-ui-user-session", URL: "https://override.invalid/", TimeoutSeconds: 5},
	}
	secondList := []icfg.ServiceChecklistCheck{
		{Name: "logging-ui-user-session", URL: "https://logs.bstein.dev/", TimeoutSeconds: 5},
		{Name: "metrics-ui-user-session", URL: "https://metrics.bstein.dev/", TimeoutSeconds: 5},
	}
	if merged := icfg.TestHookMergeServiceChecklistDefaults(firstList, secondList); len(merged) != 3 {
		t.Fatalf("expected 3 merged checks with dedupe, got %d", len(merged))
	}

	mergedStrings := icfg.TestHookMergeStringDefaults(
		[]string{" one ", "one", "", "two"},
		[]string{"two", "three", " "},
	)
	if joined := strings.Join(mergedStrings, ","); joined != "one,two,three" {
		t.Fatalf("unexpected merged string defaults: %v", mergedStrings)
	}
}
// TestValidateServiceChecklistAuthContracts runs one orchestration or CLI step.
// Signature: TestValidateServiceChecklistAuthContracts(t *testing.T).
// Why: covers service-checklist auth and final-url validation branches that are
// critical for preventing false-positive startup success.
func TestValidateServiceChecklistAuthContracts(t *testing.T) {
t.Run("invalid auth mode", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.Mode = "bad-mode"
if err := cfg.Validate(); err == nil {
t.Fatalf("expected invalid mode validation error")
}
})
t.Run("invalid keycloak base url", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.KeycloakBaseURL = "://broken"
if err := cfg.Validate(); err == nil {
t.Fatalf("expected invalid keycloak base URL validation error")
}
})
t.Run("missing secret key fields", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing admin secret password key validation error")
}
})
t.Run("require robot auth with mode none", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.Mode = "none"
cfg.Startup.ServiceChecklist = append(cfg.Startup.ServiceChecklist, icfg.ServiceChecklistCheck{
Name: "robot-only",
URL: "https://logs.bstein.dev/",
RequireRobotAuth: true,
TimeoutSeconds: 5,
})
if err := cfg.Validate(); err == nil {
t.Fatalf("expected require_robot_auth + mode none validation error")
}
})
t.Run("final url markers without redirects", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklist = append(cfg.Startup.ServiceChecklist, icfg.ServiceChecklistCheck{
Name: "final-url-invalid",
URL: "https://logs.bstein.dev/",
AcceptedStatuses: []int{200},
FinalURLContains: "/app/home",
TimeoutSeconds: 5,
})
if err := cfg.Validate(); err == nil {
t.Fatalf("expected final_url marker validation error when redirects disabled")
}
})
t.Run("invalid accepted status code", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklist[0].AcceptedStatuses = []int{700}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected invalid accepted status code error")
}
})
t.Run("required node label map contracts", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{" ": {"k": "v"}}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected empty required-node-label key error")
}
cfg = loadBaselineConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{"titan-23": {}}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected empty required-node-label map error")
}
cfg = loadBaselineConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{"titan-23": {"zone": " "}}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected empty required-node-label value error")
}
})
t.Run("missing auth fields", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.Realm = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing realm error")
}
cfg = loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.RobotUsername = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing robot username error")
}
cfg = loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.AdminSecretNamespace = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing admin secret namespace error")
}
cfg = loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.AdminSecretName = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing admin secret name error")
}
cfg = loadBaselineConfig(t)
cfg.Startup.ServiceChecklistAuth.AdminSecretUsernameKey = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing admin secret username key error")
}
})
t.Run("service checklist missing url", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Startup.ServiceChecklist[0].URL = " "
if err := cfg.Validate(); err == nil {
t.Fatalf("expected missing checklist URL error")
}
})
t.Run("coordination and state contracts", func(t *testing.T) {
cfg := loadBaselineConfig(t)
cfg.Coordination.ForwardShutdownHost = "titan-24"
cfg.Coordination.ForwardShutdownConfig = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected forward-shutdown config error")
}
cfg = loadBaselineConfig(t)
cfg.Coordination.PeerHosts = []string{"titan-24", " "}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected peer host empty entry error")
}
cfg = loadBaselineConfig(t)
cfg.Coordination.Role = "invalid"
if err := cfg.Validate(); err == nil {
t.Fatalf("expected invalid coordination role error")
}
cfg = loadBaselineConfig(t)
cfg.State.ReportsDir = ""
if err := cfg.Validate(); err == nil {
t.Fatalf("expected state reports_dir required error")
}
})
}

View File

@ -101,9 +101,18 @@ func TestPerFileCoverageReport(t *testing.T) {
root := repoRoot(t)
tmp := t.TempDir()
rootCover := filepath.Join(tmp, "ananke.root.cover.out")
configCover := filepath.Join(tmp, "ananke.testing.config.cover.out")
testingCover := filepath.Join(tmp, "ananke.testing.cover.out")
runCoverageCommand(t, root, rootCover, "./...")
runCoverageCommand(
t,
filepath.Join(root, "testing"),
configCover,
"./config",
"-coverpkg=scm.bstein.dev/bstein/ananke/...",
)
runCoverageCommand(
t,
filepath.Join(root, "testing"),
@ -118,6 +127,7 @@ func TestPerFileCoverageReport(t *testing.T) {
blocks := map[string]coverageBlock{}
parseCoverageProfile(t, rootCover, blocks)
parseCoverageProfile(t, configCover, blocks)
parseCoverageProfile(t, testingCover, blocks)
byFile := map[string]*fileCoverage{}

View File

@ -279,8 +279,8 @@ func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
_, _, probeErr := orchBodyErr.TestHookHTTPChecklistProbe(context.Background(), config.ServiceChecklistCheck{
URL: "http://" + ln.Addr().String() + "/health",
})
if probeErr == nil || !strings.Contains(probeErr.Error(), "read response body") {
t.Fatalf("expected checklist body read-error branch, got %v", probeErr)
if probeErr == nil || (!strings.Contains(probeErr.Error(), "read response body") && !strings.Contains(probeErr.Error(), "request failed")) {
t.Fatalf("expected checklist probe failure branch, got %v", probeErr)
}
cfgStability := lifecycleConfig(t)

View File

@ -0,0 +1,536 @@
package orchestrator
import (
"context"
"encoding/base64"
"errors"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// testSecretJSON returns a JSON document of the form
// {"data":{"username":"...","password":"..."}} in which both values are the
// standard base64 encodings of the given strings.
// Signature: testSecretJSON(username, password string) string.
// Why: lets tests fabricate the kubectl secret output consumed by the auth flow.
func testSecretJSON(username, password string) string {
	encodedUser := base64.StdEncoding.EncodeToString([]byte(username))
	encodedPass := base64.StdEncoding.EncodeToString([]byte(password))
	return fmt.Sprintf(`{"data":{"username":"%s","password":"%s"}}`, encodedUser, encodedPass)
}
// authSettings returns checklist auth settings in keycloak_robotuser mode that
// point at baseURL, with fixed realm, robot username, and admin secret
// coordinates.
// Signature: authSettings(baseURL string) config.ServiceChecklistAuthSettings.
// Why: gives the auth tests one shared fixture so only the Keycloak URL varies.
func authSettings(baseURL string) config.ServiceChecklistAuthSettings {
	var settings config.ServiceChecklistAuthSettings

	settings.Mode = "keycloak_robotuser"
	settings.KeycloakBaseURL = baseURL
	settings.Realm = "atlas"
	settings.RobotUsername = "robotuser"

	// Location of the admin credentials secret and the keys inside it.
	settings.AdminSecretNamespace = "sso"
	settings.AdminSecretName = "keycloak-admin"
	settings.AdminSecretUsernameKey = "username"
	settings.AdminSecretPasswordKey = "password"

	return settings
}
// TestHookServiceAuthChecklistSuccess runs one robot-authenticated checklist
// probe against a fake Keycloak TLS server and a fake application TLS server
// and fails unless the probe reports ready.
// Signature: TestHookServiceAuthChecklistSuccess(t *testing.T).
// Why: validates full robotuser-authenticated checklist flow with final URL and
// body markers so startup gates reflect real post-login user behavior.
func TestHookServiceAuthChecklistSuccess(t *testing.T) {
	appMux := http.NewServeMux()
	// Target of the impersonation redirect: hands out the session cookie.
	appMux.HandleFunc("/session/bootstrap", func(w http.ResponseWriter, _ *http.Request) {
		http.SetCookie(w, &http.Cookie{Name: "robot_session", Value: "ok", Path: "/"})
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("bootstrap ok"))
	})
	appMux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/":
			http.Redirect(w, r, "/app/home", http.StatusFound)
			return
		case "/oauth2/sign_in":
			// Served before the cookie check. If it came after, a request
			// without a session would be redirected to this same path over and
			// over, and a broken login would surface as a redirect-limit error
			// instead of a final URL that contains /oauth2/sign_in.
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("sign in"))
			return
		}

		// Every other path requires the session cookie.
		cookie, err := r.Cookie("robot_session")
		if err != nil || strings.TrimSpace(cookie.Value) == "" {
			http.Redirect(w, r, "/oauth2/sign_in", http.StatusFound)
			return
		}
		if r.URL.Path == "/app/home" {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("OpenSearch Dashboards"))
			return
		}
		w.WriteHeader(http.StatusNotFound)
	})
	appServer := httptest.NewTLSServer(appMux)
	defer appServer.Close()

	// Fake Keycloak: admin token, robot user lookup, and an impersonation
	// response whose redirect points at the app's bootstrap endpoint.
	kcMux := http.NewServeMux()
	kcMux.HandleFunc("/realms/master/protocol/openid-connect/token", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"access_token":"admin-token"}`))
	})
	kcMux.HandleFunc("/admin/realms/atlas/users", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`[{"id":"robot-id"}]`))
	})
	kcMux.HandleFunc("/admin/realms/atlas/users/robot-id/impersonation", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = fmt.Fprintf(w, `{"redirect":"%s/session/bootstrap"}`, appServer.URL)
	})
	kcServer := httptest.NewTLSServer(kcMux)
	defer kcServer.Close()

	cfg := lifecycleConfig(t)
	cfg.Startup.ServiceChecklistAuth = authSettings(kcServer.URL)

	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// Answer the admin-secret lookup locally; everything else goes to the
	// shared dispatcher.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(command, "-n sso get secret keycloak-admin -o json") {
			recorder.record(name, args)
			return testSecretJSON("admin", "password"), nil
		}
		return base(ctx, timeout, name, args...)
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)

	check := config.ServiceChecklistCheck{
		Name:                "logs-ui-user-session",
		URL:                 appServer.URL + "/",
		AcceptedStatuses:    []int{200},
		RequireRobotAuth:    true,
		FollowRedirects:     true,
		InsecureSkipTLS:     true,
		FinalURLContains:    "/app/home",
		FinalURLNotContains: "/oauth2/sign_in",
		BodyContains:        "OpenSearch Dashboards",
		TimeoutSeconds:      5,
	}
	ok, detail := orch.TestHookServiceCheckReady(context.Background(), check)
	if !ok {
		t.Fatalf("expected authenticated checklist success, detail=%q", detail)
	}
}
// TestHookServiceAuthModeAndSecretErrors walks the failure paths of the
// checklist auth hooks: auth mode "none", an unsupported mode, and a series of
// stubbed kubectl responses (command error, malformed JSON, missing keys,
// invalid base64, blank decoded value). It ends with direct checks of the body
// compaction and base-URL normalization hooks in the cluster package.
// Signature: TestHookServiceAuthModeAndSecretErrors(t *testing.T).
// Why: covers auth mode guards and secret decode error branches to keep startup
// failures explicit when robot-auth prerequisites are missing.
func TestHookServiceAuthModeAndSecretErrors(t *testing.T) {
	cfg := lifecycleConfig(t)
	client := &http.Client{Timeout: time.Second}
	bg := context.Background()

	// Auth mode "none".
	noneCfg := lifecycleConfig(t)
	noneCfg.Startup.ServiceChecklistAuth.Mode = "none"
	noneOrch, _ := newHookOrchestrator(t, noneCfg, nil, nil)
	if err := noneOrch.TestHookAuthenticateRobotChecklistSession(bg, client); err == nil {
		t.Fatalf("expected auth mode none to fail")
	}
	if _, err := noneOrch.TestHookChecklistAuthHTTPClient(bg, time.Second, false); err == nil {
		t.Fatalf("expected checklist auth client init to fail when mode=none")
	}

	// Unsupported auth mode.
	badModeCfg := lifecycleConfig(t)
	badModeCfg.Startup.ServiceChecklistAuth.Mode = "bad-mode"
	badModeOrch, _ := newHookOrchestrator(t, badModeCfg, nil, nil)
	if err := badModeOrch.TestHookAuthenticateRobotChecklistSession(bg, client); err == nil {
		t.Fatalf("expected unsupported auth mode to fail")
	}

	// stubKubectl builds a command runner that answers every kubectl call with
	// the given output and error and sends all other commands to the shared
	// dispatcher.
	fallback := lifecycleDispatcher(&commandRecorder{})
	stubKubectl := func(output string, failure error) func(context.Context, time.Duration, string, ...string) (string, error) {
		return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			if name != "kubectl" {
				return fallback(ctx, timeout, name, args...)
			}
			return output, failure
		}
	}

	// kubectl itself fails.
	deniedRun := stubKubectl("", errors.New("kubectl denied"))
	deniedOrch, _ := newHookOrchestrator(t, cfg, deniedRun, deniedRun)
	if _, err := deniedOrch.TestHookKubernetesSecretValue(bg, "sso", "keycloak-admin", "username"); err == nil {
		t.Fatalf("expected kubectl error branch")
	}
	if _, _, err := deniedOrch.TestHookKeycloakAdminCredentials(bg, cfg.Startup.ServiceChecklistAuth); err == nil {
		t.Fatalf("expected keycloakAdminCredentials to fail on username secret lookup")
	}
	if err := deniedOrch.TestHookAuthenticateRobotChecklistSession(bg, client); err == nil {
		t.Fatalf("expected auth session failure when secret lookup fails")
	}

	// kubectl returns malformed JSON.
	malformedRun := stubKubectl("{bad", nil)
	malformedOrch, _ := newHookOrchestrator(t, cfg, malformedRun, malformedRun)
	if _, err := malformedOrch.TestHookKubernetesSecretValue(bg, "sso", "keycloak-admin", "username"); err == nil {
		t.Fatalf("expected secret decode error branch")
	}

	// Secret has a password entry but no username entry.
	noUsernameRun := stubKubectl(`{"data":{"password":"cGFzcw=="}}`, nil)
	noUsernameOrch, _ := newHookOrchestrator(t, cfg, noUsernameRun, noUsernameRun)
	if _, err := noUsernameOrch.TestHookKubernetesSecretValue(bg, "sso", "keycloak-admin", "username"); err == nil {
		t.Fatalf("expected missing key branch")
	}
	if err := noUsernameOrch.TestHookAuthenticateRobotChecklistSession(bg, client); err == nil {
		t.Fatalf("expected auth session failure when username key is missing")
	}

	// Secret has a username entry but no password entry.
	noPasswordRun := stubKubectl(`{"data":{"username":"YWRtaW4="}}`, nil)
	noPasswordOrch, _ := newHookOrchestrator(t, cfg, noPasswordRun, noPasswordRun)
	if _, _, err := noPasswordOrch.TestHookKeycloakAdminCredentials(bg, cfg.Startup.ServiceChecklistAuth); err == nil {
		t.Fatalf("expected keycloakAdminCredentials to fail on password secret lookup")
	}
	if err := noPasswordOrch.TestHookAuthenticateRobotChecklistSession(bg, client); err == nil {
		t.Fatalf("expected auth session failure when password key is missing")
	}

	// Username value is not valid base64.
	badBase64Run := stubKubectl(`{"data":{"username":"###"}}`, nil)
	badBase64Orch, _ := newHookOrchestrator(t, cfg, badBase64Run, badBase64Run)
	if _, err := badBase64Orch.TestHookKubernetesSecretValue(bg, "sso", "keycloak-admin", "username"); err == nil {
		t.Fatalf("expected base64 decode branch")
	}

	// Username value decodes to a single space.
	blankRun := stubKubectl(`{"data":{"username":"IA=="}}`, nil)
	blankOrch, _ := newHookOrchestrator(t, cfg, blankRun, blankRun)
	if _, err := blankOrch.TestHookKubernetesSecretValue(bg, "sso", "keycloak-admin", "username"); err == nil {
		t.Fatalf("expected empty decoded value branch")
	}

	// Pure helpers exposed by the cluster package.
	if compacted := cluster.TestHookCompactHTTPBody([]byte(" hello world \n test ")); compacted != "hello world test" {
		t.Fatalf("unexpected compact body %q", compacted)
	}
	if compacted := cluster.TestHookCompactHTTPBody([]byte(" \n\t ")); compacted != "" {
		t.Fatalf("expected compact empty body, got %q", compacted)
	}
	trailingSlash := config.ServiceChecklistAuthSettings{KeycloakBaseURL: "https://sso.bstein.dev/"}
	if normalized := cluster.TestHookKeycloakBaseURL(trailingSlash); normalized != "https://sso.bstein.dev" {
		t.Fatalf("unexpected normalized base URL %q", normalized)
	}
}
// TestHookServiceAuthHTTPErrorBranches drives the Keycloak admin-token,
// robot-user lookup, and impersonation hooks through their error paths.
// Signature: TestHookServiceAuthHTTPErrorBranches(t *testing.T).
// Why: each hook must surface an error for unusable base URLs, failed
// requests, non-2xx statuses, undecodable bodies, and empty results, so that
// startup diagnostics stay actionable when authentication breaks.
func TestHookServiceAuthHTTPErrorBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	orch, _ := newHookOrchestrator(t, cfg, nil, nil)
	client := &http.Client{Timeout: 2 * time.Second}
	ctx := context.Background()

	// Route predicates shared by every fake Keycloak server below. The
	// user-lookup predicate also requires the username query so that it does
	// not swallow impersonation requests, whose path contains "/users" too.
	isToken := func(r *http.Request) bool {
		return strings.Contains(r.URL.Path, "/token")
	}
	isRobotLookup := func(r *http.Request) bool {
		return strings.Contains(r.URL.Path, "/users") && strings.Contains(r.URL.RawQuery, "username=robotuser")
	}
	isImpersonation := func(r *http.Request) bool {
		return strings.Contains(r.URL.Path, "/impersonation")
	}
	reply := func(w http.ResponseWriter, status int, body string) {
		w.WriteHeader(status)
		_, _ = w.Write([]byte(body))
	}

	// The three hooks under test, reduced to their error result. They read
	// the current value of auth, which each scenario reassigns.
	auth := authSettings("://bad-url")
	adminToken := func() error {
		_, err := orch.TestHookKeycloakAdminToken(ctx, client, auth, "admin", "pw")
		return err
	}
	robotUserID := func() error {
		_, err := orch.TestHookKeycloakRobotUserID(ctx, client, auth, "token")
		return err
	}
	impersonation := func() error {
		_, err := orch.TestHookKeycloakImpersonationRedirect(ctx, client, auth, "token", "robot")
		return err
	}
	wantErr := func(err error, msg string) {
		t.Helper()
		if err == nil {
			t.Fatal(msg)
		}
	}

	// Scenario 1: the base URL cannot be turned into a request.
	wantErr(adminToken(), "expected request-build failure for bad base URL")
	wantErr(robotUserID(), "expected robot-user request-build failure for bad base URL")
	wantErr(impersonation(), "expected impersonation request-build failure for bad base URL")

	// Scenario 2: well-formed URL on loopback port 1, where the request
	// itself is expected to fail.
	auth = authSettings("http://127.0.0.1:1")
	wantErr(adminToken(), "expected admin token request error branch")
	wantErr(robotUserID(), "expected robot user request error branch")
	wantErr(impersonation(), "expected impersonation request error branch")

	// Scenario 3: the server answers token and user lookups with non-2xx.
	kcError := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if isToken(r) {
			reply(w, http.StatusUnauthorized, `{"error":"unauthorized"}`)
			return
		}
		if isRobotLookup(r) {
			reply(w, http.StatusInternalServerError, `{"error":"boom"}`)
			return
		}
		w.WriteHeader(http.StatusBadGateway)
	}))
	defer kcError.Close()
	auth = authSettings(kcError.URL)
	wantErr(adminToken(), "expected non-2xx token branch")
	wantErr(robotUserID(), "expected non-2xx robot user branch")

	// Scenario 4: every endpoint answers 200 with a body that is not JSON.
	kcDecode := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if isToken(r) || isRobotLookup(r) || isImpersonation(r) {
			reply(w, http.StatusOK, "not-json")
			return
		}
		w.WriteHeader(http.StatusNotFound)
	}))
	defer kcDecode.Close()
	auth = authSettings(kcDecode.URL)
	wantErr(adminToken(), "expected token decode error branch")
	wantErr(robotUserID(), "expected robot user decode error branch")
	wantErr(impersonation(), "expected impersonation decode error branch")

	// Scenario 5: valid JSON that carries no usable data (empty token, empty
	// user list), plus a 400 from the impersonation endpoint.
	kcMissing := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if isToken(r) {
			reply(w, http.StatusOK, `{"access_token":""}`)
			return
		}
		if isRobotLookup(r) {
			reply(w, http.StatusOK, `[]`)
			return
		}
		if isImpersonation(r) {
			reply(w, http.StatusBadRequest, `{"error":"bad request"}`)
			return
		}
		w.WriteHeader(http.StatusNotFound)
	}))
	defer kcMissing.Close()
	auth = authSettings(kcMissing.URL)
	wantErr(adminToken(), "expected missing access_token branch")
	wantErr(robotUserID(), "expected missing robot user branch")
	wantErr(impersonation(), "expected impersonation non-2xx branch")
}
// TestHookServiceChecklistProbeBranches checks the checklist probe hook when
// robot auth cannot be initialized and when redirects are not followed.
// Signature: TestHookServiceChecklistProbeBranches(t *testing.T).
// Why: a check that requires robot auth must fail while auth mode is "none",
// and a probe with FollowRedirects disabled must report the 302 itself, its
// Location header, and the originally requested URL as the final URL.
func TestHookServiceChecklistProbeBranches(t *testing.T) {
	ctx := context.Background()

	// Branch 1: robot auth is required by the check but disabled in config.
	authDisabledCfg := lifecycleConfig(t)
	authDisabledCfg.Startup.ServiceChecklistAuth.Mode = "none"
	authDisabledOrch, _ := newHookOrchestrator(t, authDisabledCfg, nil, nil)
	robotCheck := config.ServiceChecklistCheck{
		URL:              "https://example.invalid/",
		RequireRobotAuth: true,
		TimeoutSeconds:   1,
	}
	_, _, _, _, authErr := authDisabledOrch.TestHookHTTPChecklistProbeWithLocation(ctx, robotCheck)
	if authErr == nil {
		t.Fatalf("expected robot auth initialization failure when mode=none")
	}

	// Branch 2: the server always redirects and the probe must not follow.
	alwaysRedirect := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "/next", http.StatusFound)
	})
	redirectServer := httptest.NewServer(alwaysRedirect)
	defer redirectServer.Close()

	orchNoAuth, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
	redirectCheck := config.ServiceChecklistCheck{
		URL:             redirectServer.URL,
		FollowRedirects: false,
		TimeoutSeconds:  2,
	}
	status, _, location, finalURL, err := orchNoAuth.TestHookHTTPChecklistProbeWithLocation(ctx, redirectCheck)
	switch {
	case err != nil:
		t.Fatalf("unexpected redirect probe error: %v", err)
	case status != http.StatusFound:
		t.Fatalf("expected 302 status when redirects disabled, got %d", status)
	case !strings.Contains(location, "/next"):
		t.Fatalf("expected location header for redirect response, got %q", location)
	case !strings.Contains(finalURL, redirectServer.URL):
		t.Fatalf("expected final URL to remain original request URL, got %q", finalURL)
	}
}
// TestHookAuthenticateRobotChecklistSessionFailureStages fails the robot
// session bootstrap at each stage that follows a successful admin-token call.
// Signature: TestHookAuthenticateRobotChecklistSessionFailureStages(t *testing.T).
// Why: drives authenticateRobotChecklistSession through downstream error stages
// (robot lookup, impersonation, redirect-build, redirect-request) to maintain
// resilient startup diagnostics.
func TestHookAuthenticateRobotChecklistSessionFailureStages(t *testing.T) {
	client := &http.Client{Timeout: 3 * time.Second}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// secretRun answers the keycloak-admin secret lookup with fixed
	// credentials and hands every other command to the lifecycle dispatcher.
	secretRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(command, "-n sso get secret keycloak-admin -o json") {
			return testSecretJSON("admin", "password"), nil
		}
		return base(ctx, timeout, name, args...)
	}

	const robotUserList = `[{"id":"robot-id"}]`
	// Each stage describes how the fake Keycloak answers the user lookup and
	// the impersonation call; the token endpoint always succeeds.
	stages := []struct {
		name                string
		usersStatus         int
		usersBody           string
		impersonationStatus int
		impersonationBody   string
		branch              string // inserted into the failure message
	}{
		{
			name:                "robot-user lookup failure",
			usersStatus:         http.StatusBadGateway,
			usersBody:           `{"error":"lookup failed"}`,
			impersonationStatus: http.StatusOK,
			branch:              "robot-user lookup failure",
		},
		{
			name:                "impersonation failure",
			usersStatus:         http.StatusOK,
			usersBody:           robotUserList,
			impersonationStatus: http.StatusBadGateway,
			impersonationBody:   `{"error":"impersonation failed"}`,
			branch:              "impersonation failure",
		},
		{
			name:                "redirect url build failure",
			usersStatus:         http.StatusOK,
			usersBody:           robotUserList,
			impersonationStatus: http.StatusOK,
			impersonationBody:   `{"redirect":"://bad"}`,
			branch:              "redirect request-build failure",
		},
		{
			name:                "redirect request failure",
			usersStatus:         http.StatusOK,
			usersBody:           robotUserList,
			impersonationStatus: http.StatusOK,
			impersonationBody:   `{"redirect":"http://127.0.0.1:1/nowhere"}`,
			branch:              "redirect request failure",
		},
	}

	for _, stage := range stages {
		t.Run(stage.name, func(t *testing.T) {
			kc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				switch {
				case strings.Contains(r.URL.Path, "/token"):
					_, _ = w.Write([]byte(`{"access_token":"admin-token"}`))
				case strings.Contains(r.URL.Path, "/impersonation"):
					// Must be matched before "/users": the impersonation
					// endpoint is .../users/<id>/impersonation, so a
					// "/users"-first switch would answer it with the user
					// list and never reach the stage under test.
					w.WriteHeader(stage.impersonationStatus)
					_, _ = w.Write([]byte(stage.impersonationBody))
				case strings.Contains(r.URL.Path, "/users"):
					w.WriteHeader(stage.usersStatus)
					_, _ = w.Write([]byte(stage.usersBody))
				default:
					w.WriteHeader(http.StatusOK)
				}
			}))
			defer kc.Close()

			cfg := lifecycleConfig(t)
			cfg.Startup.ServiceChecklistAuth = authSettings(kc.URL)
			orch, _ := newHookOrchestrator(t, cfg, secretRun, secretRun)
			if err := orch.TestHookAuthenticateRobotChecklistSession(context.Background(), client); err == nil {
				t.Fatalf("expected %s branch", stage.branch)
			}
		})
	}
}
// TestHookServiceAuthFallbackRedirect bootstraps a robot session against a
// TLS Keycloak fake whose impersonation response carries an empty redirect.
// Signature: TestHookServiceAuthFallbackRedirect(t *testing.T).
// Why: covers empty impersonation redirect fallback to realm account URL so
// session bootstrap is resilient to Keycloak response shape differences.
func TestHookServiceAuthFallbackRedirect(t *testing.T) {
	// Every route answers 200 with a fixed body. The impersonation body has
	// an empty redirect, and the realm account page is served as well.
	routes := []struct {
		pattern string
		body    string
	}{
		{"/realms/master/protocol/openid-connect/token", `{"access_token":"admin-token"}`},
		{"/admin/realms/atlas/users", `[{"id":"robot-id"}]`},
		{"/admin/realms/atlas/users/robot-id/impersonation", `{"redirect":""}`},
		{"/realms/atlas/account/", "account ok"},
	}
	kcMux := http.NewServeMux()
	for _, route := range routes {
		body := route.body // per-iteration copy; the handler outlives the loop
		kcMux.HandleFunc(route.pattern, func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(body))
		})
	}
	kcServer := httptest.NewTLSServer(kcMux)
	defer kcServer.Close()

	cfg := lifecycleConfig(t)
	cfg.Startup.ServiceChecklistAuth = authSettings(kcServer.URL)

	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// run serves the keycloak-admin secret lookup from a fixture and
	// delegates every other command to the lifecycle dispatcher.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		isSecretLookup := name == "kubectl" &&
			strings.Contains(name+" "+strings.Join(args, " "), "-n sso get secret keycloak-admin -o json")
		if !isSecretLookup {
			return base(ctx, timeout, name, args...)
		}
		return testSecretJSON("admin", "password"), nil
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)
	ctx := context.Background()

	// A client with a default transport does not trust the test server's
	// certificate, so the bootstrap is expected to fail.
	strictClient := &http.Client{Timeout: 4 * time.Second, Transport: &http.Transport{}}
	if err := orch.TestHookAuthenticateRobotChecklistSession(ctx, strictClient); err == nil {
		t.Fatalf("expected auth bootstrap without TLS skip to fail against TLS test server")
	}

	// NOTE(review): the trailing true presumably enables TLS verification
	// skip on the hook-built client; confirm against the hook's signature.
	if _, err := orch.TestHookChecklistAuthHTTPClient(ctx, 4*time.Second, true); err != nil {
		t.Fatalf("expected checklist auth client fallback redirect path success, got %v", err)
	}
}