// ananke/internal/cluster/orchestrator_test.go

package cluster

import (
	"context"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"reflect"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestParseVaultSealed verifies that parseVaultSealed returns the boolean
// "sealed" field of a JSON status payload, for both true and false.
// Signature: TestParseVaultSealed(t *testing.T).
// Why: covering both values ensures the parser reads the field rather than
// returning a fixed answer.
func TestParseVaultSealed(t *testing.T) {
	const (
		sealedPayload   = `{"initialized":true,"sealed":true}`
		unsealedPayload = `{"initialized":true,"sealed":false}`
	)

	// Each switch stops at the first failing condition, mirroring a
	// sequence of fatal checks.
	switch got, err := parseVaultSealed(sealedPayload); {
	case err != nil:
		t.Fatalf("parse sealed=true: %v", err)
	case !got:
		t.Fatal("expected sealed=true")
	}

	switch got, err := parseVaultSealed(unsealedPayload); {
	case err != nil:
		t.Fatalf("parse sealed=false: %v", err)
	case got:
		t.Fatal("expected sealed=false")
	}
}

// TestParseVaultSealedRejectsEmpty verifies that parseVaultSealed returns an
// error when the payload consists only of whitespace.
// Signature: TestParseVaultSealedRejectsEmpty(t *testing.T).
// Why: a blank status payload must surface as an error instead of a sealed
// or unsealed verdict.
func TestParseVaultSealedRejectsEmpty(t *testing.T) {
	if _, err := parseVaultSealed("   "); err != nil {
		return // an error is the expected outcome
	}
	t.Fatal("expected parse error for empty status payload")
}

// TestParseVaultSealedWithKubectlPreamble verifies that parseVaultSealed still
// reports sealed=true when the JSON payload is preceded by a non-JSON
// "Defaulted container ..." line.
// Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T).
// Why: the parser must tolerate a leading text line before the JSON document.
func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
	const (
		preamble = "Defaulted container \"vault\" out of: vault, setup-config (init)\n"
		status   = "{\"sealed\":true,\"initialized\":true}\n"
	)

	got, err := parseVaultSealed(preamble + status)
	switch {
	case err != nil:
		t.Fatalf("parse with preamble: %v", err)
	case !got:
		t.Fatal("expected sealed=true from payload with preamble")
	}
}

// TestFallbackWorkersFromInventoryUsesManagedNodes verifies that
// fallbackWorkersFromInventory derives the worker list from SSHManagedNodes.
// The expected result omits titan-0a, which is also listed in ControlPlanes,
// and is in lexical order rather than input order.
// Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T).
// Why: control-plane nodes must not be reported as fallback workers.
func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
	orch := &Orchestrator{log: log.New(os.Stdout, "", 0)}
	orch.cfg.ControlPlanes = []string{"titan-0a", "titan-0b", "titan-0c"}
	orch.cfg.SSHManagedNodes = []string{"titan-db", "titan-0a", "titan-15", "titan-17"}

	expected := []string{"titan-15", "titan-17", "titan-db"}
	actual := orch.fallbackWorkersFromInventory()
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("fallback workers mismatch: got=%v want=%v", actual, expected)
}

// TestFallbackWorkersFromInventoryFallsBackToHosts verifies that, with no
// SSHManagedNodes configured, fallbackWorkersFromInventory derives workers
// from the keys of SSHNodeHosts. The expected result omits titan-0a, which is
// listed in ControlPlanes, and is in lexical order.
// Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T).
// Why: map iteration order is random, so the expectation also pins a
// deterministic ordering of the result.
func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
	orch := &Orchestrator{log: log.New(os.Stdout, "", 0)}
	orch.cfg.ControlPlanes = []string{"titan-0a", "titan-0b", "titan-0c"}
	orch.cfg.SSHNodeHosts = map[string]string{
		"titan-0a": "192.168.22.11",
		"titan-22": "192.168.22.22",
		"titan-24": "192.168.22.26",
	}

	expected := []string{"titan-22", "titan-24"}
	actual := orch.fallbackWorkersFromInventory()
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("fallback workers mismatch: got=%v want=%v", actual, expected)
}

// TestIntentFreshTreatsZeroTimestampAsFresh runs one orchestration or CLI step.
// Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) {
	if !intentFresh(state.Intent{}, 30*time.Second) {
		t.Fatalf("zero updated_at intent should be treated as fresh")
	}
}

// TestIntentFreshRespectsAge verifies that intentFresh, given a 30-second
// window, rejects an intent updated two minutes ago and accepts one updated
// twenty seconds ago.
// Signature: TestIntentFreshRespectsAge(t *testing.T).
// Why: checks one age on each side of the window.
func TestIntentFreshRespectsAge(t *testing.T) {
	const window = 30 * time.Second

	old := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)}
	recent := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)}

	switch {
	case intentFresh(old, window):
		t.Fatal("expected stale intent to be considered not fresh")
	case !intentFresh(recent, window):
		t.Fatal("expected recent intent to be considered fresh")
	}
}

// TestCoordinationPeersDedupesAndIncludesForwardHost verifies that
// coordinationPeers yields each host once. The input repeats titan-24,
// contains a whitespace-only entry, and names titan-db both as a peer and as
// the forward-shutdown host; the expected result is [titan-24 titan-db].
// Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T).
// Why: duplicate or blank peer entries must not appear in the peer list.
func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
	orch := new(Orchestrator)
	orch.cfg.Coordination.PeerHosts = []string{"titan-24", "titan-db", "titan-24", " "}
	orch.cfg.Coordination.ForwardShutdownHost = "titan-db"

	expected := []string{"titan-24", "titan-db"}
	actual := orch.coordinationPeers()
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("coordination peers mismatch: got=%v want=%v", actual, expected)
}

// TestWorkloadTargetsIgnoredNodesByNodeSelector verifies that
// workloadTargetsIgnoredNodes returns true when the pod spec's nodeSelector
// sets kubernetes.io/hostname to a node present in the ignored set.
// Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T).
// Why: a hostname nodeSelector is one way a workload can target an ignored node.
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
	const node = "titan-22"

	var spec podSpec
	spec.NodeSelector = map[string]string{"kubernetes.io/hostname": node}
	ignoredNodes := map[string]struct{}{node: {}}

	if targeted := workloadTargetsIgnoredNodes(spec, ignoredNodes); targeted {
		return
	}
	t.Fatal("expected workload to target ignored node via nodeSelector")
}

// TestParseWorkloadIgnoreRules verifies that parseWorkloadIgnoreRules turns a
// "namespace/name" entry and a "namespace/kind/name" entry into two rules,
// and that workloadIgnored matches them as expected: the two-part rule
// matches a deployment of that name, while the three-part rule matches only
// the stated kind.
// Signature: TestParseWorkloadIgnoreRules(t *testing.T).
// Why: both rule forms are exercised, including a kind mismatch.
func TestParseWorkloadIgnoreRules(t *testing.T) {
	entries := []string{
		"maintenance/metis",
		"crypto/statefulset/monerod",
	}
	rules := parseWorkloadIgnoreRules(entries)
	if count := len(rules); count != 2 {
		t.Fatalf("expected 2 ignore rules, got %d", count)
	}

	// Cases are evaluated in order and the first failing one ends the test.
	switch {
	case !workloadIgnored(rules, "maintenance", "deployment", "metis"):
		t.Fatal("expected namespace/name rule to match")
	case !workloadIgnored(rules, "crypto", "statefulset", "monerod"):
		t.Fatal("expected namespace/kind/name rule to match")
	case workloadIgnored(rules, "crypto", "deployment", "monerod"):
		t.Fatal("did not expect mismatched kind to match")
	}
}

// TestNamespaceCandidatesFromIgnoreKustomizations verifies that
// namespaceCandidatesFromIgnoreKustomizations, given "flux-system/jellyfin"
// and "flux-system/outline", returns a collection keyed by "jellyfin" and
// "outline".
// Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T).
// Why: the part after the slash is expected to be offered as a namespace
// candidate.
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
	ignored := []string{
		"flux-system/jellyfin",
		"flux-system/outline",
	}
	candidates := namespaceCandidatesFromIgnoreKustomizations(ignored)

	_, hasJellyfin := candidates["jellyfin"]
	_, hasOutline := candidates["outline"]
	switch {
	case !hasJellyfin:
		t.Fatal("expected jellyfin namespace candidate")
	case !hasOutline:
		t.Fatal("expected outline namespace candidate")
	}
}

// TestProbeStatusAcceptedRejects404 verifies that probeStatusAccepted returns
// false for status 404 on https://metrics.bstein.dev/login.
// Signature: TestProbeStatusAcceptedRejects404(t *testing.T).
// Why: a 404 response must not count as a healthy probe.
func TestProbeStatusAcceptedRejects404(t *testing.T) {
	accepted := probeStatusAccepted("https://metrics.bstein.dev/login", 404)
	if !accepted {
		return
	}
	t.Fatal("expected 404 probe status to be rejected")
}

// TestParseFluxKustomizationTimeout verifies that
// parseFluxKustomizationTimeout converts "30m" and "5m30s" to the matching
// durations and returns zero for an empty or unparsable string.
// Signature: TestParseFluxKustomizationTimeout(t *testing.T).
// Why: invalid timeouts are expected to degrade to zero rather than fail.
func TestParseFluxKustomizationTimeout(t *testing.T) {
	halfHour := parseFluxKustomizationTimeout("30m")
	if halfHour != 30*time.Minute {
		t.Fatalf("expected 30m duration, got %s", halfHour)
	}

	compound := parseFluxKustomizationTimeout("5m30s")
	if compound != 5*time.Minute+30*time.Second {
		t.Fatalf("expected 5m30s duration, got %s", compound)
	}

	blank := parseFluxKustomizationTimeout("")
	if blank != 0 {
		t.Fatalf("expected zero duration for empty timeout, got %s", blank)
	}

	invalid := parseFluxKustomizationTimeout("not-a-duration")
	if invalid != 0 {
		t.Fatalf("expected zero duration for invalid timeout, got %s", invalid)
	}
}

// TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"database":"ok"}`))
	}))
	defer srv.Close()

	orch := &Orchestrator{
		log: log.New(os.Stdout, "", 0),
	}
	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
		Name:             "grafana-api",
		URL:              srv.URL,
		AcceptedStatuses: []int{200},
		BodyContains:     `"database":"ok"`,
		TimeoutSeconds:   5,
	})
	if !ok {
		t.Fatalf("expected service check to pass, detail=%s", detail)
	}
}

// TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("{\n  \"database\": \"ok\"\n}\n"))
	}))
	defer srv.Close()

	orch := &Orchestrator{
		log: log.New(os.Stdout, "", 0),
	}
	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
		Name:             "grafana-api",
		URL:              srv.URL,
		AcceptedStatuses: []int{200},
		BodyContains:     `"database":"ok"`,
		TimeoutSeconds:   5,
	})
	if !ok {
		t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail)
	}
}

// TestServiceCheckReadyRequiresLocationContains runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T).
// Why: startup checks must validate redirect targets for OIDC-gated services.
func TestServiceCheckReadyRequiresLocationContains(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs")
		w.WriteHeader(http.StatusFound)
	}))
	defer srv.Close()

	orch := &Orchestrator{
		log: log.New(os.Stdout, "", 0),
	}
	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
		Name:             "logging-oidc-redirect",
		URL:              srv.URL,
		AcceptedStatuses: []int{302},
		LocationContains: "client_id=logs",
		TimeoutSeconds:   5,
	})
	if !ok {
		t.Fatalf("expected location-aware service check to pass, detail=%s", detail)
	}
}

// TestServiceCheckReadyRejectsMissingLocationMarker runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T).
// Why: prevents false positives when redirects point somewhere unexpected.
func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong")
		w.WriteHeader(http.StatusFound)
	}))
	defer srv.Close()

	orch := &Orchestrator{
		log: log.New(os.Stdout, "", 0),
	}
	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
		Name:             "logging-oidc-redirect",
		URL:              srv.URL,
		AcceptedStatuses: []int{302},
		LocationContains: "client_id=logs",
		TimeoutSeconds:   5,
	})
	if ok {
		t.Fatalf("expected location-aware service check to fail")
	}
	if !strings.Contains(detail, "location header missing expected marker") {
		t.Fatalf("expected missing location marker detail, got %q", detail)
	}
}

// TestServiceCheckReadyRequiresFinalURLContains runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRequiresFinalURLContains(t *testing.T).
// Why: authenticated user-journey checks depend on final URL assertions after
// redirects complete, not only on initial response status.
func TestServiceCheckReadyRequiresFinalURLContains(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			http.Redirect(w, r, "/app/home", http.StatusFound)
			return
		}
		if r.URL.Path == "/app/home" {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("OpenSearch Dashboards"))
			return
		}
		w.WriteHeader(http.StatusNotFound)
	}))
	defer srv.Close()

	orch := &Orchestrator{
		log: log.New(os.Stdout, "", 0),
	}
	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
		Name:             "logging-ui-user-session",
		URL:              srv.URL,
		AcceptedStatuses: []int{200},
		FollowRedirects:  true,
		FinalURLContains: "/app/home",
		BodyContains:     "OpenSearch Dashboards",
		TimeoutSeconds:   5,
	})
	if !ok {
		t.Fatalf("expected final-url-aware service check to pass, detail=%s", detail)
	}
}

// TestServiceCheckReadyRejectsForbiddenFinalURLMarker runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRejectsForbiddenFinalURLMarker(t *testing.T).
// Why: user-session checks should fail when final URL indicates auth/login loop
// instead of the expected post-login app route.
func TestServiceCheckReadyRejectsForbiddenFinalURLMarker(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			http.Redirect(w, r, "/oauth2/sign_in", http.StatusFound)
			return
		}
		if r.URL.Path == "/oauth2/sign_in" {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("sign in"))
			return
		}
		w.WriteHeader(http.StatusNotFound)
	}))
	defer srv.Close()

	orch := &Orchestrator{
		log: log.New(os.Stdout, "", 0),
	}
	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
		Name:                "logging-ui-user-session",
		URL:                 srv.URL,
		AcceptedStatuses:    []int{200},
		FollowRedirects:     true,
		FinalURLNotContains: "/oauth2/sign_in",
		TimeoutSeconds:      5,
	})
	if ok {
		t.Fatalf("expected forbidden final-url marker check to fail")
	}
	if !strings.Contains(detail, "final url contained forbidden marker") {
		t.Fatalf("expected final-url forbidden marker detail, got %q", detail)
	}
}

// TestChecklistFailureHostFromIngressDetail verifies that
// checklistFailureHost, on an Orchestrator with no configuration, returns
// "cloud.bstein.dev" for a failure detail that begins with that hostname
// followed by a colon.
// Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
// Why: a hostname-prefixed detail should resolve without any configured
// service checks.
func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
	const detail = "cloud.bstein.dev: unexpected status code=500"

	orch := new(Orchestrator)
	host := orch.checklistFailureHost(detail)
	if host == "cloud.bstein.dev" {
		return
	}
	t.Fatalf("expected host cloud.bstein.dev, got %q", host)
}

// TestChecklistFailureHostFromServiceCheckName verifies that
// checklistFailureHost returns "registry.bstein.dev" when the failure detail
// starts with the name of a configured service check whose URL is on that
// host.
// Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T).
// Why: failures reported by check name should resolve to the host of the
// check's configured URL.
func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
	orch := new(Orchestrator)
	orch.cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{Name: "harbor-registry", URL: "https://registry.bstein.dev/v2/"},
	}

	host := orch.checklistFailureHost("harbor-registry: unexpected status code=404")
	if host == "registry.bstein.dev" {
		return
	}
	t.Fatalf("expected host registry.bstein.dev, got %q", host)
}

// TestChecklistFailureHostUnknown verifies two outcomes of
// checklistFailureHost against a single configured check: a detail prefixed
// with the configured check name yields the host of its URL, and a detail
// prefixed with an unconfigured name yields an empty string.
// Signature: TestChecklistFailureHostUnknown(t *testing.T).
// Why: an unmapped check name must produce no host rather than a guess.
func TestChecklistFailureHostUnknown(t *testing.T) {
	orch := new(Orchestrator)
	orch.cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{Name: "grafana-api", URL: "https://metrics.bstein.dev/api/health"},
	}

	mapped := orch.checklistFailureHost("grafana-api: tcp timeout")
	if mapped != "metrics.bstein.dev" {
		t.Fatalf("expected metrics host from configured URL, got %q", mapped)
	}

	unmapped := orch.checklistFailureHost("some-unmapped-check: fail")
	if unmapped != "" {
		t.Fatalf("expected empty host for unknown check, got %q", unmapped)
	}
}