2026-04-08 23:52:29 -03:00
package cluster
import (
"context"
"errors"
"io"
"log"
"net"
"path/filepath"
"strings"
2026-06-18 22:08:14 -03:00
"sync"
2026-04-08 23:52:29 -03:00
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// commandStub pairs a command matcher with the canned result handed back when
// the matcher accepts a dispatched command.
type commandStub struct {
// match reports whether this stub should answer the given command name and args.
match func ( name string , args [ ] string ) bool
// out is the output returned for a matching command.
out string
// err is the error returned for a matching command.
err error
}
// buildOrchestratorWithStubs assembles an Orchestrator whose command execution is answered by stubs.
// Signature: buildOrchestratorWithStubs(t *testing.T, cfg config.Config, stubs []commandStub) *Orchestrator.
// Why: helper centralizes deterministic command dispatch for fast, isolated unit tests.
//
// Empty State.Dir, State.ReportsDir, and State.RunHistoryPath values are filled
// in from a per-test temporary directory. Each dispatched command is answered
// by the first stub whose match function accepts it; a command no stub accepts
// yields an empty string and a nil error. The same dispatcher is installed as
// both runOverride and runSensitiveOverride.
func buildOrchestratorWithStubs(t *testing.T, cfg config.Config, stubs []commandStub) *Orchestrator {
	t.Helper()

	// cfg is a by-value copy, so defaulting its paths here never touches the caller's config.
	paths := &cfg.State
	if paths.Dir == "" {
		paths.Dir = t.TempDir()
	}
	if paths.ReportsDir == "" {
		paths.ReportsDir = filepath.Join(paths.Dir, "reports")
	}
	if paths.RunHistoryPath == "" {
		paths.RunHistoryPath = filepath.Join(paths.Dir, "runs.json")
	}

	// respond walks the stubs in declaration order; the first match wins.
	respond := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		for i := range stubs {
			if stubs[i].match(name, args) {
				return stubs[i].out, stubs[i].err
			}
		}
		return "", nil
	}

	built := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(cfg.State.RunHistoryPath),
		log:    log.New(io.Discard, "", 0),
	}
	built.runOverride = respond
	built.runSensitiveOverride = respond
	return built
}
// matchContains builds a matcher for an exact command name plus required argument substrings.
// Signature: matchContains(cmd string, parts ...string) func(string, []string) bool.
// Why: concise substring matching keeps command stubs readable across many tests.
//
// The returned function reports true only when name equals cmd and every part
// occurs as a substring of the arguments joined with single spaces. With no
// parts, any argument list for cmd matches.
func matchContains(cmd string, parts ...string) func(string, []string) bool {
	return func(name string, args []string) bool {
		matched := name == cmd
		if !matched {
			return false
		}
		line := strings.Join(args, " ")
		// Stop at the first missing part; one miss rejects the command.
		for i := 0; matched && i < len(parts); i++ {
			matched = strings.Contains(line, parts[i])
		}
		return matched
	}
}
2026-06-18 22:08:14 -03:00
// TestStartupEarlyFailureLeavesFluxSuspensionUnchanged checks that a startup
// aborted by a required-node-label error never dispatches a Flux resume command.
// Signature: TestStartupEarlyFailureLeavesFluxSuspensionUnchanged(t *testing.T).
// Why: recovery must not release Flux when bootstrap fails before storage and
// critical workloads are ready, or Flux can re-create the same dependency loop.
func TestStartupEarlyFailureLeavesFluxSuspensionUnchanged(t *testing.T) {
	root := t.TempDir()
	paths := config.State{
		Dir:            root,
		ReportsDir:     filepath.Join(root, "reports"),
		RunHistoryPath: filepath.Join(root, "runs.json"),
		LockPath:       filepath.Join(root, "ananke.lock"),
		IntentPath:     filepath.Join(root, "intent.json"),
	}
	startup := config.Startup{
		APIWaitSeconds:              1,
		APIPollSeconds:              1,
		RequireNodeInventoryReach:   false,
		RequireTimeSync:             false,
		RequireNodeSSHAuth:          false,
		ReconcileAccessOnBoot:       false,
		AutoEtcdRestoreOnAPIFailure: false,
		RequiredNodeLabels: map[string]map[string]string{
			"titan-missing": {
				"node-role.kubernetes.io/worker": "true",
			},
		},
	}
	cfg := config.Config{
		SSHPort: 2277,
		Startup: startup,
		State:   paths,
	}

	var (
		guard sync.Mutex
		seen  []string
	)
	// The recorder is listed first and never matches, so it sees each
	// dispatched command before the remaining stubs are consulted.
	recorder := commandStub{
		match: func(name string, args []string) bool {
			line := name + " " + strings.Join(args, " ")
			guard.Lock()
			seen = append(seen, line)
			guard.Unlock()
			return false
		},
	}
	orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
		recorder,
		{match: matchContains("kubectl", "version", "--request-timeout=5s"), out: "ok"},
		{match: matchContains("kubectl", "-n", "vault", "get", "pod", "vault-0"), out: "Pending"},
		{
			match: matchContains("kubectl", "label", "node", "titan-missing"),
			err:   errors.New(` nodes "titan-missing" not found `),
		},
	})

	err := orch.Startup(context.Background(), StartupOptions{Reason: "test early failure"})
	switch {
	case err == nil:
		t.Fatalf("expected startup to fail before flux resume")
	case !strings.Contains(err.Error(), "ensure required node labels on titan-missing"):
		t.Fatalf("expected required-label failure, got: %v", err)
	}

	guard.Lock()
	defer guard.Unlock()
	for i := range seen {
		if strings.Contains(seen[i], ` "suspend":false `) {
			t.Fatalf("early failed startup unexpectedly resumed flux via call: %s", seen[i])
		}
	}
}
// TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T).
// Why: Pending Longhorn-backed pods on Longhorn-unready nodes should be
// rescheduled without mutating Longhorn volume, replica, or disk objects.
func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods ( t * testing . T ) {
created := time . Now ( ) . Add ( - 10 * time . Minute ) . UTC ( ) . Format ( time . RFC3339 )
lastSeen := time . Now ( ) . UTC ( ) . Format ( time . RFC3339 )
pods := ` { "items":[ { "metadata": { "namespace":"monitoring","name":"victoria-metrics-single-server-0","creationTimestamp":" ` + created + ` ","ownerReferences":[ { "kind":"StatefulSet","name":"victoria-metrics-single-server"}]},"spec": { "nodeName":"titan-0b"},"status": { "phase":"Pending"}}]} `
events := ` { "items":[ { "metadata": { "namespace":"monitoring","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"monitoring","name":"victoria-metrics-single-server-0"},"type":"Warning","reason":"FailedAttachVolume","message":"AttachVolume.Attach failed for volume \"pvc-1\" : rpc error from [http://longhorn-backend:9500/v1/volumes/pvc-1?action=attach]: unable to attach volume pvc-1 to titan-0b: node titan-0b is not ready","lastTimestamp":" ` + lastSeen + ` "}]} `
deleted := false
orch := buildOrchestratorWithStubs ( t , config . Config {
Startup : config . Startup { StuckPodGraceSeconds : 180 } ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "pods" , "-A" , "-o" , "json" ) , out : pods } ,
{ match : matchContains ( "kubectl" , "-n" , "longhorn-system" , "get" , "nodes.longhorn.io" ) , out : "titan-0b\tFalse\n" } ,
{ match : matchContains ( "kubectl" , "get" , "events" , "-A" , "-o" , "json" ) , out : events } ,
2026-06-18 22:46:02 -03:00
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-o" , "json" ) , out : ` { "items":[ { "metadata": { "name":"titan-0b"},"status": { "conditions":[ { "type":"Ready","status":"True"}]}}]} ` } ,
2026-06-18 22:08:14 -03:00
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "-n" , "monitoring" , "delete" , "pod" , "victoria-metrics-single-server-0" , "--wait=false" ) ( name , args ) {
return false
}
deleted = true
return true
} ,
} ,
} )
if err := orch . recycleStuckControllerPods ( context . Background ( ) ) ; err != nil {
t . Fatalf ( "recycleStuckControllerPods failed: %v" , err )
}
if ! deleted {
t . Fatalf ( "expected longhorn attach-blocked pending pod to be recycled" )
}
}
2026-06-18 22:34:59 -03:00
// TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T).
// Why: encrypted Longhorn PVC recovery should repair missing host cryptsetup and
// then recycle the blocked pod without touching Longhorn data-plane objects.
func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup ( t * testing . T ) {
created := time . Now ( ) . Add ( - 10 * time . Minute ) . UTC ( ) . Format ( time . RFC3339 )
lastSeen := time . Now ( ) . UTC ( ) . Format ( time . RFC3339 )
pods := ` { "items":[ { "metadata": { "namespace":"finance","name":"actual-budget-abc","creationTimestamp":" ` + created + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"actual-budget"}]},"spec": { "nodeName":"titan-19"},"status": { "phase":"Pending"}}]} `
events := ` { "items":[ { "metadata": { "namespace":"finance","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":" ` + lastSeen + ` "}]} `
installed := false
deleted := false
orch := buildOrchestratorWithStubs ( t , config . Config {
Startup : config . Startup { StuckPodGraceSeconds : 180 } ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "pods" , "-A" , "-o" , "json" ) , out : pods } ,
{ match : matchContains ( "kubectl" , "-n" , "longhorn-system" , "get" , "nodes.longhorn.io" ) , out : "titan-19\tTrue\n" } ,
{ match : matchContains ( "kubectl" , "get" , "events" , "-A" , "-o" , "json" ) , out : events } ,
2026-06-18 22:46:02 -03:00
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-o" , "json" ) , out : ` { "items":[ { "metadata": { "name":"titan-19"},"status": { "conditions":[ { "type":"Ready","status":"True"}]}}]} ` } ,
2026-06-18 22:34:59 -03:00
{
match : func ( name string , args [ ] string ) bool {
if name != "ssh" || ! strings . Contains ( strings . Join ( args , " " ) , "apt-get install -y --no-install-recommends cryptsetup-bin" ) {
return false
}
installed = true
return true
} ,
out : "__ANANKE_CRYPTSETUP_INSTALLED__" ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "-n" , "finance" , "delete" , "pod" , "actual-budget-abc" , "--wait=false" ) ( name , args ) {
return false
}
deleted = true
return true
} ,
} ,
} )
if err := orch . recycleStuckControllerPods ( context . Background ( ) ) ; err != nil {
t . Fatalf ( "recycleStuckControllerPods failed: %v" , err )
}
if ! installed {
t . Fatalf ( "expected missing host cryptsetup to be installed" )
}
if ! deleted {
t . Fatalf ( "expected encrypted-volume blocked pod to be recycled" )
}
}
2026-06-18 22:37:54 -03:00
// TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *testing.T).
// Why: when host package repair is blocked by sudo policy, Ananke should avoid
// the bad node and retry the controller-owned pod elsewhere.
func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails ( t * testing . T ) {
created := time . Now ( ) . Add ( - 10 * time . Minute ) . UTC ( ) . Format ( time . RFC3339 )
lastSeen := time . Now ( ) . UTC ( ) . Format ( time . RFC3339 )
pods := ` { "items":[ { "metadata": { "namespace":"finance","name":"actual-budget-abc","creationTimestamp":" ` + created + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"actual-budget"}]},"spec": { "nodeName":"titan-19"},"status": { "phase":"Pending"}}]} `
events := ` { "items":[ { "metadata": { "namespace":"finance","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":" ` + lastSeen + ` "}]} `
cordoned := false
deleted := false
orch := buildOrchestratorWithStubs ( t , config . Config {
Startup : config . Startup { StuckPodGraceSeconds : 180 } ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "pods" , "-A" , "-o" , "json" ) , out : pods } ,
{ match : matchContains ( "kubectl" , "-n" , "longhorn-system" , "get" , "nodes.longhorn.io" ) , out : "titan-19\tTrue\n" } ,
{ match : matchContains ( "kubectl" , "get" , "events" , "-A" , "-o" , "json" ) , out : events } ,
2026-06-18 22:46:02 -03:00
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-o" , "json" ) , out : ` { "items":[ { "metadata": { "name":"titan-19"},"status": { "conditions":[ { "type":"Ready","status":"True"}]}}]} ` } ,
2026-06-18 22:37:54 -03:00
{
match : matchContains ( "ssh" , "apt-get install -y --no-install-recommends cryptsetup-bin" ) ,
err : errors . New ( "sudo: a password is required" ) ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "cordon" , "titan-19" ) ( name , args ) {
return false
}
cordoned = true
return true
} ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "-n" , "finance" , "delete" , "pod" , "actual-budget-abc" , "--wait=false" ) ( name , args ) {
return false
}
deleted = true
return true
} ,
} ,
} )
if err := orch . recycleStuckControllerPods ( context . Background ( ) ) ; err != nil {
t . Fatalf ( "recycleStuckControllerPods failed: %v" , err )
}
if ! cordoned {
t . Fatalf ( "expected cryptsetup-missing node to be cordoned" )
}
if ! deleted {
t . Fatalf ( "expected encrypted-volume blocked pod to be recycled" )
}
}
2026-06-18 22:55:44 -03:00
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
// Why: post-outage controller pods can remain Unknown or Failed after their
2026-06-19 04:15:59 -03:00
// node recovers; deletion clears stale status while force deletion stays away
// from PVC-backed storage.
2026-06-18 22:55:44 -03:00
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes ( t * testing . T ) {
2026-06-18 22:46:02 -03:00
old := time . Now ( ) . Add ( - 10 * time . Minute ) . UTC ( ) . Format ( time . RFC3339 )
recent := time . Now ( ) . Add ( - 30 * time . Second ) . UTC ( ) . Format ( time . RFC3339 )
pods := ` { "items":[ ` +
` { "metadata": { "namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec": { "nodeName":"titan-12"},"status": { "phase":"Unknown"}}, ` +
2026-06-18 23:05:02 -03:00
` { "metadata": { "namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":" ` + old + ` ","deletionTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec": { "nodeName":"titan-12","volumes":[ { "name":"secret"}]},"status": { "phase":"Failed"}}, ` +
2026-06-19 04:15:59 -03:00
` { "metadata": { "namespace":"logging","name":"oauth2-proxy-terminating","creationTimestamp":" ` + old + ` ","deletionTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"oauth2-proxy-logs"}]},"spec": { "nodeName":"titan-18","volumes":[ { "name":"secret"}]},"status": { "phase":"Running"}}, ` +
2026-06-18 23:05:02 -03:00
` { "metadata": { "namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":" ` + old + ` ","deletionTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"pvc-backed"}]},"spec": { "nodeName":"titan-12","volumes":[ { "name":"data","persistentVolumeClaim": { "claimName":"data"}}]},"status": { "phase":"Failed"}}, ` +
2026-06-18 22:46:02 -03:00
` { "metadata": { "namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":" ` + recent + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec": { "nodeName":"titan-12"},"status": { "phase":"Unknown"}}, ` +
` { "metadata": { "namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"maintenance"}]},"spec": { "nodeName":"titan-22"},"status": { "phase":"Unknown"}}, ` +
` { "metadata": { "namespace":"default","name":"bare-pod","creationTimestamp":" ` + old + ` "},"spec": { "nodeName":"titan-12"},"status": { "phase":"Unknown"}}]} `
deleted := [ ] string { }
orch := buildOrchestratorWithStubs ( t , config . Config {
Startup : config . Startup { StuckPodGraceSeconds : 180 } ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "pods" , "-A" , "-o" , "json" ) , out : pods } ,
{ match : matchContains ( "kubectl" , "-n" , "longhorn-system" , "get" , "nodes.longhorn.io" ) , out : "titan-12\tTrue\ntitan-22\tTrue\n" } ,
{ match : matchContains ( "kubectl" , "get" , "events" , "-A" , "-o" , "json" ) , out : ` { "items":[]} ` } ,
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-o" , "json" ) , out : ` { "items":[ { "metadata": { "name":"titan-12"},"status": { "conditions":[ { "type":"Ready","status":"True"}]}}, { "metadata": { "name":"titan-22"},"status": { "conditions":[ { "type":"Ready","status":"False"}]}}]} ` } ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "-n" , "longhorn-system" , "delete" , "pod" , "longhorn-vault-sync-old" , "--wait=false" ) ( name , args ) {
return false
}
deleted = append ( deleted , "longhorn-vault-sync-old" )
return true
} ,
} ,
2026-06-18 22:55:44 -03:00
{
match : func ( name string , args [ ] string ) bool {
2026-06-18 23:05:02 -03:00
if ! matchContains ( "kubectl" , "-n" , "longhorn-system" , "delete" , "pod" , "longhorn-vault-sync-failed" , "--wait=false" , "--grace-period=0" , "--force" ) ( name , args ) {
2026-06-18 22:55:44 -03:00
return false
}
deleted = append ( deleted , "longhorn-vault-sync-failed" )
return true
} ,
} ,
2026-06-19 04:15:59 -03:00
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "-n" , "logging" , "delete" , "pod" , "oauth2-proxy-terminating" , "--wait=false" , "--grace-period=0" , "--force" ) ( name , args ) {
return false
}
deleted = append ( deleted , "oauth2-proxy-terminating" )
return true
} ,
} ,
2026-06-18 23:05:02 -03:00
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "-n" , "longhorn-system" , "delete" , "pod" , "pvc-backed-failed" , "--wait=false" ) ( name , args ) {
return false
}
if strings . Contains ( strings . Join ( args , " " ) , "--force" ) {
t . Fatalf ( "pvc-backed stale pod must not be force deleted" )
}
deleted = append ( deleted , "pvc-backed-failed" )
return true
} ,
} ,
2026-06-18 22:46:02 -03:00
} )
if err := orch . recycleStuckControllerPods ( context . Background ( ) ) ; err != nil {
t . Fatalf ( "recycleStuckControllerPods failed: %v" , err )
}
2026-06-19 04:15:59 -03:00
if strings . Join ( deleted , "," ) != "longhorn-vault-sync-old,longhorn-vault-sync-failed,oauth2-proxy-terminating,pvc-backed-failed" {
2026-06-18 22:55:44 -03:00
t . Fatalf ( "expected only stale controller pods on Ready node to be recycled, got %#v" , deleted )
2026-06-18 22:46:02 -03:00
}
}
2026-06-19 04:25:39 -03:00
// TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T).
// Why: a Ready node with a wedged container runtime can trap replacement pods
// indefinitely; startup should cordon that scheduler target without draining it
// or touching Longhorn data-plane objects.
func TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode ( t * testing . T ) {
old := time . Now ( ) . Add ( - 10 * time . Minute ) . UTC ( ) . Format ( time . RFC3339 )
lastSeen := time . Now ( ) . UTC ( ) . Format ( time . RFC3339 )
pods := ` { "items":[ ` +
` { "metadata": { "namespace":"logging","name":"oauth2-proxy-bad","creationTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"oauth2-proxy"}]},"spec": { "nodeName":"titan-18","volumes":[ { "name":"scratch"}]},"status": { "phase":"Pending","containerStatuses":[ { "name":"oauth2-proxy","state": { "waiting": { "reason":"CreateContainerError"}}}]}}, ` +
` { "metadata": { "namespace":"monitoring","name":"suite-probe-bad","creationTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"Job","name":"suite-probe"}]},"spec": { "nodeName":"titan-18","volumes":[ { "name":"scratch"}]},"status": { "phase":"Pending","containerStatuses":[ { "name":"probe","state": { "waiting": { "reason":"CreateContainerError"}}}]}}, ` +
` { "metadata": { "namespace":"sso","name":"secret-ensure-bad","creationTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"Job","name":"secret-ensure"}]},"spec": { "nodeName":"titan-18","volumes":[ { "name":"scratch"}]},"status": { "phase":"Pending","initContainerStatuses":[ { "name":"init","state": { "waiting": { "reason":"CreateContainerError"}}}]}}, ` +
` { "metadata": { "namespace":"finance","name":"single-node-bad","creationTimestamp":" ` + old + ` ","ownerReferences":[ { "kind":"ReplicaSet","name":"single"}]},"spec": { "nodeName":"titan-19","volumes":[ { "name":"scratch"}]},"status": { "phase":"Pending","containerStatuses":[ { "name":"app","state": { "waiting": { "reason":"CreateContainerError"}}}]}}]} `
events := ` { "items":[ ` +
` { "metadata": { "namespace":"logging","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"logging","name":"oauth2-proxy-bad"},"type":"Warning","reason":"Failed","message":"spec.containers { oauth2-proxy}: Error: failed to reserve container name oauth2-proxy_logging","lastTimestamp":" ` + lastSeen + ` "}, ` +
` { "metadata": { "namespace":"monitoring","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"monitoring","name":"suite-probe-bad"},"type":"Warning","reason":"Failed","message":"spec.containers { probe}: Error: context deadline exceeded","lastTimestamp":" ` + lastSeen + ` "}, ` +
` { "metadata": { "namespace":"sso","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"sso","name":"secret-ensure-bad"},"type":"Warning","reason":"Failed","message":"spec.initContainers { init}: Error: failed to reserve container name init_sso","lastTimestamp":" ` + lastSeen + ` "}, ` +
` { "metadata": { "namespace":"finance","creationTimestamp":" ` + lastSeen + ` "},"involvedObject": { "kind":"Pod","namespace":"finance","name":"single-node-bad"},"type":"Warning","reason":"Failed","message":"spec.containers { app}: Error: failed to reserve container name app_finance","lastTimestamp":" ` + lastSeen + ` "}]} `
cordoned := [ ] string { }
deleted := [ ] string { }
orch := buildOrchestratorWithStubs ( t , config . Config {
Startup : config . Startup { StuckPodGraceSeconds : 180 } ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "pods" , "-A" , "-o" , "json" ) , out : pods } ,
{ match : matchContains ( "kubectl" , "-n" , "longhorn-system" , "get" , "nodes.longhorn.io" ) , out : "" } ,
{ match : matchContains ( "kubectl" , "get" , "events" , "-A" , "-o" , "json" ) , out : events } ,
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-o" , "json" ) , out : ` { "items":[ { "metadata": { "name":"titan-18"},"status": { "conditions":[ { "type":"Ready","status":"True"}]}}, { "metadata": { "name":"titan-19"},"status": { "conditions":[ { "type":"Ready","status":"True"}]}}]} ` } ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "cordon" ) ( name , args ) {
return false
}
cordoned = append ( cordoned , args [ len ( args ) - 1 ] )
return true
} ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "delete" , "pod" , "--wait=false" ) ( name , args ) {
return false
}
joined := strings . Join ( args , " " )
if strings . Contains ( joined , "--force" ) {
t . Fatalf ( "container-runtime wedge recycle must not force-delete fresh pods" )
}
if len ( args ) >= 5 {
deleted = append ( deleted , args [ 4 ] )
}
return true
} ,
} ,
} )
if err := orch . recycleStuckControllerPods ( context . Background ( ) ) ; err != nil {
t . Fatalf ( "recycleStuckControllerPods failed: %v" , err )
}
if strings . Join ( cordoned , "," ) != "titan-18" {
t . Fatalf ( "expected only titan-18 to be cordoned, got %#v" , cordoned )
}
if strings . Join ( deleted , "," ) != "oauth2-proxy-bad,suite-probe-bad,secret-ensure-bad,single-node-bad" {
t . Fatalf ( "expected runtime-wedged pods to be recycled, got %#v" , deleted )
}
}
2026-06-18 22:49:05 -03:00
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
// k3s-agent start, or uncordon operations against intentionally absent hosts.
func TestEffectiveWorkersFiltersIgnoredUnavailableNodes ( t * testing . T ) {
cfg := config . Config {
Workers : [ ] string { " titan-08 " , "titan-09" , "titan-10" , "titan-11" } ,
Startup : config . Startup {
IgnoreUnavailableNodes : [ ] string { "titan-09" , "titan-10" } ,
} ,
}
orch := buildOrchestratorWithStubs ( t , cfg , nil )
got , err := orch . effectiveWorkers ( context . Background ( ) )
if err != nil {
t . Fatalf ( "effectiveWorkers failed: %v" , err )
}
want := [ ] string { "titan-08" , "titan-11" }
if strings . Join ( got , "," ) != strings . Join ( want , "," ) {
t . Fatalf ( "effectiveWorkers mismatch got=%v want=%v" , got , want )
}
}
2026-06-18 23:18:31 -03:00
// TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers runs one orchestration or CLI step.
// Signature: TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers(t *testing.T).
// Why: startup must not uncordon Longhorn workers that cannot mount encrypted
// PVCs; cordoning those nodes is safe and avoids repeating the post-outage
// mount deadlock.
func TestEnsureLonghornEncryptedHostPrereqsFiltersUnsafeWorkers ( t * testing . T ) {
cordoned := [ ] string { }
orch := buildOrchestratorWithStubs ( t , config . Config {
SSHManagedNodes : [ ] string { "titan-04" , "titan-19" } ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-l" , "longhorn-host=true" ) , out : "titan-04\ntitan-19\ntitan-23\n" } ,
{
match : matchContains ( "ssh" , "titan-04" , "command -v cryptsetup" ) ,
out : "__ANANKE_CRYPTSETUP_PRESENT__" ,
} ,
{
match : matchContains ( "ssh" , "titan-19" , "apt-get install -y --no-install-recommends cryptsetup-bin" ) ,
err : errors . New ( "sudo: a password is required" ) ,
} ,
{
match : func ( name string , args [ ] string ) bool {
2026-06-19 04:03:37 -03:00
if name != "kubectl" || len ( args ) == 0 || args [ 0 ] != "cordon" {
2026-06-18 23:18:31 -03:00
return false
}
if len ( args ) > 1 {
cordoned = append ( cordoned , args [ len ( args ) - 1 ] )
}
return true
} ,
} ,
} )
got , err := orch . ensureLonghornEncryptedHostPrereqs ( context . Background ( ) , [ ] string { "titan-04" , "titan-19" , "titan-20" } )
if err != nil {
t . Fatalf ( "ensureLonghornEncryptedHostPrereqs failed: %v" , err )
}
want := [ ] string { "titan-04" , "titan-20" }
if strings . Join ( got , "," ) != strings . Join ( want , "," ) {
t . Fatalf ( "guarded workers mismatch got=%v want=%v" , got , want )
}
if strings . Join ( cordoned , "," ) != "titan-19,titan-23" {
t . Fatalf ( "expected unsafe longhorn hosts to be cordoned, got %v" , cordoned )
}
}
2026-06-19 04:03:37 -03:00
// TestLonghornCryptsetupExemptNodesAreNotQuarantined runs one orchestration or CLI step.
// Signature: TestLonghornCryptsetupExemptNodesAreNotQuarantined(t *testing.T).
// Why: Veles/Oceanus uses titan-23 as a Longhorn host for unencrypted local
// volumes; startup should uncordon that policy-exempt node without requiring
// host SSH or weakening encrypted-volume safety on other workers.
func TestLonghornCryptsetupExemptNodesAreNotQuarantined ( t * testing . T ) {
cordoned := [ ] string { }
uncordoned := [ ] string { }
sshTitan23 := false
orch := buildOrchestratorWithStubs ( t , config . Config {
SSHManagedNodes : [ ] string { "titan-04" } ,
Startup : config . Startup {
LonghornCryptsetupExemptNodes : [ ] string { "titan-23" } ,
} ,
} , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get" , "nodes" , "-l" , "longhorn-host=true" ) , out : "titan-04\ntitan-23\n" } ,
{
match : matchContains ( "ssh" , "titan-04" , "command -v cryptsetup" ) ,
out : "__ANANKE_CRYPTSETUP_PRESENT__" ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if name == "ssh" && strings . Contains ( strings . Join ( args , " " ) , "titan-23" ) {
sshTitan23 = true
return true
}
return false
} ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if name != "kubectl" || len ( args ) == 0 || args [ 0 ] != "cordon" {
return false
}
if len ( args ) > 1 {
cordoned = append ( cordoned , args [ len ( args ) - 1 ] )
}
return true
} ,
} ,
{
match : func ( name string , args [ ] string ) bool {
if ! matchContains ( "kubectl" , "uncordon" ) ( name , args ) {
return false
}
if len ( args ) > 1 {
uncordoned = append ( uncordoned , args [ len ( args ) - 1 ] )
}
return true
} ,
} ,
} )
got , err := orch . ensureLonghornEncryptedHostPrereqs ( context . Background ( ) , [ ] string { "titan-04" } )
if err != nil {
t . Fatalf ( "ensureLonghornEncryptedHostPrereqs failed: %v" , err )
}
if strings . Join ( got , "," ) != "titan-04" {
t . Fatalf ( "guarded workers mismatch got=%v" , got )
}
if err := orch . uncordonLonghornCryptsetupExemptNodes ( context . Background ( ) ) ; err != nil {
t . Fatalf ( "uncordonLonghornCryptsetupExemptNodes failed: %v" , err )
}
if sshTitan23 {
t . Fatalf ( "did not expect cryptsetup SSH check for exempt titan-23" )
}
if len ( cordoned ) != 0 {
t . Fatalf ( "did not expect exempt node to be cordoned, got %v" , cordoned )
}
if strings . Join ( uncordoned , "," ) != "titan-23" {
t . Fatalf ( "expected exempt titan-23 to be uncordoned, got %v" , uncordoned )
}
}
2026-06-18 23:18:31 -03:00
// TestLonghornHostNodesFallsBackToConfiguredLabels checks that when the live
// label query returns no output, longhornHostNodes yields exactly the node
// whose RequiredNodeLabels entry carries longhorn-host=true.
// Signature: TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T).
// Why: bootstrap caches or minimal test clusters can lack live labels; the
// static startup inventory should still protect configured storage workers.
func TestLonghornHostNodesFallsBackToConfiguredLabels(t *testing.T) {
	labels := map[string]map[string]string{
		"titan-04": {"longhorn-host": "true"},
		"titan-20": {"node-role.kubernetes.io/worker": "true"},
	}
	orch := buildOrchestratorWithStubs(t, config.Config{
		Startup: config.Startup{RequiredNodeLabels: labels},
	}, []commandStub{
		{match: matchContains("kubectl", "get", "nodes", "-l", "longhorn-host=true"), out: ""},
	})

	got, err := orch.longhornHostNodes(context.Background())
	if err != nil {
		t.Fatalf("longhornHostNodes failed: %v", err)
	}
	_, found := got["titan-04"]
	if found && len(got) == 1 {
		return
	}
	t.Fatalf("expected configured longhorn host fallback, got %v", got)
}
2026-04-08 23:52:29 -03:00
// TestNewConstructsOrchestrator checks that New returns a non-nil orchestrator
// holding the exact runner and store it was given.
// Signature: TestNewConstructsOrchestrator(t *testing.T).
// Why: covers constructor path in orchestrator core module.
func TestNewConstructsOrchestrator(t *testing.T) {
	historyPath := filepath.Join(t.TempDir(), "runs.json")
	cfg := config.Config{State: config.State{RunHistoryPath: historyPath}}
	runner := &execx.Runner{}
	store := state.New(cfg.State.RunHistoryPath)
	logger := log.New(io.Discard, "", 0)

	orch := New(cfg, runner, store, logger)
	if orch != nil && orch.runner == runner && orch.store == store {
		return
	}
	t.Fatalf("constructor returned unexpected orchestrator: %#v", orch)
}
// TestParseSnapshotPathFromEtcdSnapshotList checks that the parser extracts the
// filesystem path from a file:// location row and returns an empty string for
// output that lists no snapshot.
// Signature: TestParseSnapshotPathFromEtcdSnapshotList(t *testing.T).
// Why: covers snapshot-path parser branches including header skip and no-match.
func TestParseSnapshotPathFromEtcdSnapshotList(t *testing.T) {
	const header = "Name Size Created Location"
	const row = ` pre-shutdown 4.2M now "file:///var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" `
	listing := header + "\n" + row

	if got := parseSnapshotPathFromEtcdSnapshotList(listing); got != "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" {
		t.Fatalf("unexpected snapshot path: %q", got)
	}
	if none := parseSnapshotPathFromEtcdSnapshotList("no snapshots"); none != "" {
		t.Fatalf("expected no snapshot path")
	}
}
// TestFluxSourceHelpers runs one orchestration or CLI step.
// Signature: TestFluxSourceHelpers(t *testing.T).
// Why: covers flux source readiness/guard/branch patch helper flows.
func TestFluxSourceHelpers ( t * testing . T ) {
cfg := config . Config {
ExpectedFluxSource : "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git" ,
}
orch := buildOrchestratorWithStubs ( t , cfg , [ ] commandStub {
{ match : matchContains ( "kubectl" , "jsonpath={.status.conditions" ) , out : "True" } ,
{ match : matchContains ( "kubectl" , "jsonpath={.spec.url}" ) , out : "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git" } ,
{ match : matchContains ( "kubectl" , "jsonpath={.spec.ref.branch}" ) , out : "main" } ,
{ match : matchContains ( "kubectl" , "patch" , "gitrepository" ) , out : "" } ,
} )
ready , err := orch . fluxSourceReady ( context . Background ( ) )
if err != nil || ! ready {
t . Fatalf ( "expected flux source ready, got ready=%v err=%v" , ready , err )
}
if err := orch . guardFluxSourceDrift ( context . Background ( ) , "main" , false ) ; err != nil {
t . Fatalf ( "guardFluxSourceDrift failed: %v" , err )
}
if err := orch . ensureFluxBranch ( context . Background ( ) , "main" , false ) ; err != nil {
t . Fatalf ( "ensureFluxBranch no-op failed: %v" , err )
}
if got := normalizeGitURL ( " SSH://Git@Host/Repo.git/ " ) ; got != "ssh://git@host/repo" {
t . Fatalf ( "unexpected normalized url: %q" , got )
}
}
// TestCoordinationHelpers checks the small coordination utilities.
// Signature: TestCoordinationHelpers(t *testing.T).
// Why: verifies intent age for a past timestamp, freshness of a zero-value intent,
// single-quote escaping in shellQuote, and that coordinationPeers returns two
// entries for a peer list containing a duplicate plus the forward-shutdown host.
func TestCoordinationHelpers(t *testing.T) {
	past := state.Intent{UpdatedAt: time.Now().Add(-10 * time.Second)}
	if age := intentAge(past); age <= 0 {
		t.Fatal("expected positive age")
	}

	var unset state.Intent
	if fresh := intentFresh(unset, time.Second); !fresh {
		t.Fatal("zero timestamp should be fresh")
	}

	if quoted := shellQuote("a'b"); quoted != ` 'a'"'"'b' ` {
		t.Fatal("unexpected shell quote output")
	}

	coordination := config.Coordination{
		PeerHosts:           []string{"titan-24", "titan-24", "titan-db"},
		ForwardShutdownHost: "titan-db",
	}
	orch := buildOrchestratorWithStubs(t, config.Config{Coordination: coordination}, nil)

	peers := orch.coordinationPeers()
	if count := len(peers); count != 2 {
		t.Fatalf("expected deduped peers, got %v", peers)
	}
}
// TestVerifyEtcdSnapshotAndRunSudoK3S runs one orchestration or CLI step.
// Signature: TestVerifyEtcdSnapshotAndRunSudoK3S(t *testing.T).
// Why: covers k3s command fallback and snapshot verification happy path.
func TestVerifyEtcdSnapshotAndRunSudoK3S ( t * testing . T ) {
orch := buildOrchestratorWithStubs ( t , config . Config { } , [ ] commandStub {
{ match : matchContains ( "ssh" , "stat -c %s" ) , out : "2097152" } ,
{ match : matchContains ( "ssh" , "k3s etcd-snapshot ls" ) , out : "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" } ,
{ match : matchContains ( "ssh" , "sha256sum" ) , out : strings . Repeat ( "a" , 64 ) } ,
} )
if err := orch . verifyEtcdSnapshot ( context . Background ( ) , "titan-0a" , "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" ) ; err != nil {
t . Fatalf ( "verifyEtcdSnapshot failed: %v" , err )
}
}
// TestScalingHelpers runs one orchestration or CLI step.
// Signature: TestScalingHelpers(t *testing.T).
// Why: covers workload discovery, snapshot IO, and scale command orchestration.
func TestScalingHelpers ( t * testing . T ) {
cfg := config . Config {
ExcludedNamespaces : [ ] string { "kube-system" } ,
State : config . State { Dir : t . TempDir ( ) } ,
}
orch := buildOrchestratorWithStubs ( t , cfg , [ ] commandStub {
{
match : matchContains ( "kubectl" , "get deployment" , "jsonpath" ) ,
out : strings . Join ( [ ] string {
"default\tgrafana\t1" ,
"kube-system\tcoredns\t2" ,
"" ,
} , "\n" ) ,
} ,
{
match : matchContains ( "kubectl" , "get statefulset" , "jsonpath" ) ,
out : "vault\tvault\t1\n" ,
} ,
{ match : matchContains ( "kubectl" , "scale" , "deployment" , "grafana" ) , out : "" } ,
{ match : matchContains ( "kubectl" , "scale" , "statefulset" , "vault" ) , out : "" } ,
} )
entries , err := orch . listScalableWorkloads ( context . Background ( ) )
if err != nil {
t . Fatalf ( "listScalableWorkloads failed: %v" , err )
}
if len ( entries ) != 2 {
t . Fatalf ( "expected 2 scalable entries, got %d (%v)" , len ( entries ) , entries )
}
if err := orch . writeScaledWorkloadSnapshot ( entries ) ; err != nil {
t . Fatalf ( "writeScaledWorkloadSnapshot failed: %v" , err )
}
snapshot , err := orch . readScaledWorkloadSnapshot ( )
if err != nil || snapshot == nil || len ( snapshot . Entries ) != 2 {
t . Fatalf ( "readScaledWorkloadSnapshot failed snapshot=%v err=%v" , snapshot , err )
}
if err := orch . scaleWorkloads ( context . Background ( ) , entries , - 1 , 2 ) ; err != nil {
t . Fatalf ( "scaleWorkloads failed: %v" , err )
}
}
// TestStorageReadyAndWorkloadHelpers covers the passing path of storageReady.
// Signature: TestStorageReadyAndWorkloadHelpers(t *testing.T).
// Why: with a minimum of one ready storage node, one stubbed longhorn node line
// and the critical PVC reported as Bound, storageReady must return true.
func TestStorageReadyAndWorkloadHelpers(t *testing.T) {
	startup := config.Startup{
		StorageMinReadyNodes: 1,
		StorageCriticalPVCs:  []string{"vault/data-vault-0"},
	}
	stubs := []commandStub{
		{match: matchContains("kubectl", "nodes.longhorn.io"), out: "titan-23:True:True\n"},
		{match: matchContains("kubectl", "get pvc data-vault-0"), out: "Bound"},
	}
	orch := buildOrchestratorWithStubs(t, config.Config{Startup: startup}, stubs)

	ready, reason, err := orch.storageReady(context.Background())
	if !(err == nil && ready) {
		t.Fatalf("expected storageReady true, got ok=%v reason=%q err=%v", ready, reason, err)
	}
}
// TestIngressAndServiceHelpers runs one orchestration or CLI step.
// Signature: TestIngressAndServiceHelpers(t *testing.T).
// Why: covers ingress host discovery helpers and URL parsing helpers.
func TestIngressAndServiceHelpers ( t * testing . T ) {
cfg := config . Config {
Startup : config . Startup {
IngressChecklistIgnoreHosts : [ ] string { "ignore.bstein.dev" } ,
} ,
}
orch := buildOrchestratorWithStubs ( t , cfg , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get ingress" , "-A" , "-o" , "json" ) , out : ` { "items":[ { "metadata": { "namespace":"gitea"},"spec": { "rules":[ { "host":"scm.bstein.dev"}]}}, { "metadata": { "namespace":"x"},"spec": { "rules":[ { "host":"ignore.bstein.dev"}]}}]} ` } ,
} )
hosts , err := orch . discoverIngressHosts ( context . Background ( ) )
if err != nil || len ( hosts ) != 1 || hosts [ 0 ] != "scm.bstein.dev" {
t . Fatalf ( "discoverIngressHosts unexpected hosts=%v err=%v" , hosts , err )
}
if got := hostFromURL ( "https://metrics.bstein.dev/api/health" ) ; got != "metrics.bstein.dev" {
t . Fatalf ( "unexpected hostFromURL value: %q" , got )
}
if ! isLikelyHostname ( "metrics.bstein.dev" ) || isLikelyHostname ( "bad path/value" ) {
t . Fatalf ( "isLikelyHostname classification mismatch" )
}
}
// TestWorkloadConvergenceHelpers checks the pure workload/pod helper functions.
// Signature: TestWorkloadConvergenceHelpers(t *testing.T).
// Why: desiredReady must report 2 desired / 1 ready for the sample deployment,
// a pod with a ReplicaSet owner reference must count as controller-owned, and a
// waiting container whose reason is in the supplied set must be reported as stuck.
func TestWorkloadConvergenceHelpers(t *testing.T) {
	var wantReplicas int32 = 2
	workload := workloadResource{Kind: "deployment"}
	workload.Spec.Replicas = &wantReplicas
	workload.Status.ReadyReplicas = 1

	desired, ready, ok := desiredReady(workload)
	if !(ok && desired == 2 && ready == 1) {
		t.Fatalf("desiredReady mismatch desired=%d ready=%d ok=%v", desired, ready, ok)
	}

	var pod podResource
	pod.Metadata.OwnerReferences = []ownerReference{{Kind: "ReplicaSet"}}
	if owned := podControllerOwned(pod); !owned {
		t.Fatal("expected podControllerOwned=true")
	}

	waiting := &podContainerWaitingState{Reason: "CrashLoopBackOff"}
	pod.Status.ContainerStatuses = []podContainerStatus{
		{State: podContainerState{Waiting: waiting}},
	}
	stuckReasons := map[string]struct{}{"CrashLoopBackOff": {}}
	if reason := stuckContainerReason(pod, stuckReasons); reason != "CrashLoopBackOff" {
		t.Fatalf("unexpected stuck reason: %q", reason)
	}
}
// TestDrainAndK3SHelpers runs one orchestration or CLI step.
// Signature: TestDrainAndK3SHelpers(t *testing.T).
// Why: covers node drain diagnostics and k3s snapshot selection flow.
func TestDrainAndK3SHelpers ( t * testing . T ) {
cfg := config . Config {
SSHManagedNodes : [ ] string { "titan-0a" } ,
}
orch := buildOrchestratorWithStubs ( t , cfg , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get pods" , "--field-selector" , "spec.nodeName=titan-22" ) , out : "vault vault-0 Running StatefulSet\n" } ,
{ match : matchContains ( "ssh" , "k3s etcd-snapshot ls" ) , out : "pre-shutdown /var/lib/rancher/k3s/server/db/snapshots/pre-shutdown" } ,
} )
diag := orch . drainNodeDiagnostics ( context . Background ( ) , "titan-22" )
if ! strings . Contains ( diag , "vault/vault-0" ) {
t . Fatalf ( "unexpected diagnostics output: %q" , diag )
}
snapshot , err := orch . latestEtcdSnapshotPath ( context . Background ( ) , "titan-0a" )
if err != nil || snapshot == "" {
t . Fatalf ( "latestEtcdSnapshotPath failed snapshot=%q err=%v" , snapshot , err )
}
}
// TestTimesyncAndInventoryHelpers runs one orchestration or CLI step.
// Signature: TestTimesyncAndInventoryHelpers(t *testing.T).
// Why: covers time sync helpers, datastore endpoint parsing, and inventory assembly.
func TestTimesyncAndInventoryHelpers ( t * testing . T ) {
cfg := config . Config {
ControlPlanes : [ ] string { "titan-0a" } ,
Workers : [ ] string { "titan-22" } ,
SSHManagedNodes : [ ] string { "titan-0a" , "titan-22" } ,
SSHNodeHosts : map [ string ] string {
"titan-db" : "10.0.0.10" ,
} ,
Coordination : config . Coordination {
PeerHosts : [ ] string { "titan-24" } ,
ForwardShutdownHost : "titan-db" ,
} ,
}
orch := buildOrchestratorWithStubs ( t , cfg , nil )
nodes := orch . inventoryNodesForValidation ( )
if len ( nodes ) < 3 {
t . Fatalf ( "expected combined inventory nodes, got %v" , nodes )
}
if parseDatastoreEndpoint ( "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://x" ) == "" {
t . Fatalf ( "expected datastore endpoint parse" )
}
if ! isTimeSynced ( "YES" ) || isTimeSynced ( "no" ) {
t . Fatalf ( "unexpected isTimeSynced behavior" )
}
ln , err := net . Listen ( "tcp" , "127.0.0.1:0" )
if err != nil {
t . Fatalf ( "listen failed: %v" , err )
}
defer ln . Close ( )
if ! orch . tcpReachable ( ln . Addr ( ) . String ( ) , 500 * time . Millisecond ) {
t . Fatalf ( "expected tcpReachable=true for open listener" )
}
}
// TestShutdownModeValidation checks shutdown-mode normalization.
// Signature: TestShutdownModeValidation(t *testing.T).
// Why: "cluster-only" must be accepted and returned unchanged, while an
// unrecognized mode string must produce an error.
func TestShutdownModeValidation(t *testing.T) {
	mode, err := normalizeShutdownMode("cluster-only")
	if !(err == nil && mode == "cluster-only") {
		t.Fatalf("expected cluster-only mode, got mode=%q err=%v", mode, err)
	}

	_, err = normalizeShutdownMode("bogus")
	if err == nil {
		t.Fatal("expected invalid mode error")
	}
}
// TestWaitForAPIDryRunShortCircuit runs one orchestration or CLI step.
// Signature: TestWaitForAPIDryRunShortCircuit(t *testing.T).
// Why: covers dry-run short-circuit branch for api readiness wait.
func TestWaitForAPIDryRunShortCircuit ( t * testing . T ) {
orch := & Orchestrator { runner : & execx . Runner { DryRun : true } }
if err := orch . waitForAPI ( context . Background ( ) , 1 , time . Millisecond ) ; err != nil {
t . Fatalf ( "expected dry-run waitForAPI to pass: %v" , err )
}
}
// TestGuardFluxSourceDriftMismatch checks that guardFluxSourceDrift rejects drift.
// Signature: TestGuardFluxSourceDriftMismatch(t *testing.T).
// Why: the guard must return an error both when the stubbed source URL differs
// from the expected one and when the URL matches but the stubbed branch differs
// from the requested "main".
func TestGuardFluxSourceDriftMismatch(t *testing.T) {
	cfg := config.Config{ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"}

	// Each scenario gets a fresh orchestrator; the first failure aborts the test.
	scenarios := []struct {
		stubs   []commandStub
		failure string
	}{
		{
			stubs: []commandStub{
				{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/wrong.git"},
			},
			failure: "expected guardFluxSourceDrift mismatch error",
		},
		{
			stubs: []commandStub{
				{match: matchContains("kubectl", "jsonpath={.spec.url}"), out: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"},
				{match: matchContains("kubectl", "jsonpath={.spec.ref.branch}"), out: "atlasbot"},
			},
			failure: "expected branch drift error",
		},
	}

	for _, sc := range scenarios {
		orch := buildOrchestratorWithStubs(t, cfg, sc.stubs)
		if err := orch.guardFluxSourceDrift(context.Background(), "main", false); err == nil {
			t.Fatal(sc.failure)
		}
	}
}
// TestRunSudoK3SFailsWhenAllCandidatesFail runs one orchestration or CLI step.
// Signature: TestRunSudoK3SFailsWhenAllCandidatesFail(t *testing.T).
// Why: covers fallback failure return in runSudoK3S.
func TestRunSudoK3SFailsWhenAllCandidatesFail ( t * testing . T ) {
orch := buildOrchestratorWithStubs ( t , config . Config { } , [ ] commandStub {
{ match : matchContains ( "ssh" , "k3s" ) , err : errors . New ( "no binary" ) } ,
} )
if _ , err := orch . runSudoK3S ( context . Background ( ) , "titan-0a" , "server" ) ; err == nil {
t . Fatalf ( "expected runSudoK3S failure when all candidates fail" )
}
}
// TestCriticalEndpointHelpers runs one orchestration or CLI step.
// Signature: TestCriticalEndpointHelpers(t *testing.T).
// Why: covers critical endpoint parsing and readiness checks that gate startup completion.
func TestCriticalEndpointHelpers ( t * testing . T ) {
cfg := config . Config {
Startup : config . Startup {
CriticalServiceEndpoints : [ ] string { "monitoring/victoria-metrics-single-server" } ,
} ,
}
orch := buildOrchestratorWithStubs ( t , cfg , [ ] commandStub {
{ match : matchContains ( "kubectl" , "get endpoints victoria-metrics-single-server" ) , out : "10.42.0.10\n10.42.0.11\n" } ,
} )
ok , detail , ns , svc , err := orch . criticalServiceEndpointsReady ( context . Background ( ) )
if err != nil || ! ok {
t . Fatalf ( "expected criticalServiceEndpointsReady success, got ok=%v detail=%q ns=%q svc=%q err=%v" , ok , detail , ns , svc , err )
}
if detail != "services=1" {
t . Fatalf ( "unexpected readiness detail: %q" , detail )
}
gotNS , gotSvc , err := parseCriticalServiceEndpoint ( "monitoring/victoria-metrics-single-server" )
if err != nil || gotNS != "monitoring" || gotSvc != "victoria-metrics-single-server" {
t . Fatalf ( "unexpected parse result ns=%q svc=%q err=%v" , gotNS , gotSvc , err )
}
if _ , _ , err := parseCriticalServiceEndpoint ( "invalid" ) ; err == nil {
t . Fatalf ( "expected parseCriticalServiceEndpoint error" )
}
}
// TestCriticalEndpointAutoHealWorkflow runs the critical-endpoint wait against a
// fake command dispatcher whose endpoint lookup is empty once, then populated.
// Signature: TestCriticalEndpointAutoHealWorkflow(t *testing.T).
// Why: the wait must finish without error and must have queried the endpoints at
// least twice. The fake also reports the deployment as NotFound while accepting
// statefulset scale and rollout commands.
func TestCriticalEndpointAutoHealWorkflow(t *testing.T) {
	cfg := config.Config{
		Startup: config.Startup{
			CriticalServiceEndpointWaitSec: 2,
			CriticalServiceEndpointPollSec: 1,
			CriticalServiceEndpoints:       []string{"monitoring/victoria-metrics-single-server"},
		},
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(cfg.State.RunHistoryPath),
		log:    log.New(io.Discard, "", 0),
	}

	var endpointProbes int
	fake := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		cmdline := name + " " + strings.Join(args, " ")
		switch {
		case strings.Contains(cmdline, "get endpoints victoria-metrics-single-server"):
			endpointProbes++
			// Only the very first lookup comes back empty.
			if endpointProbes > 1 {
				return "10.42.0.10\n", nil
			}
			return "", nil
		case strings.Contains(cmdline, "scale deployment victoria-metrics-single-server"):
			return "", errors.New(` Error from server (NotFound): deployments.apps "victoria-metrics-single-server" not found `)
		case strings.Contains(cmdline, "scale statefulset victoria-metrics-single-server"):
			return "", nil
		case strings.Contains(cmdline, "rollout status statefulset/victoria-metrics-single-server"):
			return "statefulset rolled out", nil
		default:
			return "", nil
		}
	}
	orch.runOverride, orch.runSensitiveOverride = fake, fake

	err := orch.waitForCriticalServiceEndpoints(context.Background())
	if err != nil {
		t.Fatalf("waitForCriticalServiceEndpoints failed: %v", err)
	}
	if endpointProbes < 2 {
		t.Fatalf("expected repeated endpoint checks, got %d", endpointProbes)
	}
}