soteria/internal/k8s/jobs_test.go

557 lines
21 KiB
Go
Raw Permalink Normal View History

package k8s
import (
"context"
"errors"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/soteria/internal/api"
"scm.bstein.dev/bstein/soteria/internal/config"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
k8sfake "k8s.io/client-go/kubernetes/fake"
k8stesting "k8s.io/client-go/testing"
)
// TestListBackupJobsAndListBackupJobsForPVCCoverFilteringAndSorting seeds a fake
// clientset with four soteria backup jobs in namespace "apps" and verifies the
// ordering, state summary and annotation-derived fields reported by
// ListBackupJobs, the PVC filter applied by ListBackupJobsForPVC, and that a
// failing job list call is returned as an error by both methods.
func TestListBackupJobsAndListBackupJobsForPVCCoverFilteringAndSorting(t *testing.T) {
	ctx := context.Background()
	now := time.Now().UTC()
	recent := metav1.NewTime(now.Add(-1 * time.Hour))
	old := metav1.NewTime(now.Add(-3 * time.Hour))

	// backupLabels returns the labels common to every fixture job; the PVC
	// label is attached only when pvc is non-empty.
	backupLabels := func(pvc string) map[string]string {
		labels := map[string]string{
			labelAppName:   "soteria",
			labelComponent: "backup",
			labelAction:    "backup",
		}
		if pvc != "" {
			labels[labelPVC] = pvc
		}
		return labels
	}

	client := &Client{Clientset: k8sfake.NewSimpleClientset(
		// Oldest job for PVC "data": succeeded, dedupe off, keep-last 3.
		&batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "backup-zeta",
				Namespace: "apps",
				Labels:    backupLabels("data"),
				Annotations: map[string]string{
					annotationResticRepository: "s3:https://repo/data",
					annotationDedupeEnabled:    "false",
					annotationKeepLast:         "3",
				},
				CreationTimestamp: old,
			},
			Status: batchv1.JobStatus{Succeeded: 1, CompletionTime: &old},
		},
		// Recent job for PVC "data": still active, dedupe on, keep-last 5.
		&batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "backup-alpha",
				Namespace: "apps",
				Labels:    backupLabels("data"),
				Annotations: map[string]string{
					annotationResticRepository: "s3:https://repo/data",
					annotationDedupeEnabled:    "true",
					annotationKeepLast:         "5",
				},
				CreationTimestamp: recent,
			},
			Status: batchv1.JobStatus{Active: 1},
		},
		// Recent failed job for PVC "cache", carrying no annotations.
		&batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "backup-other",
				Namespace:         "apps",
				Labels:            backupLabels("cache"),
				CreationTimestamp: recent,
			},
			Status: batchv1.JobStatus{Failed: 1},
		},
		// Job without a PVC label; the length check below expects it to be
		// absent from the listing.
		&batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "skip-missing-pvc",
				Namespace:         "apps",
				Labels:            backupLabels(""),
				CreationTimestamp: recent,
			},
		},
	)}

	// Namespace-wide listing.
	items, err := client.ListBackupJobs(ctx, "apps")
	if err != nil {
		t.Fatalf("list backup jobs: %v", err)
	}
	if len(items) != 3 {
		t.Fatalf("expected three pvc-tagged backup jobs, got %#v", items)
	}
	if items[0].Name != "backup-other" || items[1].Name != "backup-alpha" || items[2].Name != "backup-zeta" {
		t.Fatalf("expected sorted backup jobs, got %#v", items)
	}
	if items[0].State != "Failed" || items[1].State != "Running" || items[2].State != "Completed" {
		t.Fatalf("expected summarized backup states, got %#v", items)
	}
	if !items[1].DedupeEnabled || items[2].DedupeEnabled || items[2].KeepLast != 3 {
		t.Fatalf("expected annotation-derived summary fields, got %#v", items)
	}

	// Listing restricted to PVC "data".
	items, err = client.ListBackupJobsForPVC(ctx, "apps", "data")
	if err != nil {
		t.Fatalf("list backup jobs for pvc: %v", err)
	}
	if len(items) != 2 || items[0].Name != "backup-alpha" || items[1].Name != "backup-zeta" {
		t.Fatalf("expected filtered pvc job list, got %#v", items)
	}

	// Every job list call fails; both methods must surface the error text.
	failingClientset := k8sfake.NewSimpleClientset()
	failingClientset.PrependReactor("list", "jobs", func(k8stesting.Action) (bool, runtime.Object, error) {
		return true, nil, errors.New("list jobs exploded")
	})
	failingClient := &Client{Clientset: failingClientset}
	if _, err := failingClient.ListBackupJobs(ctx, "apps"); err == nil || !strings.Contains(err.Error(), "list jobs exploded") {
		t.Fatalf("expected list backup jobs error, got %v", err)
	}
	if _, err := failingClient.ListBackupJobsForPVC(ctx, "apps", "data"); err == nil || !strings.Contains(err.Error(), "list jobs exploded") {
		t.Fatalf("expected pvc-scoped list backup jobs error, got %v", err)
	}
}
// TestResolvePVCMountedNodeIgnoresDeadPodsAndFindsMountedClaim checks that
// resolvePVCMountedNode reports node "titan-02" for PVC "data" given a mix of
// pods, returns an empty node without error for a claim no pod mounts, and
// surfaces a failing pod list call as an error.
func TestResolvePVCMountedNodeIgnoresDeadPodsAndFindsMountedClaim(t *testing.T) {
	ctx := context.Background()
	deletedAt := metav1.NewTime(time.Now().UTC())

	// claimVolume builds a volume named after the claim it references.
	claimVolume := func(claim string) corev1.Volume {
		return corev1.Volume{
			Name: claim,
			VolumeSource: corev1.VolumeSource{
				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: claim},
			},
		}
	}

	fixtures := []runtime.Object{
		// Pod carrying a deletion timestamp; scheduled but mounts nothing.
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "deleted", Namespace: "apps", DeletionTimestamp: &deletedAt},
			Spec:       corev1.PodSpec{NodeName: "titan-00"},
		},
		// Pod that mounts the claim but has no node assigned.
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "no-node", Namespace: "apps"},
			Spec: corev1.PodSpec{
				Volumes: []corev1.Volume{claimVolume("data")},
			},
		},
		// Pod that mounts the claim but already reached the Succeeded phase.
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "finished", Namespace: "apps"},
			Spec: corev1.PodSpec{
				NodeName: "titan-01",
				Volumes:  []corev1.Volume{claimVolume("data")},
			},
			Status: corev1.PodStatus{Phase: corev1.PodSucceeded},
		},
		// Running pod that mounts the claim as its second volume.
		&corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "active", Namespace: "apps"},
			Spec: corev1.PodSpec{
				NodeName: "titan-02",
				Volumes:  []corev1.Volume{claimVolume("cache"), claimVolume("data")},
			},
			Status: corev1.PodStatus{Phase: corev1.PodRunning},
		},
	}
	seeded := &Client{Clientset: k8sfake.NewSimpleClientset(fixtures...)}

	node, err := seeded.resolvePVCMountedNode(ctx, "apps", "data")
	if err != nil || node != "titan-02" {
		t.Fatalf("expected mounted pvc node, got %q %v", node, err)
	}

	node, err = seeded.resolvePVCMountedNode(ctx, "apps", "missing")
	if err != nil || node != "" {
		t.Fatalf("expected missing pvc mount to return empty node, got %q %v", node, err)
	}

	// Every pod list call fails; the error text must reach the caller.
	brokenClientset := k8sfake.NewSimpleClientset()
	brokenClientset.PrependReactor("list", "pods", func(k8stesting.Action) (bool, runtime.Object, error) {
		return true, nil, errors.New("resolve pods exploded")
	})
	broken := &Client{Clientset: brokenClientset}
	if _, err := broken.resolvePVCMountedNode(ctx, "apps", "data"); err == nil || !strings.Contains(err.Error(), "resolve pods exploded") {
		t.Fatalf("expected pod list error, got %v", err)
	}
}
func TestReadBackupJobLogCoversSuccessAndListFailures(t *testing.T) {
client := &Client{Clientset: k8sfake.NewSimpleClientset(
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "job-pod-old",
Namespace: "apps",
CreationTimestamp: metav1.NewTime(time.Now().UTC().Add(-2 * time.Hour)),
Labels: map[string]string{"job-name": "backup-job"},
},
Status: corev1.PodStatus{StartTime: ptrTime(metav1.NewTime(time.Now().UTC().Add(-90 * time.Minute)))},
},
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "job-pod-new",
Namespace: "apps",
CreationTimestamp: metav1.NewTime(time.Now().UTC().Add(-1 * time.Hour)),
Labels: map[string]string{"job-name": "backup-job"},
},
Status: corev1.PodStatus{StartTime: ptrTime(metav1.NewTime(time.Now().UTC().Add(-30 * time.Minute)))},
},
)}
logs, err := client.ReadBackupJobLog(context.Background(), "apps", "backup-job")
if err != nil || logs != "fake logs" {
t.Fatalf("expected fake pod logs response, got %q %v", logs, err)
}
emptyClient := &Client{Clientset: k8sfake.NewSimpleClientset()}
if _, err := emptyClient.ReadBackupJobLog(context.Background(), "apps", "backup-job"); err == nil || !strings.Contains(err.Error(), "no pod found") {
t.Fatalf("expected missing pod error, got %v", err)
}
clientset := k8sfake.NewSimpleClientset()
clientset.PrependReactor("list", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, nil, errors.New("list pods exploded")
})
listFailClient := &Client{Clientset: clientset}
if _, err := listFailClient.ReadBackupJobLog(context.Background(), "apps", "backup-job"); err == nil || !strings.Contains(err.Error(), "list pods exploded") {
t.Fatalf("expected wrapped pod list error, got %v", err)
}
}
func TestLatestBackupJobPodNameCoversSortBranches(t *testing.T) {
now := time.Now().UTC()
t.Run("prefers latest start time and name tiebreak", func(t *testing.T) {
name := latestBackupJobPodName([]corev1.Pod{
{
ObjectMeta: metav1.ObjectMeta{Name: "job-a", CreationTimestamp: metav1.NewTime(now.Add(-10 * time.Minute))},
Status: corev1.PodStatus{StartTime: ptrTime(metav1.NewTime(now.Add(-1 * time.Minute)))},
},
{
ObjectMeta: metav1.ObjectMeta{Name: "job-z", CreationTimestamp: metav1.NewTime(now.Add(-20 * time.Minute))},
Status: corev1.PodStatus{StartTime: ptrTime(metav1.NewTime(now.Add(-1 * time.Minute)))},
},
})
if name != "job-z" {
t.Fatalf("expected name tiebreak to pick lexically later pod, got %q", name)
}
})
t.Run("prefers started pod over nil start time", func(t *testing.T) {
name := latestBackupJobPodName([]corev1.Pod{
{ObjectMeta: metav1.ObjectMeta{Name: "started"}, Status: corev1.PodStatus{StartTime: ptrTime(metav1.NewTime(now))}},
{ObjectMeta: metav1.ObjectMeta{Name: "not-started"}},
})
if name != "started" {
t.Fatalf("expected started pod to win, got %q", name)
}
})
t.Run("falls back to creation time when start times are nil", func(t *testing.T) {
name := latestBackupJobPodName([]corev1.Pod{
{ObjectMeta: metav1.ObjectMeta{Name: "older", CreationTimestamp: metav1.NewTime(now.Add(-30 * time.Minute))}},
{ObjectMeta: metav1.ObjectMeta{Name: "newer", CreationTimestamp: metav1.NewTime(now.Add(-5 * time.Minute))}},
})
if name != "newer" {
t.Fatalf("expected newer creation timestamp to win, got %q", name)
}
})
}
func TestCreateBackupJobCoversValidationDryRunAndLiveCreation(t *testing.T) {
clientset := k8sfake.NewSimpleClientset(
&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "restic-src", Namespace: "shared"},
Type: corev1.SecretTypeOpaque,
Data: map[string][]byte{
"AWS_ACCESS_KEY_ID": []byte("abc"),
"AWS_SECRET_ACCESS_KEY": []byte("def"),
"RESTIC_PASSWORD": []byte("ghi"),
},
},
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "app-pod", Namespace: "apps"},
Spec: corev1.PodSpec{
NodeName: "titan-02",
Volumes: []corev1.Volume{{Name: "data", VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "data"},
}}},
},
Status: corev1.PodStatus{Phase: corev1.PodRunning},
},
)
client := &Client{Clientset: clientset}
cfg := &config.Config{
SecretNamespace: "shared",
ResticSecretName: "restic-src",
ResticRepository: "s3:https://repo/root",
ResticImage: "restic/restic:latest",
JobTTLSeconds: 3600,
WorkerServiceAccount: "soteria-sa",
}
if _, _, err := client.CreateBackupJob(context.Background(), cfg, api.BackupRequest{PVC: "data"}); err == nil {
t.Fatalf("expected missing namespace validation error")
}
if _, _, err := client.CreateBackupJob(context.Background(), cfg, api.BackupRequest{Namespace: "apps"}); err == nil {
t.Fatalf("expected missing pvc validation error")
}
if _, _, err := client.CreateBackupJob(context.Background(), cfg, api.BackupRequest{Namespace: "apps", PVC: "data", Snapshot: true}); err == nil {
t.Fatalf("expected unsupported snapshot error")
}
jobName, secretName, err := client.CreateBackupJob(context.Background(), cfg, api.BackupRequest{
Namespace: "apps",
PVC: "data",
DryRun: true,
})
if err != nil || jobName == "" || secretName == "" {
t.Fatalf("expected dry-run names, got job=%q secret=%q err=%v", jobName, secretName, err)
}
if _, err := client.Clientset.BatchV1().Jobs("apps").Get(context.Background(), jobName, metav1.GetOptions{}); err == nil {
t.Fatalf("expected dry-run to skip job creation")
}
dedupe := false
keepLast := 3
jobName, secretName, err = client.CreateBackupJob(context.Background(), cfg, api.BackupRequest{
Namespace: "apps",
PVC: "data",
Dedupe: &dedupe,
KeepLast: &keepLast,
})
if err != nil {
t.Fatalf("create live backup job: %v", err)
}
job, err := client.Clientset.BatchV1().Jobs("apps").Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil {
t.Fatalf("get created backup job: %v", err)
}
if job.Spec.Template.Spec.NodeName != "titan-02" || job.Spec.Template.Spec.ServiceAccountName != "soteria-sa" {
t.Fatalf("expected node pin + service account, got %#v", job.Spec.Template.Spec)
}
if job.Annotations[annotationDedupeEnabled] != "false" || job.Annotations[annotationKeepLast] != "3" {
t.Fatalf("expected backup annotations, got %#v", job.Annotations)
}
secret, err := client.Clientset.CoreV1().Secrets("apps").Get(context.Background(), secretName, metav1.GetOptions{})
if err != nil {
t.Fatalf("get copied backup secret: %v", err)
}
if len(secret.OwnerReferences) != 1 || secret.OwnerReferences[0].Name != job.Name || secret.OwnerReferences[0].Kind != "Job" {
t.Fatalf("expected job owner reference on copied secret, got %#v", secret.OwnerReferences)
}
}
func TestCreateBackupJobCleansUpSecretOnJobCreateFailureAndSurfacesBindFailure(t *testing.T) {
clientset := k8sfake.NewSimpleClientset(
&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "restic-src", Namespace: "shared"},
Type: corev1.SecretTypeOpaque,
Data: map[string][]byte{
"AWS_ACCESS_KEY_ID": []byte("abc"),
"AWS_SECRET_ACCESS_KEY": []byte("def"),
"RESTIC_PASSWORD": []byte("ghi"),
},
},
)
cfg := &config.Config{
SecretNamespace: "shared",
ResticSecretName: "restic-src",
ResticRepository: "s3:https://repo/root",
ResticImage: "restic/restic:latest",
JobTTLSeconds: 3600,
}
clientset.PrependReactor("create", "jobs", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, nil, errors.New("create job exploded")
})
client := &Client{Clientset: clientset}
if _, secretName, err := client.CreateBackupJob(context.Background(), cfg, api.BackupRequest{
Namespace: "apps",
PVC: "data",
}); err == nil || !strings.Contains(err.Error(), "create job exploded") {
t.Fatalf("expected job create error, got secret=%q err=%v", secretName, err)
}
secrets, err := client.Clientset.CoreV1().Secrets("apps").List(context.Background(), metav1.ListOptions{})
if err != nil {
t.Fatalf("list secrets after failed backup create: %v", err)
}
if len(secrets.Items) != 0 {
t.Fatalf("expected copied secret cleanup on create failure, got %#v", secrets.Items)
}
bindFailClientset := k8sfake.NewSimpleClientset(
&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "restic-src", Namespace: "shared"},
Type: corev1.SecretTypeOpaque,
Data: map[string][]byte{
"AWS_ACCESS_KEY_ID": []byte("abc"),
"AWS_SECRET_ACCESS_KEY": []byte("def"),
"RESTIC_PASSWORD": []byte("ghi"),
},
},
)
bindFailClientset.PrependReactor("update", "secrets", func(action k8stesting.Action) (bool, runtime.Object, error) {
update := action.(k8stesting.UpdateAction)
secret := update.GetObject().(*corev1.Secret)
if secret.Namespace == "apps" {
return true, nil, errors.New("bind secret exploded")
}
return false, nil, nil
})
bindFailClient := &Client{Clientset: bindFailClientset}
jobName, secretName, err := bindFailClient.CreateBackupJob(context.Background(), cfg, api.BackupRequest{
Namespace: "apps",
PVC: "data",
})
if err == nil || !strings.Contains(err.Error(), "bind secret exploded") || jobName == "" || secretName == "" {
t.Fatalf("expected bind failure after backup job create, got job=%q secret=%q err=%v", jobName, secretName, err)
}
}
// TestCreateRestoreJobCoversValidationDryRunAndLiveCreation drives
// CreateRestoreJob through a rejected empty request, a dry run, and a real
// creation against a fake clientset that holds the source restic secret in
// "shared". For the live job it checks the PVC label, the mounted claim, the
// restore command and the owner reference on the copied secret.
func TestCreateRestoreJobCoversValidationDryRunAndLiveCreation(t *testing.T) {
	ctx := context.Background()
	clientset := k8sfake.NewSimpleClientset(
		&corev1.Secret{
			ObjectMeta: metav1.ObjectMeta{Name: "restic-src", Namespace: "shared"},
			Type:       corev1.SecretTypeOpaque,
			Data: map[string][]byte{
				"AWS_ACCESS_KEY_ID":     []byte("abc"),
				"AWS_SECRET_ACCESS_KEY": []byte("def"),
				"RESTIC_PASSWORD":       []byte("ghi"),
			},
		},
	)
	client := &Client{Clientset: clientset}
	cfg := &config.Config{
		SecretNamespace:      "shared",
		ResticSecretName:     "restic-src",
		ResticRepository:     "s3:https://repo/root",
		ResticImage:          "restic/restic:latest",
		JobTTLSeconds:        3600,
		WorkerServiceAccount: "soteria-sa",
	}

	// An empty request must be rejected.
	if _, _, err := client.CreateRestoreJob(ctx, cfg, api.RestoreTestRequest{}); err == nil {
		t.Fatalf("expected missing namespace validation error")
	}

	// Dry run still reports the names it would use.
	jobName, secretName, err := client.CreateRestoreJob(ctx, cfg, api.RestoreTestRequest{
		Namespace: "apps",
		DryRun:    true,
	})
	if err != nil || jobName == "" || secretName == "" {
		t.Fatalf("expected restore dry-run names, got job=%q secret=%q err=%v", jobName, secretName, err)
	}

	// Live creation targeting an explicit PVC and snapshot.
	jobName, secretName, err = client.CreateRestoreJob(ctx, cfg, api.RestoreTestRequest{
		Namespace: "apps",
		TargetPVC: "restore-data",
		Snapshot:  "snap-123",
	})
	if err != nil {
		t.Fatalf("create live restore job: %v", err)
	}
	job, err := client.Clientset.BatchV1().Jobs("apps").Get(ctx, jobName, metav1.GetOptions{})
	if err != nil {
		t.Fatalf("get created restore job: %v", err)
	}
	if job.Labels[labelPVC] != "restore-data" {
		t.Fatalf("expected target pvc label on restore job, got %#v", job.Labels)
	}

	// Guard every index below: an out-of-range panic would abort the whole
	// test binary instead of failing this test with a readable message.
	podSpec := job.Spec.Template.Spec
	if len(podSpec.Volumes) == 0 {
		t.Fatalf("expected restore pvc volume, got %#v", podSpec.Volumes)
	}
	claim := podSpec.Volumes[0].PersistentVolumeClaim
	if claim == nil || claim.ClaimName != "restore-data" {
		t.Fatalf("expected restore pvc volume, got %#v", podSpec.Volumes)
	}
	if len(podSpec.Containers) == 0 || len(podSpec.Containers[0].Args) == 0 {
		t.Fatalf("expected restore container with command args, got %#v", podSpec.Containers)
	}
	args := podSpec.Containers[0].Args
	if !strings.Contains(args[0], "restic restore snap-123") {
		t.Fatalf("expected restore command to include snapshot, got %#v", args)
	}

	// The copied secret must be owned by the created job.
	secret, err := client.Clientset.CoreV1().Secrets("apps").Get(ctx, secretName, metav1.GetOptions{})
	if err != nil {
		t.Fatalf("get copied restore secret: %v", err)
	}
	if len(secret.OwnerReferences) != 1 || secret.OwnerReferences[0].Name != job.Name {
		t.Fatalf("expected restore job owner reference on copied secret, got %#v", secret.OwnerReferences)
	}
}
// TestCreateRestoreJobCleansUpSecretOnJobCreateFailureAndSurfacesBindFailure
// covers two failure paths of CreateRestoreJob. When creating the job fails,
// no secret may remain in the target namespace. When updating the copied
// secret fails, the error must be returned together with non-empty job and
// secret names.
func TestCreateRestoreJobCleansUpSecretOnJobCreateFailureAndSurfacesBindFailure(t *testing.T) {
	ctx := context.Background()
	cfg := &config.Config{
		SecretNamespace:  "shared",
		ResticSecretName: "restic-src",
		ResticRepository: "s3:https://repo/root",
		ResticImage:      "restic/restic:latest",
		JobTTLSeconds:    3600,
	}
	request := api.RestoreTestRequest{Namespace: "apps"}

	// sourceSecret returns a fresh copy of the restic secret living in "shared".
	sourceSecret := func() *corev1.Secret {
		return &corev1.Secret{
			ObjectMeta: metav1.ObjectMeta{Name: "restic-src", Namespace: "shared"},
			Type:       corev1.SecretTypeOpaque,
			Data: map[string][]byte{
				"AWS_ACCESS_KEY_ID":     []byte("abc"),
				"AWS_SECRET_ACCESS_KEY": []byte("def"),
				"RESTIC_PASSWORD":       []byte("ghi"),
			},
		}
	}

	// Scenario 1: every job create call fails.
	createFailClientset := k8sfake.NewSimpleClientset(sourceSecret())
	createFailClientset.PrependReactor("create", "jobs", func(k8stesting.Action) (bool, runtime.Object, error) {
		return true, nil, errors.New("create restore job exploded")
	})
	createFailClient := &Client{Clientset: createFailClientset}

	if _, _, createErr := createFailClient.CreateRestoreJob(ctx, cfg, request); createErr == nil || !strings.Contains(createErr.Error(), "create restore job exploded") {
		t.Fatalf("expected restore job create error, got %v", createErr)
	}
	remaining, err := createFailClient.Clientset.CoreV1().Secrets("apps").List(ctx, metav1.ListOptions{})
	if err != nil {
		t.Fatalf("list secrets after failed restore create: %v", err)
	}
	if len(remaining.Items) != 0 {
		t.Fatalf("expected copied secret cleanup on restore create failure, got %#v", remaining.Items)
	}

	// Scenario 2: updating a secret in "apps" fails; other updates pass through.
	bindFailClientset := k8sfake.NewSimpleClientset(sourceSecret())
	bindFailClientset.PrependReactor("update", "secrets", func(action k8stesting.Action) (bool, runtime.Object, error) {
		updated := action.(k8stesting.UpdateAction).GetObject().(*corev1.Secret)
		if updated.Namespace != "apps" {
			return false, nil, nil
		}
		return true, nil, errors.New("bind restore secret exploded")
	})
	bindFailClient := &Client{Clientset: bindFailClientset}

	jobName, secretName, err := bindFailClient.CreateRestoreJob(ctx, cfg, request)
	if err == nil || !strings.Contains(err.Error(), "bind restore secret exploded") || jobName == "" || secretName == "" {
		t.Fatalf("expected restore bind failure after job create, got job=%q secret=%q err=%v", jobName, secretName, err)
	}
}
// ptrTime returns a pointer to a fresh copy of value, for populating optional
// *metav1.Time fields in fixtures.
func ptrTime(value metav1.Time) *metav1.Time {
	boxed := new(metav1.Time)
	*boxed = value
	return boxed
}