From bff6b83d1189153ad190f5912bc6260940ffed2b Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Thu, 9 Oct 2025 18:28:20 -0500
Subject: [PATCH] gpu(titan-24): add RuntimeClass + NVIDIA device-plugin DS; enable containerd nvidia runtime

---
Reviewer notes (this section is ignored by `git am`):

* What the patch does: adds a DaemonSet `nvidia-device-plugin-titan24` in
  `kube-system`, pinned by nodeSelector to hostname `titan-24` / arch `amd64`
  and run with `runtimeClassName: nvidia`; adds a RuntimeClass `nvidia` whose
  handler is `nvidia`; switches the active GPU profile in
  infrastructure/core/kustomization.yaml from `minipc-only` to `tethys-only`.
* nvidia-runtimeclass.yaml: `name: nvidia` is nested under `metadata:`. At
  column 0 it would leave `metadata` empty and the object without a
  `metadata.name`.
* profiles/tethys-only/kustomization.yaml: the header comment now matches the
  path the file is actually created at.
* The `index` blob ids of those two files are unchanged from the original
  patch and no longer match the corrected content; plain `git am` /
  `git apply` do not need them, `--3way` would.
* NOTE(review): no kustomization touched by this patch lists
  gpu/runtimeclass/nvidia-runtimeclass.yaml. Confirm the RuntimeClass is
  applied from somewhere else, since the DaemonSet depends on it.

 .../gpu/device-plugin-tethys/daemonset.yaml   | 48 +++++++++++++++++++
 .../device-plugin-tethys/kustomization.yaml   |  4 ++
 .../profiles/tethys-only/kustomization.yaml   |  5 ++
 .../gpu/runtimeclass/nvidia-runtimeclass.yaml |  5 ++
 infrastructure/core/kustomization.yaml        |  3 +-
 5 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 infrastructure/core/gpu/device-plugin-tethys/daemonset.yaml
 create mode 100644 infrastructure/core/gpu/device-plugin-tethys/kustomization.yaml
 create mode 100644 infrastructure/core/gpu/profiles/tethys-only/kustomization.yaml
 create mode 100644 infrastructure/core/gpu/runtimeclass/nvidia-runtimeclass.yaml

diff --git a/infrastructure/core/gpu/device-plugin-tethys/daemonset.yaml b/infrastructure/core/gpu/device-plugin-tethys/daemonset.yaml
new file mode 100644
index 0000000..44a47d9
--- /dev/null
+++ b/infrastructure/core/gpu/device-plugin-tethys/daemonset.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-titan24
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: nvidia-device-plugin
+    app.kubernetes.io/instance: titan24
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: nvidia-device-plugin
+      app.kubernetes.io/instance: titan24
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: nvidia-device-plugin
+        app.kubernetes.io/instance: titan24
+    spec:
+      nodeSelector:
+        kubernetes.io/hostname: titan-24
+        kubernetes.io/arch: amd64
+      tolerations:
+        - operator: Exists
+      priorityClassName: system-node-critical
+      runtimeClassName: nvidia
+      containers:
+        - name: nvidia-device-plugin-ctr
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.16.2
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--fail-on-init-error=false"
+            - "--device-list-strategy=envvar"
+            - "--mig-strategy=none"
+          securityContext:
+            privileged: true
+          env:
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: "all"
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: "compute,video,utility"
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/infrastructure/core/gpu/device-plugin-tethys/kustomization.yaml b/infrastructure/core/gpu/device-plugin-tethys/kustomization.yaml
new file mode 100644
index 0000000..48f1d3d
--- /dev/null
+++ b/infrastructure/core/gpu/device-plugin-tethys/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - daemonset.yaml
diff --git a/infrastructure/core/gpu/profiles/tethys-only/kustomization.yaml b/infrastructure/core/gpu/profiles/tethys-only/kustomization.yaml
new file mode 100644
index 0000000..c78c41c
--- /dev/null
+++ b/infrastructure/core/gpu/profiles/tethys-only/kustomization.yaml
@@ -0,0 +1,5 @@
+# infrastructure/core/gpu/profiles/tethys-only/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../../device-plugin-tethys
diff --git a/infrastructure/core/gpu/runtimeclass/nvidia-runtimeclass.yaml b/infrastructure/core/gpu/runtimeclass/nvidia-runtimeclass.yaml
new file mode 100644
index 0000000..635f2b6
--- /dev/null
+++ b/infrastructure/core/gpu/runtimeclass/nvidia-runtimeclass.yaml
@@ -0,0 +1,5 @@
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia
+handler: nvidia
diff --git a/infrastructure/core/kustomization.yaml b/infrastructure/core/kustomization.yaml
index 939a93d..f752d47 100644
--- a/infrastructure/core/kustomization.yaml
+++ b/infrastructure/core/kustomization.yaml
@@ -5,4 +5,5 @@ resources:
   - base
   # - gpu/profiles/jetson-only
   # - gpu/profiles/minipc-and-jetson
-  - gpu/profiles/minipc-only
+  # - gpu/profiles/minipc-only
+  - gpu/profiles/tethys-only
-- 
2.47.2