# services/monitoring/nvidia-process-exporter.yaml
#
# Deploys the nvidia-process-exporter: a per-node DaemonSet that serves
# metrics on port 9401, plus the ServiceAccount, RBAC and Service it uses.
# The exporter script itself is mounted from the ConfigMap
# "nvidia-process-exporter-script", which is not defined in this file.
---
# Identity the exporter pods run as (see serviceAccountName below).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
---
# Read-only (get/list/watch) access to pods in the core API group.
# NOTE(review): presumably used to attribute GPU processes to pods —
# confirm against exporter.py.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-process-exporter
rules:
  - apiGroups: [""]
    resources:
      - pods
    verbs: ["get", "list", "watch"]
---
# Grants the ClusterRole above to the exporter's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-process-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-process-exporter
subjects:
  - kind: ServiceAccount
    name: nvidia-process-exporter
    namespace: monitoring
---
# One exporter pod per eligible node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    matchLabels:
      app: nvidia-process-exporter
  # No explicit type is set; only the rolling-update parameter is given,
  # so pods are replaced one node at a time.
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: nvidia-process-exporter
      annotations:
        # Quoted on purpose: annotation values must be strings.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9401"
    spec:
      serviceAccountName: nvidia-process-exporter
      imagePullSecrets:
        - name: harbor-regcred
      runtimeClassName: nvidia
      # Share the host PID namespace with the container.
      hostPID: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions must match: amd64 nodes only, excluding
              # nodes labelled jetson=true. NotIn also matches nodes that
              # do not carry the "jetson" label at all.
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
                  - key: jetson
                    operator: NotIn
                    values:
                      - "true"
      tolerations:
        # A key-less Exists toleration matches every taint.
        - operator: Exists
      containers:
        - name: exporter
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 9401
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: NVIDIA_PROCESS_EXPORTER_PORT
              value: "9401"
            # Name of the node this pod is scheduled on (downward API).
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          # NOTE(review): the dependency is installed with pip on every
          # container start, so startup needs a reachable package index.
          # Consider baking nvidia-ml-py into a dedicated image.
          command:
            - sh
            - -lc
            - |
              pip install --no-cache-dir nvidia-ml-py==13.595.45
              exec python /etc/nvidia-process-exporter/exporter.py
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 96Mi
            limits:
              cpu: 250m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/nvidia-process-exporter
              readOnly: true
            - name: host-proc
              mountPath: /host/proc
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: nvidia-process-exporter-script
            # 365 decimal == 0555 octal (r-xr-xr-x). Written in decimal
            # because a leading-zero literal is octal only under YAML 1.1;
            # YAML 1.2 parsers read 0555 as decimal 555.
            defaultMode: 365
        - name: host-proc
          hostPath:
            path: /proc
            type: Directory
---
# Service in front of the exporter pods; forwards port 9401 to the
# container port named "metrics".
apiVersion: v1
kind: Service
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    app: nvidia-process-exporter
  ports:
    - name: metrics
      port: 9401
      targetPort: metrics