# services/monitoring/dcgm-exporter.yaml
#
# Deploys dcgm-exporter as a DaemonSet pinned to a fixed set of hosts and
# exposes its metrics port (9400) through a Service of the same name in the
# `monitoring` namespace.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  selector:
    matchLabels:
      app: dcgm-exporter
  updateStrategy:
    rollingUpdate:
      # Up to two exporter pods may be unavailable at once during a rollout.
      maxUnavailable: 2
  template:
    metadata:
      labels:
        app: dcgm-exporter
      annotations:
        # Annotation values must be strings, hence the quotes.
        # NOTE(review): presumably consumed by an annotation-based Prometheus
        # scrape config; confirm against the Prometheus setup.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9400"
    spec:
      serviceAccountName: default
      # Pods run under the RuntimeClass named `nvidia`.
      runtimeClassName: nvidia
      affinity:
        nodeAffinity:
          # Hard scheduling requirement: only the hosts listed below are
          # eligible to run this pod.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
      tolerations:
        # `Exists` with no key tolerates every taint on the selected nodes.
        - operator: Exists
      containers:
        - name: dcgm-exporter
          image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
          imagePullPolicy: Always
          ports:
            # Named port; the Service below targets it by name.
            - name: metrics
              containerPort: 9400
          env:
            # Env values must be strings, hence the quotes.
            # NOTE(review): presumably turns on the exporter's Kubernetes
            # pod-attribution mode; confirm against dcgm-exporter docs.
            - name: DCGM_EXPORTER_KUBERNETES
              value: "true"
          securityContext:
            # NOTE(review): presumably required for GPU device access;
            # confirm whether a narrower security context would suffice.
            privileged: true
          resources:
            # Only requests are declared; no limits are set.
            requests:
              cpu: 50m
              memory: 64Mi
          volumeMounts:
            - name: pod-resources
              mountPath: /var/lib/kubelet/pod-resources
      imagePullSecrets:
        - name: zot-regcred
      volumes:
        # Host directory mounted at the same path inside the container.
        # `type: Directory` requires the path to already exist on the node.
        - name: pod-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources
            type: Directory
---
apiVersion: v1
kind: Service
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  # Selects the pods created by the DaemonSet above.
  selector:
    app: dcgm-exporter
  ports:
    - name: metrics
      port: 9400
      # Resolved by name against the container's `metrics` port (9400).
      targetPort: metrics