# titan-iac/services/monitoring/dcgm-exporter.yaml
#
# 81 lines
# 1.9 KiB
# YAML
# Raw Normal View History

# services/monitoring/dcgm-exporter.yaml
# DaemonSet that runs one dcgm-exporter pod on each of a fixed list of nodes
# and exposes its metrics endpoint on container port 9400.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  # Must match template.metadata.labels below.
  selector:
    matchLabels:
      app: dcgm-exporter
  updateStrategy:
    # RollingUpdate is already the DaemonSet default; spelled out so the
    # rollingUpdate settings below are obviously in effect.
    type: RollingUpdate
    rollingUpdate:
      # At most two exporter pods may be unavailable at once during a rollout.
      maxUnavailable: 2
  template:
    metadata:
      labels:
        app: dcgm-exporter
      annotations:
        # Annotation values must be strings, hence the quotes.
        # NOTE(review): these only have an effect if Prometheus is configured
        # for annotation-based pod discovery - confirm against the scrape config.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9400"
    spec:
      serviceAccountName: default
      # Pods are run under the `nvidia` RuntimeClass.
      runtimeClassName: nvidia
      affinity:
        nodeAffinity:
          # Hard requirement: only nodes whose hostname is listed get a pod.
          # NOTE(review): presumably these are the GPU nodes - confirm.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
      tolerations:
        # A key-less `Exists` toleration matches every taint, so taints on the
        # selected nodes never block scheduling of the exporter.
        - operator: Exists
      containers:
        - name: dcgm-exporter
          image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
          imagePullPolicy: Always
          ports:
            # Named port; the Service in this file targets it by name.
            - name: metrics
              containerPort: 9400
          env:
            # Env values must be strings, hence the quotes.
            # NOTE(review): expected to enable pod-level attribution via the
            # kubelet pod-resources mount below - confirm in exporter docs.
            - name: DCGM_EXPORTER_KUBERNETES
              value: "true"
          securityContext:
            # NOTE(review): full privilege is broad; check whether a narrower
            # capability set is sufficient before tightening.
            privileged: true
          resources:
            # Requests only; no limits are set for this container.
            requests:
              cpu: 50m
              memory: 64Mi
          volumeMounts:
            - name: pod-resources
              mountPath: /var/lib/kubelet/pod-resources
              # The exporter only needs to read from the kubelet directory,
              # so the mount is read-only.
              readOnly: true
      # Credentials used to pull the image from the private registry.
      imagePullSecrets:
        - name: zot-regcred
      volumes:
        - name: pod-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources
            # `Directory` requires the path to already exist on the node.
            type: Directory
---
# Service in front of the dcgm-exporter pods, exposing their metrics port
# on 9400 inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  # Selects pods carrying the `app: dcgm-exporter` label.
  selector:
    app: dcgm-exporter
  ports:
    - name: metrics
      port: 9400
      # Resolved by name against the selected pods' container ports, so the
      # Service keeps working if the container port number changes.
      targetPort: metrics