titan-iac/services/monitoring/dcgm-exporter.yaml

75 lines
1.8 KiB
YAML

# services/monitoring/dcgm-exporter.yaml
# DaemonSet that runs one dcgm-exporter pod on each of the pinned nodes and
# exposes its metrics on container port 9400.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  selector:
    matchLabels:
      app: dcgm-exporter
  template:
    metadata:
      # Must stay in sync with spec.selector.matchLabels above.
      labels:
        app: dcgm-exporter
      # Annotation values are strings, so the boolean/number stay quoted.
      # NOTE(review): presumably consumed by an annotation-based Prometheus
      # scrape config - confirm against the Prometheus setup.
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9400"
    spec:
      serviceAccountName: default
      # Hard scheduling requirement: only the hosts listed here are eligible.
      # NOTE(review): presumably these are the GPU nodes - confirm, and keep
      # the list updated when nodes are added or removed.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
      # A toleration with only `operator: Exists` (no key) matches every
      # taint, so taints never keep this pod off the selected nodes.
      tolerations:
        - operator: Exists
      containers:
        - name: dcgm-exporter
          image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
          imagePullPolicy: IfNotPresent
          ports:
            # The Service targets this port by name.
            - name: metrics
              containerPort: 9400
          env:
            # Env values must be strings, hence the quoted "true".
            # NOTE(review): presumably enables pod-level attribution of GPU
            # metrics via the pod-resources mount below - confirm against
            # the dcgm-exporter documentation.
            - name: DCGM_EXPORTER_KUBERNETES
              value: "true"
          # NOTE(review): privileged grants broad host access; confirm it
          # is actually required before keeping it.
          securityContext:
            privileged: true
          # Only requests are set; no limits are declared for this container.
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
          volumeMounts:
            - name: pod-resources
              mountPath: /var/lib/kubelet/pod-resources
      volumes:
        # Host directory mounted into the container at the same path.
        # `type: Directory` requires the path to already exist on the node.
        - name: pod-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources
            type: Directory
---
# Service in front of the dcgm-exporter pods, exposing their metrics
# endpoint on port 9400.
apiVersion: v1
kind: Service
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  # Selects the pods labelled `app: dcgm-exporter` by the DaemonSet template.
  selector:
    app: dcgm-exporter
  ports:
    # targetPort refers to the container port by its name ("metrics"),
    # so the numeric container port can change without editing the Service.
    - name: metrics
      port: 9400
      targetPort: metrics