# Captured from: titan-iac/services/monitoring/nvidia-process-exporter.yaml
# (137 lines, 3.4 KiB, YAML)

# services/monitoring/nvidia-process-exporter.yaml
# Identity the exporter pods run as; referenced by the DaemonSet's
# serviceAccountName and by the ClusterRoleBinding subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: nvidia-process-exporter
---
# Cluster-wide, read-only access to Pod objects (core API group).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-process-exporter
rules:
  - apiGroups:
      # The empty string selects the core ("") API group.
      - ""
    resources: ["pods"]
    verbs:
      - get
      - list
      - watch
---
# Grants the nvidia-process-exporter ClusterRole to the ServiceAccount of the
# same name in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-process-exporter
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: nvidia-process-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-process-exporter
---
# DaemonSet that runs the NVIDIA process exporter container on each matching
# node. The container declares a `metrics` port (9401) and runs exporter.py
# from the nvidia-process-exporter-script ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    matchLabels:
      app: nvidia-process-exporter
  updateStrategy:
    # RollingUpdate is the DaemonSet default; stated explicitly so the
    # rollingUpdate settings below are unambiguous.
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: nvidia-process-exporter
      annotations:
        # Annotation values must be strings, hence the quotes.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9401"
    spec:
      serviceAccountName: nvidia-process-exporter
      imagePullSecrets:
        - name: harbor-regcred
      runtimeClassName: nvidia
      # Share the host PID namespace with the container.
      hostPID: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions must hold: amd64 nodes that are not
              # labelled jetson=true.
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
                  - key: jetson
                    operator: NotIn
                    values:
                      - "true"
      tolerations:
        # No key or effect given: tolerates every taint.
        - operator: Exists
      containers:
        - name: exporter
          # NOTE(review): public image referenced by mutable tag; consider a
          # mirrored and/or digest-pinned image — confirm registry policy.
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 9401
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: NVIDIA_PROCESS_EXPORTER_PORT
              value: "9401"
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          # Installs the pinned nvidia-ml-py package at start-up, then
          # replaces the shell with the exporter process. `set -e` aborts
          # immediately if the install fails instead of falling through to
          # `exec python` and failing later with a less direct error.
          command:
            - sh
            - -lc
            - |
              set -e
              pip install --no-cache-dir nvidia-ml-py==13.595.45
              exec python /etc/nvidia-process-exporter/exporter.py
          # The pod is reported Ready only once something accepts TCP
          # connections on the metrics port, so a rolling update does not
          # move on while the container is still installing dependencies or
          # crash-looping. Presumes exporter.py listens on
          # NVIDIA_PROCESS_EXPORTER_PORT — confirm against the script.
          readinessProbe:
            tcpSocket:
              port: metrics
            initialDelaySeconds: 5
            periodSeconds: 10
            failureThreshold: 3
          securityContext:
            # NOTE(review): privileged together with hostPID is a broad
            # grant; confirm whether the exporter needs full privilege.
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 96Mi
            limits:
              cpu: 250m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/nvidia-process-exporter
              readOnly: true
            - name: host-proc
              mountPath: /host/proc
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: nvidia-process-exporter-script
            # Decimal form of octal 0555 (r-xr-xr-x). A leading-zero literal
            # is only read as octal by YAML 1.1 parsers; decimal is
            # unambiguous and stays within the 0-511 range the API accepts.
            defaultMode: 365
        - name: host-proc
          # The node's /proc, mounted read-only at /host/proc above.
          hostPath:
            path: /proc
            type: Directory
---
# Service exposing port 9401 and forwarding to the `metrics` named port of
# pods labelled app=nvidia-process-exporter.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: nvidia-process-exporter
  labels:
    app: nvidia-process-exporter
spec:
  ports:
    - name: metrics
      # targetPort refers to the container port by name, not by number.
      targetPort: metrics
      port: 9401
  selector:
    app: nvidia-process-exporter