# services/monitoring/nvidia-process-exporter.yaml
# Identity used by the nvidia-process-exporter DaemonSet pods
# (referenced there through `serviceAccountName`).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: nvidia-process-exporter
---
# Cluster-wide, read-only access to Pod objects in the core API group.
# NOTE(review): presumably used by the exporter to attribute GPU processes
# to pods — confirm against exporter.py.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-process-exporter
rules:
  - apiGroups:
      - ""  # the empty string denotes the core API group
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
---
# Grants the nvidia-process-exporter ClusterRole to the ServiceAccount of the
# same name in the `monitoring` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-process-exporter
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: nvidia-process-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-process-exporter
---
# Runs one exporter pod per eligible node. The container installs
# nvidia-ml-py at startup and then executes exporter.py, which is mounted
# from the `nvidia-process-exporter-script` ConfigMap. That ConfigMap is not
# defined in this file and must already exist in the `monitoring` namespace.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    matchLabels:
      app: nvidia-process-exporter
  updateStrategy:
    # Stated explicitly so the rollingUpdate parameters below are
    # unambiguous; RollingUpdate is also the API default.
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: nvidia-process-exporter
      annotations:
        # Annotation values must be strings, hence the quotes.
        # NOTE(review): these only take effect if the Prometheus scrape
        # config honours prometheus.io/* pod annotations — confirm.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9401"
    spec:
      serviceAccountName: nvidia-process-exporter
      imagePullSecrets:
        - name: harbor-regcred
      runtimeClassName: nvidia
      # Share the host PID namespace with the container.
      # NOTE(review): presumably so PIDs reported by the driver can be
      # resolved under /host/proc — confirm against exporter.py.
      hostPID: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions belong to one term, so they are ANDed:
              # amd64 nodes whose `jetson` label is absent or not "true".
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
                  - key: jetson
                    operator: NotIn
                    values:
                      - "true"
      tolerations:
        # No key + `Exists` tolerates every taint, so the pod is scheduled
        # onto tainted nodes as well.
        - operator: Exists
      containers:
        - name: exporter
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 9401
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: NVIDIA_PROCESS_EXPORTER_PORT
              value: "9401"
            # Downward API: name of the node this pod is running on.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          command:
            - sh
            - -lc
            - |
              # Abort immediately if the dependency install fails, so the
              # container exits on the real cause instead of a later
              # Python error.
              set -e
              pip install --no-cache-dir nvidia-ml-py==13.595.45
              exec python /etc/nvidia-process-exporter/exporter.py
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 96Mi
            limits:
              cpu: 250m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/nvidia-process-exporter
              readOnly: true
            - name: host-proc
              mountPath: /host/proc
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: nvidia-process-exporter-script
            # Octal r-xr-xr-x (decimal 365). Must stay an unquoted integer:
            # Kubernetes reads a leading-zero YAML integer as octal, and a
            # quoted string would be rejected.
            defaultMode: 0555
        - name: host-proc
          hostPath:
            path: /proc
            type: Directory
---
# Exposes the exporter pods' metrics endpoint on port 9401. `targetPort`
# refers to the container port named `metrics` in the DaemonSet.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: nvidia-process-exporter
  labels:
    app: nvidia-process-exporter
spec:
  ports:
    - name: metrics
      targetPort: metrics
      port: 9401
  selector:
    app: nvidia-process-exporter
|