# services/monitoring/nvidia-process-exporter.yaml
# ServiceAccount that the exporter DaemonSet pods run as (it is referenced
# through `serviceAccountName` in the DaemonSet pod template).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: nvidia-process-exporter
---
# Cluster-wide, read-only access to Pod objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-process-exporter
rules:
  - apiGroups:
      - ""  # the empty string selects the core API group
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
---
# Grants the `nvidia-process-exporter` ClusterRole to the ServiceAccount of
# the same name in the `monitoring` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-process-exporter
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: nvidia-process-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-process-exporter
---
# DaemonSet that runs the exporter on every eligible node and serves its
# metrics on container port 9401 (named `metrics`).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-process-exporter
  namespace: monitoring
  labels:
    app: nvidia-process-exporter
spec:
  selector:
    matchLabels:
      app: nvidia-process-exporter
  updateStrategy:
    # Stated explicitly for readers; RollingUpdate is also the API default.
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: nvidia-process-exporter
      annotations:
        # Annotation values must be strings, hence the quotes.
        prometheus.io/scrape: "true"
        prometheus.io/port: "9401"
    spec:
      serviceAccountName: nvidia-process-exporter
      imagePullSecrets:
        - name: harbor-regcred
      runtimeClassName: nvidia
      # The pod shares the host PID namespace.
      # NOTE(review): presumably so PIDs seen by the exporter match host
      # processes under /host/proc - confirm against exporter.py.
      hostPID: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions must match: amd64 nodes only, and never
              # nodes labelled jetson=true.
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
                  - key: jetson
                    operator: NotIn
                    values:
                      - "true"
      tolerations:
        # No key or effect given, so every taint is tolerated.
        - operator: Exists
      containers:
        - name: exporter
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 9401
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: NVIDIA_PROCESS_EXPORTER_PORT
              value: "9401"
            # Downward API: name of the node this pod is scheduled on.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          # `set -e` aborts the script as soon as `pip install` fails, so the
          # container exits with the real installation error instead of
          # starting the exporter without its dependency.
          command:
            - sh
            - -lc
            - |
              set -e
              pip install --no-cache-dir nvidia-ml-py==13.595.45
              exec python /etc/nvidia-process-exporter/exporter.py
          securityContext:
            # NOTE(review): privileged grants broad host access; confirm the
            # exporter really needs it.
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 96Mi
            limits:
              cpu: 250m
              memory: 256Mi
          volumeMounts:
            - name: script
              mountPath: /etc/nvidia-process-exporter
              readOnly: true
            - name: host-proc
              mountPath: /host/proc
              readOnly: true
      volumes:
        # The ConfigMap is not defined in this file; it must already exist
        # in the `monitoring` namespace.
        - name: script
          configMap:
            name: nvidia-process-exporter-script
            # Decimal 365 == octal 0555 (r-xr-xr-x). Decimal is used because
            # not every YAML parser treats a leading 0 as octal.
            defaultMode: 365
        - name: host-proc
          hostPath:
            path: /proc
            type: Directory
---
# Service in front of the exporter pods, exposing port 9401.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: nvidia-process-exporter
  labels:
    app: nvidia-process-exporter
spec:
  ports:
    - name: metrics
      port: 9401
      # Resolved by name against the container port called `metrics`.
      targetPort: metrics
  selector:
    app: nvidia-process-exporter