gpu: enable time-slicing and refresh dashboards

This commit is contained in:
Brad Stein 2026-01-01 14:16:08 -03:00
parent 7020d53fd8
commit 6a76fc0fa3
15 changed files with 288 additions and 67 deletions

View File

@ -2,6 +2,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../components/device-plugin-config
- ../components/device-plugin-jetson
- ../components/device-plugin-minipc
- ../components/device-plugin-tethys

View File

@ -0,0 +1,15 @@
# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
#
# Shared configuration for the NVIDIA device plugin DaemonSets. Each DaemonSet
# mounts this ConfigMap at /config and loads it via
# --config-file=/config/config.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-device-plugin-config
  namespace: kube-system
data:
  # Time-slicing: every physical GPU is advertised as 4 schedulable replicas.
  # renameByDefault makes the plugin advertise the replicas as
  # nvidia.com/gpu.shared, so workloads must request nvidia.com/gpu.shared
  # rather than nvidia.com/gpu.
  # NOTE(review): per the NVIDIA docs, time-slicing shares compute only; there
  # is no memory or fault isolation between replicas. Confirm that 4 tenants
  # fit on the smallest GPU in the pool.
  config.yaml: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: true
        resources:
          - name: nvidia.com/gpu
            replicas: 4

View File

@ -0,0 +1,5 @@
# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
#
# Kustomize component that ships the shared nvidia-device-plugin-config
# ConfigMap. Profiles include it by listing ../components/device-plugin-config
# in their own resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- configmap.yaml

View File

@ -31,6 +31,7 @@ spec:
args:
- "--fail-on-init-error=false"
- "--device-list-strategy=envvar,cdi"
- "--config-file=/config/config.yaml"
securityContext:
privileged: true
env:
@ -41,7 +42,12 @@ spec:
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -32,6 +32,7 @@ spec:
- "--fail-on-init-error=false"
- "--device-list-strategy=envvar"
- "--mig-strategy=none"
- "--config-file=/config/config.yaml"
securityContext:
privileged: true
env:
@ -42,7 +43,12 @@ spec:
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -33,6 +33,7 @@ spec:
- "--fail-on-init-error=false"
- "--device-list-strategy=envvar"
- "--mig-strategy=none"
- "--config-file=/config/config.yaml"
securityContext:
privileged: true
env:
@ -43,7 +44,12 @@ spec:
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -2,4 +2,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../components/device-plugin-config
- ../components/device-plugin-tethys

View File

@ -171,9 +171,8 @@ def node_io_expr(scope=""):
def namespace_share_expr(resource_expr):
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
total = f"clamp_min(sum( {selected} ), 1)"
return f"100 * ( {selected} ) / {total}"
total = f"clamp_min(sum( {resource_expr} ), 1)"
return f"100 * ( {resource_expr} ) / {total}"
def namespace_cpu_share_expr():
@ -185,7 +184,10 @@ def namespace_ram_share_expr():
def namespace_gpu_share_expr():
return namespace_share_expr(NAMESPACE_GPU_RAW)
total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
return f"({share}) or {idle}"
PROBLEM_PODS_EXPR = (
@ -270,46 +272,20 @@ STUCK_TABLE_EXPR = (
")"
)
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"'
NAMESPACE_SCOPE_VAR = "$namespace_scope"
NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}'
NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}'
NAMESPACE_CPU_RAW = (
'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = (
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)'
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
NAMESPACE_GPU_ALLOC = (
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_USAGE_SHARE = (
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
NAMESPACE_GPU_RAW = (
"("
+ NAMESPACE_GPU_USAGE_SHARE
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_GPU_WEIGHT = (
"("
+ NAMESPACE_GPU_ALLOC
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_ACTIVITY_SCORE = (
"( "
+ NAMESPACE_CPU_RAW
+ " ) + ("
+ NAMESPACE_RAM_RAW
+ " / 1e9) + ("
+ NAMESPACE_GPU_WEIGHT
+ " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)'
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
TRAEFIK_NET_INGRESS = (
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
@ -588,6 +564,44 @@ def pie_panel(panel_id, title, expr, grid):
}
def namespace_scope_variable():
    """Build the Grafana ``namespace_scope`` custom template variable.

    Each option's value is a complete PromQL label matcher (one of the
    ``NAMESPACE_SCOPE_*`` constants). Panel queries splice the selected
    matcher into their selectors through ``$namespace_scope``.

    Returns:
        dict: A templating entry for a dashboard's ``templating.list``. The
        first option ("workload namespaces only") is the default selection.
    """
    options = [
        {
            "text": "workload namespaces only",
            "value": NAMESPACE_SCOPE_WORKLOAD,
            "selected": True,
        },
        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
        {
            "text": "infrastructure namespaces only",
            "value": NAMESPACE_SCOPE_INFRA,
            "selected": False,
        },
    ]
    # The query is a comma-separated list of "text : value" pairs. Deriving it
    # from `options` keeps the two representations from drifting apart.
    # NOTE(review): a value containing a comma would be split into separate
    # entries; none of the current matchers contain one.
    query = ",".join(f'{opt["text"]} : {opt["value"]}' for opt in options)
    default = options[0]
    return {
        "name": "namespace_scope",
        "label": "Namespace filter",
        "type": "custom",
        "query": query,
        "current": {
            "text": default["text"],
            "value": default["value"],
            "selected": True,
        },
        "options": options,
        "hide": 0,
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
def bargauge_panel(
panel_id,
title,
@ -1063,7 +1077,7 @@ def build_overview():
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "overview"],
"templating": {"list": []},
"templating": {"list": [namespace_scope_variable()]},
"time": {"from": "now-1h", "to": "now"},
"refresh": "1m",
"links": [],
@ -1757,6 +1771,7 @@ def build_gpu_dashboard():
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "gpu"],
"templating": {"list": [namespace_scope_variable()]},
}

View File

@ -16,10 +16,20 @@ spec:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
ai.bstein.dev/gpu: RTX 3080 8GB (titan-24)
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
spec:
nodeSelector:
kubernetes.io/hostname: titan-24
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
runtimeClassName: nvidia
volumes:
- name: models
@ -55,9 +65,9 @@ spec:
requests:
cpu: 250m
memory: 1Gi
nvidia.com/gpu: 1
nvidia.com/gpu.shared: 1
limits:
nvidia.com/gpu: 1
nvidia.com/gpu.shared: 1
containers:
- name: ollama
image: ollama/ollama:latest
@ -83,8 +93,8 @@ spec:
requests:
cpu: "2"
memory: 8Gi
nvidia.com/gpu: 1
nvidia.com/gpu.shared: 1
limits:
cpu: "4"
memory: 12Gi
nvidia.com/gpu: 1
nvidia.com/gpu.shared: 1

View File

@ -39,7 +39,7 @@ spec:
fieldPath: spec.nodeName
- name: AI_NODE_GPU_MAP
value: |
{"titan-24": "RTX 3080 8GB (local GPU)", "titan-22": "RTX 3050 8GB (local GPU)"}
{"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
ports:
- name: http
containerPort: 8080

View File

@ -68,8 +68,18 @@ spec:
volumeMounts:
- name: config
mountPath: /config
nodeSelector:
jellyfin: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
securityContext:
runAsUser: 1000
fsGroup: 65532
@ -96,11 +106,11 @@ spec:
value: "002"
resources:
limits:
nvidia.com/gpu: 1
nvidia.com/gpu.shared: 1
# cpu: "4"
# memory: 8Gi
requests:
nvidia.com/gpu: 1
nvidia.com/gpu.shared: 1
cpu: "500m"
memory: 1Gi
volumeMounts:

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -71,7 +71,7 @@
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -182,5 +182,43 @@
"tags": [
"atlas",
"gpu"
],
"templating": {
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": false
}
],
"hide": 0,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
}
}

View File

@ -1086,7 +1086,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1137,7 +1137,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1188,7 +1188,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1791,7 +1791,42 @@
"overview"
],
"templating": {
"list": []
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": false
}
],
"hide": 0,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
},
"time": {
"from": "now-1h",

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -80,7 +80,7 @@ data:
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -191,5 +191,43 @@ data:
"tags": [
"atlas",
"gpu"
],
"templating": {
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": false
}
],
"hide": 0,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
}
}

View File

@ -1095,7 +1095,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1146,7 +1146,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1197,7 +1197,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1800,7 +1800,42 @@ data:
"overview"
],
"templating": {
"list": []
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"selected": false
}
],
"hide": 0,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
},
"time": {
"from": "now-1h",