From b5dc723e02227cec85a70ae68c26478ca70acdfb Mon Sep 17 00:00:00 2001 From: jenkins Date: Fri, 22 May 2026 02:35:08 -0300 Subject: [PATCH] monitoring(gpu): hide zero-utilization namespaces --- scripts/dashboards_render_atlas.py | 3 ++- services/monitoring/dashboards/atlas-gpu.json | 4 ++-- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 4 ++-- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index bf02b442..c9752493 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -344,11 +344,12 @@ def namespace_ram_share_expr(scope_var): def nvidia_process_gpu_usage_by_namespace(scope_var): - return ( + usage = ( "sum by (namespace) (" f"nvidia_namespace_gpu_sm_util_percent{{{namespace_gpu_selector(scope_var)}}}" ")" ) + return f"(({usage}) > 0)" def nvidia_process_gpu_present(): diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index b5cbb675..6084d726 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", + "expr": "((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})", + "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 500bed23..4bdf9dca 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -3728,7 +3728,7 @@ }, "targets": [ { - "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", + "expr": "((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 21c8716b..c5610512 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", + "expr": "((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})", + "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index add4b664..7714bbb6 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -3737,7 +3737,7 @@ data: }, "targets": [ { - "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", + "expr": "((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" }