monitoring: unify gpu namespace usage
This commit is contained in:
parent
2fe763189d
commit
ba16f5119b
@ -208,32 +208,53 @@ def namespace_ram_raw(scope_var):
|
|||||||
|
|
||||||
|
|
||||||
def namespace_gpu_usage_instant(scope_var):
|
def namespace_gpu_usage_instant(scope_var):
|
||||||
dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
|
return gpu_usage_by_namespace(scope_var)
|
||||||
jetson = jetson_gpu_usage_by_namespace(scope_var)
|
|
||||||
merged = (
|
|
||||||
f'label_replace({dcgm}, "source", "dcgm", "", "") '
|
|
||||||
f'or label_replace({jetson}, "source", "jetson", "", "")'
|
|
||||||
)
|
|
||||||
return f"sum by (namespace) ({merged})"
|
|
||||||
|
|
||||||
|
|
||||||
def jetson_gpu_util_by_node():
|
def jetson_gpu_util_by_node():
|
||||||
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
|
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
|
||||||
|
|
||||||
|
|
||||||
def jetson_gpu_util_by_hostname():
|
def dcgm_gpu_util_by_node():
|
||||||
|
dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
|
||||||
|
dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
|
||||||
return (
|
return (
|
||||||
'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), '
|
"avg by (node) ("
|
||||||
'"Hostname", "$1", "node", "(.*)")'
|
f"{dcgm_ns} * on(namespace,pod) group_left(node) "
|
||||||
|
'kube_pod_info{namespace="monitoring"}'
|
||||||
|
")"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def jetson_gpu_requests(scope_var):
|
def gpu_util_by_node():
|
||||||
|
return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_util_by_hostname():
|
||||||
|
return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_node_labels():
|
||||||
|
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_requests_by_namespace_node(scope_var):
|
||||||
return (
|
return (
|
||||||
"sum by (namespace,node) ("
|
"sum by (namespace,node) ("
|
||||||
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
|
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
|
||||||
"* on(namespace,pod) group_left(node) kube_pod_info "
|
"* on(namespace,pod) group_left(node) kube_pod_info "
|
||||||
'* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}'
|
f"* on(node) group_left() {gpu_node_labels()}"
|
||||||
|
")"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_usage_by_namespace(scope_var):
|
||||||
|
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
|
||||||
|
total_by_node = f"sum by (node) ({requests_by_ns})"
|
||||||
|
return (
|
||||||
|
"sum by (namespace) ("
|
||||||
|
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
|
||||||
|
f"* on(node) group_left() {gpu_util_by_node()}"
|
||||||
")"
|
")"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -2695,7 +2716,7 @@ def build_gpu_dashboard():
|
|||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
3,
|
3,
|
||||||
"GPU Util by Node",
|
"GPU Util by Node",
|
||||||
f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})',
|
gpu_util_by_hostname(),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 8},
|
{"h": 8, "w": 12, "x": 0, "y": 8},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{Hostname}}",
|
legend="{{Hostname}}",
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
|
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -89,7 +89,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
|
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -126,7 +126,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
|
"expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{Hostname}}"
|
"legendFormat": "{{Hostname}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1901,7 +1901,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
|
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
|
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -98,7 +98,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
|
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -135,7 +135,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
|
"expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{Hostname}}"
|
"legendFormat": "{{Hostname}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1910,7 +1910,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
|
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user