monitoring(gpu): clarify reservation accounting
This commit is contained in:
parent
39db0471d7
commit
d9955af899
@ -304,6 +304,10 @@ def gpu_requests_by_namespace_node(scope_var):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_requests_by_namespace(scope_var):
|
||||||
|
return f"sum by (namespace) ({gpu_requests_by_namespace_node(scope_var)})"
|
||||||
|
|
||||||
|
|
||||||
def gpu_usage_by_namespace(scope_var):
|
def gpu_usage_by_namespace(scope_var):
|
||||||
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
|
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
|
||||||
total_by_node = f"sum by (node) ({requests_by_ns})"
|
total_by_node = f"sum by (node) ({requests_by_ns})"
|
||||||
@ -340,10 +344,10 @@ def namespace_ram_share_expr(scope_var):
|
|||||||
|
|
||||||
|
|
||||||
def namespace_gpu_share_expr(scope_var):
|
def namespace_gpu_share_expr(scope_var):
|
||||||
usage = f"max_over_time(({namespace_gpu_usage_instant(scope_var)})[$__range:$__interval])"
|
reservation = f"max_over_time(({gpu_requests_by_namespace(scope_var)})[$__range:$__interval])"
|
||||||
total = f"(sum({usage}) or on() vector(0))"
|
total = f"(sum({reservation}) or on() vector(0))"
|
||||||
present = f"(count({usage}) or on() vector(0))"
|
present = f"(count({reservation}) or on() vector(0))"
|
||||||
share = f"100 * ({usage}) / clamp_min({total}, 1)"
|
share = f"100 * ({reservation}) / clamp_min({total}, 1)"
|
||||||
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + present + " == 0)"
|
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + present + " == 0)"
|
||||||
return f"({share}) or ({idle})"
|
return f"({share}) or ({idle})"
|
||||||
|
|
||||||
@ -1802,7 +1806,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
|||||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||||
"Namespace GPU Share": "GPU share by namespace in the selected scope; idle is good unless GPU work is expected.",
|
"Namespace GPU Reservation": "GPU reservation share by namespace in the selected scope. This is allocation, not per-process GPU utilization.",
|
||||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||||
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
||||||
@ -2815,11 +2819,11 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
12,
|
12,
|
||||||
"Namespace GPU Share",
|
"Namespace GPU Reservation",
|
||||||
namespace_gpu_share_expr(gpu_scope),
|
namespace_gpu_share_expr(gpu_scope),
|
||||||
{"h": 9, "w": 8, "x": 8, "y": 23},
|
{"h": 9, "w": 8, "x": 8, "y": 23},
|
||||||
links=namespace_scope_links("namespace_scope_gpu"),
|
links=namespace_scope_links("namespace_scope_gpu"),
|
||||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -5336,23 +5340,24 @@ def build_gpu_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
1,
|
1,
|
||||||
"Namespace GPU Share",
|
"Namespace GPU Reservation",
|
||||||
namespace_gpu_share_expr(gpu_scope),
|
namespace_gpu_share_expr(gpu_scope),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||||
links=namespace_scope_links("namespace_scope_gpu"),
|
links=namespace_scope_links("namespace_scope_gpu"),
|
||||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
2,
|
2,
|
||||||
"GPU Util by Namespace",
|
"GPU Activity by Reservation",
|
||||||
namespace_gpu_usage_instant(gpu_scope),
|
namespace_gpu_usage_instant(gpu_scope),
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 0},
|
{"h": 8, "w": 12, "x": 12, "y": 0},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
legend="{{namespace}}",
|
legend="{{namespace}}",
|
||||||
legend_display="table",
|
legend_display="table",
|
||||||
legend_placement="right",
|
legend_placement="right",
|
||||||
|
description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -5370,11 +5375,12 @@ def build_gpu_dashboard():
|
|||||||
panels.append(
|
panels.append(
|
||||||
table_panel(
|
table_panel(
|
||||||
4,
|
4,
|
||||||
"Top Pods by GPU Util",
|
"GPU Pods Reporting Device Util",
|
||||||
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
|
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 8},
|
{"h": 8, "w": 12, "x": 12, "y": 8},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||||
|
description="DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value.",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return {
|
return {
|
||||||
|
|||||||
@ -154,9 +154,9 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
|
|||||||
assert "backup-telemetry-missing" in pvc_backup_expr
|
assert "backup-telemetry-missing" in pvc_backup_expr
|
||||||
assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr
|
assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr
|
||||||
|
|
||||||
gpu_expr = panels_by_title["Namespace GPU Share"]["targets"][0]["expr"]
|
gpu_expr = panels_by_title["Namespace GPU Reservation"]["targets"][0]["expr"]
|
||||||
assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr
|
assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr
|
||||||
assert "/ on(node) group_left() clamp_min" in gpu_expr
|
assert "sum by (namespace)" in gpu_expr
|
||||||
assert "kube_node_status_allocatable" in gpu_expr
|
assert "kube_node_status_allocatable" in gpu_expr
|
||||||
assert "kube_node_labels" not in gpu_expr
|
assert "kube_node_labels" not in gpu_expr
|
||||||
assert "count(max_over_time(" in gpu_expr
|
assert "count(max_over_time(" in gpu_expr
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU Share",
|
"title": "Namespace GPU Reservation",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -71,12 +71,12 @@
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "GPU Util by Namespace",
|
"title": "GPU Activity by Reservation",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -108,7 +108,8 @@
|
|||||||
"tooltip": {
|
"tooltip": {
|
||||||
"mode": "multi"
|
"mode": "multi"
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
@ -150,7 +151,7 @@
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top Pods by GPU Util",
|
"title": "GPU Pods Reporting Device Util",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -185,7 +186,8 @@
|
|||||||
"id": "labelsToFields",
|
"id": "labelsToFields",
|
||||||
"options": {}
|
"options": {}
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -3715,7 +3715,7 @@
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU Share",
|
"title": "Namespace GPU Reservation",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -3728,7 +3728,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -3779,7 +3779,7 @@
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
|
|||||||
@ -16,7 +16,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU Share",
|
"title": "Namespace GPU Reservation",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -80,12 +80,12 @@ data:
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "GPU Util by Namespace",
|
"title": "GPU Activity by Reservation",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -117,7 +117,8 @@ data:
|
|||||||
"tooltip": {
|
"tooltip": {
|
||||||
"mode": "multi"
|
"mode": "multi"
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 3,
|
"id": 3,
|
||||||
@ -159,7 +160,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Top Pods by GPU Util",
|
"title": "GPU Pods Reporting Device Util",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -194,7 +195,8 @@ data:
|
|||||||
"id": "labelsToFields",
|
"id": "labelsToFields",
|
||||||
"options": {}
|
"options": {}
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -3724,7 +3724,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU Share",
|
"title": "Namespace GPU Reservation",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -3737,7 +3737,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -3788,7 +3788,7 @@ data:
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user