monitoring: rework gpu share + gauges

This commit is contained in:
Brad Stein 2025-11-18 12:11:47 -03:00
parent 497164a1ad
commit 8e6c0a3cfe
3 changed files with 144 additions and 207 deletions

View File

@ -238,7 +238,7 @@ NAMESPACE_GPU_RAW = (
+ NAMESPACE_GPU_ALLOC
+ " * 0)"
)
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
NAMESPACE_COMBINED_FILTER = (
'topk(10, ('
+ NAMESPACE_CPU_RAW
@ -319,6 +319,49 @@ def stat_panel(
return panel
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Build the dict describing a Grafana ``gauge`` panel.

    Args:
        panel_id: Value stored under the panel's ``"id"`` key.
        title: Value stored under the panel's ``"title"`` key.
        expr: Query expression placed in the single target (``refId`` "A").
        grid: Value stored under ``"gridPos"``.
        min_value: Lower bound written to ``fieldConfig.defaults.min``.
        max_value: Upper bound written to ``fieldConfig.defaults.max``.
        thresholds: Threshold configuration to use as-is. When falsy, a
            two-step absolute scheme is generated: green as the base step
            and red starting at ``max_value``.
        links: Optional panel links. The ``"links"`` key is only emitted
            when this is truthy.

    Returns:
        A dict with the panel definition, using ``PROM_DS`` as datasource.
    """
    # Fall back to a simple green -> red scheme when the caller gave none.
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": max_value},
            ],
        }

    field_defaults = {
        "min": min_value,
        "max": max_value,
        "thresholds": thresholds,
    }
    display_options = {
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        "orientation": "auto",
        "showThresholdMarkers": False,
        "showThresholdLabels": False,
    }

    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": display_options,
    }
    # Appended last so the key order matches a panel built in one literal.
    if links:
        panel["links"] = links
    return panel
def timeseries_panel(
panel_id,
title,
@ -472,7 +515,10 @@ def build_overview():
]
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
thresholds = None
min_value = 0
max_value = ok_value or 5
if panel_id == 1:
max_value = WORKER_TOTAL
thresholds = {
"mode": "absolute",
"steps": [
@ -483,6 +529,7 @@ def build_overview():
],
}
elif panel_id == 2:
max_value = CONTROL_TOTAL
thresholds = {
"mode": "absolute",
"steps": [
@ -491,6 +538,7 @@ def build_overview():
],
}
elif panel_id in (3, 4, 5):
max_value = 4
thresholds = {
"mode": "absolute",
"steps": [
@ -500,13 +548,22 @@ def build_overview():
{"color": "red", "value": 3},
],
}
else:
thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": max_value},
],
}
panels.append(
stat_panel(
gauge_panel(
panel_id,
title,
expr,
{"h": 5, "w": 4, "x": 4 * idx, "y": 0},
value_suffix=suffix,
min_value=min_value,
max_value=max_value,
thresholds=thresholds,
links=links,
)

View File

@ -9,7 +9,7 @@
"panels": [
{
"id": 1,
"type": "stat",
"type": "gauge",
"title": "Workers ready",
"datasource": {
"type": "prometheus",
@ -29,10 +29,8 @@
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
@ -53,19 +51,11 @@
"value": 18
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/18"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -73,12 +63,14 @@
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "stat",
"type": "gauge",
"title": "Control plane ready",
"datasource": {
"type": "prometheus",
@ -98,10 +90,8 @@
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 3,
"thresholds": {
"mode": "absolute",
"steps": [
@ -114,19 +104,11 @@
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/3"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -134,12 +116,14 @@
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 3,
"type": "stat",
"type": "gauge",
"title": "Control plane workloads",
"datasource": {
"type": "prometheus",
@ -159,10 +143,8 @@
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 4,
"thresholds": {
"mode": "absolute",
"steps": [
@ -183,18 +165,11 @@
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -202,7 +177,9 @@
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"links": [
{
@ -214,7 +191,7 @@
},
{
"id": 4,
"type": "stat",
"type": "gauge",
"title": "Problem pods",
"datasource": {
"type": "prometheus",
@ -234,10 +211,8 @@
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 4,
"thresholds": {
"mode": "absolute",
"steps": [
@ -258,18 +233,11 @@
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -277,7 +245,9 @@
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"links": [
{
@ -289,7 +259,7 @@
},
{
"id": 5,
"type": "stat",
"type": "gauge",
"title": "Stuck terminating",
"datasource": {
"type": "prometheus",
@ -309,10 +279,8 @@
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 4,
"thresholds": {
"mode": "absolute",
"steps": [
@ -333,18 +301,11 @@
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -352,7 +313,9 @@
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"links": [
{
@ -364,7 +327,7 @@
},
{
"id": 6,
"type": "stat",
"type": "gauge",
"title": "Running pods",
"datasource": {
"type": "prometheus",
@ -384,34 +347,25 @@
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 5,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "green",
"value": null
},
{
"color": "green",
"value": 1
"color": "red",
"value": 5
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -419,7 +373,9 @@
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
@ -722,7 +678,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -764,7 +720,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -806,7 +762,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1480,22 +1436,6 @@
"targetBlank": true
}
]
},
{
"id": 25,
"type": "text",
"title": "About this dashboard",
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 55
},
"datasource": null,
"options": {
"mode": "markdown",
"content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
}
}
],
"schemaVersion": 39,

View File

@ -18,7 +18,7 @@ data:
"panels": [
{
"id": 1,
"type": "stat",
"type": "gauge",
"title": "Workers ready",
"datasource": {
"type": "prometheus",
@ -38,10 +38,8 @@ data:
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 18,
"thresholds": {
"mode": "absolute",
"steps": [
@ -62,19 +60,11 @@ data:
"value": 18
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/18"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -82,12 +72,14 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 2,
"type": "stat",
"type": "gauge",
"title": "Control plane ready",
"datasource": {
"type": "prometheus",
@ -107,10 +99,8 @@ data:
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 3,
"thresholds": {
"mode": "absolute",
"steps": [
@ -123,19 +113,11 @@ data:
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/3"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -143,12 +125,14 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 3,
"type": "stat",
"type": "gauge",
"title": "Control plane workloads",
"datasource": {
"type": "prometheus",
@ -168,10 +152,8 @@ data:
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 4,
"thresholds": {
"mode": "absolute",
"steps": [
@ -192,18 +174,11 @@ data:
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -211,7 +186,9 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"links": [
{
@ -223,7 +200,7 @@ data:
},
{
"id": 4,
"type": "stat",
"type": "gauge",
"title": "Problem pods",
"datasource": {
"type": "prometheus",
@ -243,10 +220,8 @@ data:
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 4,
"thresholds": {
"mode": "absolute",
"steps": [
@ -267,18 +242,11 @@ data:
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -286,7 +254,9 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"links": [
{
@ -298,7 +268,7 @@ data:
},
{
"id": 5,
"type": "stat",
"type": "gauge",
"title": "Stuck terminating",
"datasource": {
"type": "prometheus",
@ -318,10 +288,8 @@ data:
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 4,
"thresholds": {
"mode": "absolute",
"steps": [
@ -342,18 +310,11 @@ data:
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -361,7 +322,9 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"links": [
{
@ -373,7 +336,7 @@ data:
},
{
"id": 6,
"type": "stat",
"type": "gauge",
"title": "Running pods",
"datasource": {
"type": "prometheus",
@ -393,34 +356,25 @@ data:
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"min": 0,
"max": 5,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"color": "green",
"value": null
},
{
"color": "green",
"value": 1
"color": "red",
"value": 5
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -428,7 +382,9 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
@ -731,7 +687,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -773,7 +729,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -815,7 +771,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1489,22 +1445,6 @@ data:
"targetBlank": true
}
]
},
{
"id": 25,
"type": "text",
"title": "About this dashboard",
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 55
},
"datasource": null,
"options": {
"mode": "markdown",
"content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
}
}
],
"schemaVersion": 39,