feature/atlas-monitoring #3

Merged
bstein merged 71 commits from feature/atlas-monitoring into main 2025-12-02 20:52:36 +00:00
3 changed files with 144 additions and 207 deletions
Showing only changes of commit 8e6c0a3cfe - Show all commits

View File

@ -238,7 +238,7 @@ NAMESPACE_GPU_RAW = (
+ NAMESPACE_GPU_ALLOC + NAMESPACE_GPU_ALLOC
+ " * 0)" + " * 0)"
) )
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
NAMESPACE_COMBINED_FILTER = ( NAMESPACE_COMBINED_FILTER = (
'topk(10, (' 'topk(10, ('
+ NAMESPACE_CPU_RAW + NAMESPACE_CPU_RAW
@ -319,6 +319,49 @@ def stat_panel(
return panel return panel
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Build the dict describing a single-query panel of type "gauge".

    Args:
        panel_id: Value stored under the panel's "id" key.
        title: Value stored under the panel's "title" key.
        expr: Query expression for the panel's only target (refId "A").
        grid: Value stored under "gridPos".
        min_value: Lower bound written to the field defaults ("min").
        max_value: Upper bound written to the field defaults ("max"); also
            the value of the red step in the fallback thresholds.
        thresholds: Thresholds mapping to embed as-is. Any falsy value
            (None, empty dict) selects the green -> red-at-max fallback.
        links: Optional panel links. The "links" key is only emitted when
            this is truthy.

    Returns:
        A new dict for the panel, using the module-level PROM_DS as its
        datasource.
    """
    effective_thresholds = thresholds
    if not effective_thresholds:
        # Fallback: green base step, switching to red at max_value.
        effective_thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": max_value},
            ],
        }

    field_defaults = {
        "min": min_value,
        "max": max_value,
        "thresholds": effective_thresholds,
    }
    reduce_options = {
        "calcs": ["lastNotNull"],
        "fields": "",
        "values": False,
    }
    display_options = {
        "reduceOptions": reduce_options,
        "orientation": "auto",
        "showThresholdMarkers": False,
        "showThresholdLabels": False,
    }

    # Keys are inserted in the same order as before so serialized output
    # of the panel keeps its key order.
    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": display_options,
    }
    if links:
        panel["links"] = links
    return panel
def timeseries_panel( def timeseries_panel(
panel_id, panel_id,
title, title,
@ -472,7 +515,10 @@ def build_overview():
] ]
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
thresholds = None thresholds = None
min_value = 0
max_value = ok_value or 5
if panel_id == 1: if panel_id == 1:
max_value = WORKER_TOTAL
thresholds = { thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -483,6 +529,7 @@ def build_overview():
], ],
} }
elif panel_id == 2: elif panel_id == 2:
max_value = CONTROL_TOTAL
thresholds = { thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -491,6 +538,7 @@ def build_overview():
], ],
} }
elif panel_id in (3, 4, 5): elif panel_id in (3, 4, 5):
max_value = 4
thresholds = { thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -500,13 +548,22 @@ def build_overview():
{"color": "red", "value": 3}, {"color": "red", "value": 3},
], ],
} }
else:
thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": max_value},
],
}
panels.append( panels.append(
stat_panel( gauge_panel(
panel_id, panel_id,
title, title,
expr, expr,
{"h": 5, "w": 4, "x": 4 * idx, "y": 0}, {"h": 5, "w": 4, "x": 4 * idx, "y": 0},
value_suffix=suffix, min_value=min_value,
max_value=max_value,
thresholds=thresholds, thresholds=thresholds,
links=links, links=links,
) )

View File

@ -9,7 +9,7 @@
"panels": [ "panels": [
{ {
"id": 1, "id": 1,
"type": "stat", "type": "gauge",
"title": "Workers ready", "title": "Workers ready",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -29,10 +29,8 @@
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 18,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -53,19 +51,11 @@
"value": 18 "value": 18
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/18"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -73,12 +63,14 @@
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
} }
}, },
{ {
"id": 2, "id": 2,
"type": "stat", "type": "gauge",
"title": "Control plane ready", "title": "Control plane ready",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -98,10 +90,8 @@
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 3,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -114,19 +104,11 @@
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/3"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -134,12 +116,14 @@
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
} }
}, },
{ {
"id": 3, "id": 3,
"type": "stat", "type": "gauge",
"title": "Control plane workloads", "title": "Control plane workloads",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -159,10 +143,8 @@
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 4,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -183,18 +165,11 @@
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -202,7 +177,9 @@
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}, },
"links": [ "links": [
{ {
@ -214,7 +191,7 @@
}, },
{ {
"id": 4, "id": 4,
"type": "stat", "type": "gauge",
"title": "Problem pods", "title": "Problem pods",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -234,10 +211,8 @@
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 4,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -258,18 +233,11 @@
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -277,7 +245,9 @@
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}, },
"links": [ "links": [
{ {
@ -289,7 +259,7 @@
}, },
{ {
"id": 5, "id": 5,
"type": "stat", "type": "gauge",
"title": "Stuck terminating", "title": "Stuck terminating",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -309,10 +279,8 @@
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 4,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -333,18 +301,11 @@
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -352,7 +313,9 @@
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}, },
"links": [ "links": [
{ {
@ -364,7 +327,7 @@
}, },
{ {
"id": 6, "id": 6,
"type": "stat", "type": "gauge",
"title": "Running pods", "title": "Running pods",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -384,34 +347,25 @@
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 5,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "rgba(115, 115, 115, 1)", "color": "green",
"value": null "value": null
}, },
{ {
"color": "green", "color": "red",
"value": 1 "value": 5
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -419,7 +373,9 @@
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
} }
}, },
{ {
@ -722,7 +678,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -764,7 +720,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -806,7 +762,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -1480,22 +1436,6 @@
"targetBlank": true "targetBlank": true
} }
] ]
},
{
"id": 25,
"type": "text",
"title": "About this dashboard",
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 55
},
"datasource": null,
"options": {
"mode": "markdown",
"content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
}
} }
], ],
"schemaVersion": 39, "schemaVersion": 39,

View File

@ -18,7 +18,7 @@ data:
"panels": [ "panels": [
{ {
"id": 1, "id": 1,
"type": "stat", "type": "gauge",
"title": "Workers ready", "title": "Workers ready",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -38,10 +38,8 @@ data:
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 18,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -62,19 +60,11 @@ data:
"value": 18 "value": 18
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/18"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -82,12 +72,14 @@ data:
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
} }
}, },
{ {
"id": 2, "id": 2,
"type": "stat", "type": "gauge",
"title": "Control plane ready", "title": "Control plane ready",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -107,10 +99,8 @@ data:
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 3,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -123,19 +113,11 @@ data:
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": "/3"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -143,12 +125,14 @@ data:
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
} }
}, },
{ {
"id": 3, "id": 3,
"type": "stat", "type": "gauge",
"title": "Control plane workloads", "title": "Control plane workloads",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -168,10 +152,8 @@ data:
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 4,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -192,18 +174,11 @@ data:
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -211,7 +186,9 @@ data:
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}, },
"links": [ "links": [
{ {
@ -223,7 +200,7 @@ data:
}, },
{ {
"id": 4, "id": 4,
"type": "stat", "type": "gauge",
"title": "Problem pods", "title": "Problem pods",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -243,10 +220,8 @@ data:
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 4,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -267,18 +242,11 @@ data:
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -286,7 +254,9 @@ data:
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}, },
"links": [ "links": [
{ {
@ -298,7 +268,7 @@ data:
}, },
{ {
"id": 5, "id": 5,
"type": "stat", "type": "gauge",
"title": "Stuck terminating", "title": "Stuck terminating",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -318,10 +288,8 @@ data:
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 4,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
@ -342,18 +310,11 @@ data:
"value": 3 "value": 3
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -361,7 +322,9 @@ data:
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}, },
"links": [ "links": [
{ {
@ -373,7 +336,7 @@ data:
}, },
{ {
"id": 6, "id": 6,
"type": "stat", "type": "gauge",
"title": "Running pods", "title": "Running pods",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -393,34 +356,25 @@ data:
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "min": 0,
"mode": "palette-classic" "max": 5,
},
"mappings": [],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "rgba(115, 115, 115, 1)", "color": "green",
"value": null "value": null
}, },
{ {
"color": "green", "color": "red",
"value": 1 "value": 5
} }
] ]
},
"unit": "none",
"custom": {
"displayMode": "auto"
} }
}, },
"overrides": [] "overrides": []
}, },
"options": { "options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": { "reduceOptions": {
"calcs": [ "calcs": [
"lastNotNull" "lastNotNull"
@ -428,7 +382,9 @@ data:
"fields": "", "fields": "",
"values": false "values": false
}, },
"textMode": "value" "orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
} }
}, },
{ {
@ -731,7 +687,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -773,7 +729,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -815,7 +771,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -1489,22 +1445,6 @@ data:
"targetBlank": true "targetBlank": true
} }
] ]
},
{
"id": 25,
"type": "text",
"title": "About this dashboard",
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 55
},
"datasource": null,
"options": {
"mode": "markdown",
"content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
}
} }
], ],
"schemaVersion": 39, "schemaVersion": 39,