diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index bce5bfe..937dfb7 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -238,7 +238,7 @@ NAMESPACE_GPU_RAW = ( + NAMESPACE_GPU_ALLOC + " * 0)" ) -NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW +NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW @@ -319,6 +319,49 @@ def stat_panel( return panel +def gauge_panel( + panel_id, + title, + expr, + grid, + *, + min_value=0, + max_value=1, + thresholds=None, + links=None, +): + return { + "id": panel_id, + "type": "gauge", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": { + "defaults": { + "min": min_value, + "max": max_value, + "thresholds": thresholds + or { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": max_value}, + ], + }, + }, + "overrides": [], + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "orientation": "auto", + "showThresholdMarkers": False, + "showThresholdLabels": False, + }, + **({"links": links} if links else {}), + } + + def timeseries_panel( panel_id, title, @@ -472,7 +515,10 @@ def build_overview(): ] for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None + min_value = 0 + max_value = ok_value or 5 if panel_id == 1: + max_value = WORKER_TOTAL thresholds = { "mode": "absolute", "steps": [ @@ -483,6 +529,7 @@ def build_overview(): ], } elif panel_id == 2: + max_value = CONTROL_TOTAL thresholds = { "mode": "absolute", "steps": [ @@ -491,6 +538,7 @@ def build_overview(): ], } elif panel_id in (3, 4, 5): + max_value = 4 thresholds = { "mode": "absolute", "steps": [ @@ -500,13 +548,22 @@ def build_overview(): {"color": "red", "value": 3}, ], } + else: + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": max_value}, + ], + } panels.append( - stat_panel( + gauge_panel( panel_id, title, expr, {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, - value_suffix=suffix, + min_value=min_value, + max_value=max_value, thresholds=thresholds, links=links, ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 5953697..ad460bb 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -9,7 +9,7 @@ "panels": [ { "id": 1, - "type": "stat", + "type": "gauge", "title": "Workers ready", "datasource": { "type": "prometheus", @@ -29,10 +29,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 18, "thresholds": { "mode": "absolute", "steps": [ @@ -53,19 +51,11 @@ "value": 18 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/18" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -73,12 +63,14 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 2, - "type": "stat", + "type": "gauge", "title": "Control plane ready", "datasource": { "type": "prometheus", @@ -98,10 +90,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 3, "thresholds": { "mode": "absolute", "steps": [ @@ -114,19 +104,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/3" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -134,12 +116,14 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 3, - "type": "stat", + "type": "gauge", "title": "Control plane workloads", "datasource": { "type": "prometheus", @@ -159,10 +143,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -183,18 +165,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -202,7 +177,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -214,7 +191,7 @@ }, { "id": 4, - "type": "stat", + "type": "gauge", "title": "Problem pods", "datasource": { "type": "prometheus", @@ -234,10 +211,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -258,18 +233,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -277,7 +245,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -289,7 +259,7 @@ }, { "id": 5, - "type": "stat", + "type": "gauge", "title": "Stuck terminating", "datasource": { "type": "prometheus", @@ -309,10 +279,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -333,18 +301,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -352,7 +313,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -364,7 +327,7 @@ }, { "id": 6, - "type": "stat", + "type": "gauge", "title": "Running pods", "datasource": { "type": "prometheus", @@ -384,34 +347,25 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 5, "thresholds": { "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "red", + "value": 5 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -419,7 +373,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { @@ -722,7 +678,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +720,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", + "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +762,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1480,22 +1436,6 @@ "targetBlank": true } ] - }, - { - "id": 25, - "type": "text", - "title": "About this dashboard", - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 55 - }, - "datasource": null, - "options": { - "mode": "markdown", - "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." - } } ], "schemaVersion": 39, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f8b40af..6503da9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -18,7 +18,7 @@ data: "panels": [ { "id": 1, - "type": "stat", + "type": "gauge", "title": "Workers ready", "datasource": { "type": "prometheus", @@ -38,10 +38,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 18, "thresholds": { "mode": "absolute", "steps": [ @@ -62,19 +60,11 @@ data: "value": 18 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/18" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -82,12 +72,14 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 2, - "type": "stat", + "type": "gauge", "title": "Control plane ready", "datasource": { "type": "prometheus", @@ -107,10 +99,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 3, "thresholds": { "mode": "absolute", "steps": [ @@ -123,19 +113,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/3" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -143,12 +125,14 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 3, - "type": "stat", + "type": "gauge", "title": "Control plane workloads", "datasource": { "type": "prometheus", @@ -168,10 +152,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -192,18 +174,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -211,7 +186,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -223,7 +200,7 @@ data: }, { "id": 4, - "type": "stat", + "type": "gauge", "title": "Problem pods", "datasource": { "type": "prometheus", @@ -243,10 +220,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -267,18 +242,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -286,7 +254,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -298,7 +268,7 @@ data: }, { "id": 5, - "type": "stat", + "type": "gauge", "title": "Stuck terminating", "datasource": { "type": "prometheus", @@ -318,10 +288,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -342,18 +310,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -361,7 +322,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -373,7 +336,7 @@ data: }, { "id": 6, - "type": "stat", + "type": "gauge", "title": "Running pods", "datasource": { "type": "prometheus", @@ -393,34 +356,25 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 5, "thresholds": { "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "red", + "value": 5 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -428,7 +382,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { @@ -731,7 +687,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +729,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", + "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +771,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1489,22 +1445,6 @@ data: "targetBlank": true } ] - }, - { - "id": 25, - "type": "text", - "title": "About this dashboard", - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 55 - }, - "datasource": null, - "options": { - "mode": "markdown", - "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." - } } ], "schemaVersion": 39,