From 2ba642d49f92b20057ab914687830c3d6edf449c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 00:11:39 -0300 Subject: [PATCH] monitoring: add gpu pie and tidy net panels --- scripts/render_dashboards.py | 51 ++++-- .../monitoring/dashboards/atlas-overview.json | 157 +++++++++++------- .../grafana-dashboard-overview.yaml | 157 +++++++++++------- 3 files changed, 239 insertions(+), 126 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 4e8e5a5..c194771 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -167,12 +167,20 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" + + +def namespace_gpu_share_expr(): + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -214,12 +222,17 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) +NAMESPACE_GPU_RAW = ( + 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9))' + + ' / 1e9) + ( ' + + NAMESPACE_GPU_RAW + + ' * 10))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -512,22 +525,30 @@ def build_overview(): 11, "Namespace CPU share", namespace_cpu_share_expr(), - {"h": 9, "w": 12, "x": 0, "y": 10}, + {"h": 9, "w": 8, "x": 0, "y": 10}, ) ) panels.append( pie_panel( 12, + "Namespace GPU share", + namespace_gpu_share_expr(), + {"h": 9, "w": 8, "x": 8, "y": 10}, + ) + ) + panels.append( + pie_panel( + 13, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 12, "x": 12, "y": 10}, + {"h": 9, "w": 8, "x": 16, "y": 10}, ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( - 13, + 14, "Worker node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, @@ -541,7 +562,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 14, + 15, "Worker node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, @@ -556,7 +577,7 @@ def build_overview(): panels.append( timeseries_panel( - 15, + 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, @@ -568,7 +589,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 16, + 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, @@ -581,11 +602,12 @@ def build_overview(): panels.append( timeseries_panel( - 17, + 18, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, unit="Bps", + legend="Ingress", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -593,11 +615,12 @@ def build_overview(): ) panels.append( timeseries_panel( - 18, + 19, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, unit="Bps", + legend="Egress", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -606,7 +629,7 @@ def build_overview(): panels.append( timeseries_panel( - 19, + 20, "Root filesystem usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 41}, @@ -621,12 +644,12 @@ def build_overview(): ) panels.append( { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, - "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}], + "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], "fieldConfig": { "defaults": { "unit": "percent", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 55c1909..0b2f69f 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -716,13 +716,13 @@ }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -751,20 +751,20 @@ { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -792,6 +792,48 @@ }, { "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -838,7 +880,7 @@ ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -885,7 +927,7 @@ ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -922,7 +964,7 @@ } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -959,7 +1001,7 @@ } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -975,50 +1017,8 @@ "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" + "refId": "A", + "legendFormat": "Ingress" } ], "fieldConfig": { @@ -1047,6 +1047,50 @@ { "id": 19, "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -1093,7 +1137,7 @@ ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1109,7 +1153,8 @@ "targets": [ { "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index deeeacc..0ac79db 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -725,13 +725,13 @@ data: }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -760,20 +760,20 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -801,6 +801,48 @@ data: }, { "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -847,7 +889,7 @@ data: ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -894,7 +936,7 @@ data: ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -931,7 +973,7 @@ data: } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -968,7 +1010,7 @@ data: } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -984,50 +1026,8 @@ data: "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" + "refId": "A", + "legendFormat": "Ingress" } ], "fieldConfig": { @@ -1056,6 +1056,50 @@ data: { "id": 19, "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -1102,7 +1146,7 @@ data: ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1118,7 +1162,8 @@ data: "targets": [ { "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": {