From beb3243839343349cfe2803ea9a8be634d9fc72c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 23:42:55 -0300 Subject: [PATCH] Revert GPU pie chart additions --- scripts/render_dashboards.py | 66 +++++--------- .../monitoring/dashboards/atlas-overview.json | 88 +++++-------------- .../grafana-dashboard-overview.yaml | 88 +++++-------------- 3 files changed, 67 insertions(+), 175 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index a09eeae..4e8e5a5 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -145,7 +145,7 @@ def astreae_free_expr(mount): def topk_with_node(expr): - return f"topk(1, {expr})" + return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")' def node_net_expr(scope=""): @@ -167,20 +167,12 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" - - -def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -222,20 +214,12 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) -NAMESPACE_GPU_RAW = ( - 'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' -) -NAMESPACE_GPU_RAW = ( - 'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)' -) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9) + (' - + NAMESPACE_GPU_RAW - + ' * 10))' + + ' / 1e9))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -526,32 +510,24 @@ def build_overview(): panels.append( pie_panel( 11, - "Namespace GPU share", - namespace_gpu_share_expr(), - {"h": 9, "w": 8, "x": 0, "y": 10}, + "Namespace CPU share", + namespace_cpu_share_expr(), + {"h": 9, "w": 12, "x": 0, "y": 10}, ) ) panels.append( pie_panel( 12, - "Namespace CPU share", - namespace_cpu_share_expr(), - {"h": 9, "w": 8, "x": 8, "y": 10}, - ) - ) - panels.append( - pie_panel( - 13, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 8, "x": 16, "y": 10}, + {"h": 9, "w": 12, "x": 12, "y": 10}, ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( - 14, + 13, "Worker node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, @@ -565,7 +541,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 15, + 14, "Worker node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, @@ -580,7 +556,7 @@ def build_overview(): panels.append( timeseries_panel( - 16, + 15, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, @@ -592,7 +568,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 17, + 16, "Control plane RAM", node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, @@ -605,7 +581,7 @@ def build_overview(): panels.append( timeseries_panel( - 18, + 17, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, @@ -617,7 +593,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 19, + 18, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, @@ -630,7 +606,7 @@ def build_overview(): panels.append( timeseries_panel( - 20, + 19, "Root filesystem usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 41}, @@ -645,7 +621,7 @@ def build_overview(): ) panels.append( { - "id": 21, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, @@ -679,10 +655,10 @@ def build_overview(): ) storage_panels = [ - (22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -699,7 +675,7 @@ def build_overview(): panels.append( text_panel( - 26, + 25, "About this dashboard", textwrap.dedent( """\ diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 1bb0b53..55c1909 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,7 +438,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -511,7 +511,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -584,7 +584,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -653,7 +653,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -709,20 +709,20 @@ { "id": 11, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, + "w": 12, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -751,20 +751,20 @@ { "id": 12, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, - "x": 8, + "w": 12, + "x": 12, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -792,48 +792,6 @@ }, { "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -880,7 +838,7 @@ ] }, { - "id": 15, + "id": 14, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -927,7 +885,7 @@ ] }, { - "id": 16, + "id": 15, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -964,7 +922,7 @@ } }, { - "id": 17, + "id": 16, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -1001,7 +959,7 @@ } }, { - "id": 18, + "id": 17, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1044,7 +1002,7 @@ ] }, { - "id": 19, + "id": 18, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1087,7 +1045,7 @@ ] }, { - "id": 20, + "id": 19, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1135,7 +1093,7 @@ ] }, { - "id": 21, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1204,7 +1162,7 @@ ] }, { - "id": 22, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1275,7 +1233,7 @@ ] }, { - "id": 23, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1346,7 +1304,7 @@ ] }, { - "id": 24, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1413,7 +1371,7 @@ ] }, { - "id": 25, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1480,7 +1438,7 @@ ] }, { - "id": 26, + "id": 25, "type": "text", "title": "About this dashboard", "gridPos": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f2ef289..deeeacc 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,7 +447,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -520,7 +520,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -593,7 +593,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -662,7 +662,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -718,20 +718,20 @@ data: { "id": 11, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, + "w": 12, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -760,20 +760,20 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, - "x": 8, + "w": 12, + "x": 12, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -801,48 +801,6 @@ data: }, { "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -889,7 +847,7 @@ data: ] }, { - "id": 15, + "id": 14, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -936,7 +894,7 @@ data: ] }, { - "id": 16, + "id": 15, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -973,7 +931,7 @@ data: } }, { - "id": 17, + "id": 16, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -1010,7 +968,7 @@ data: } }, { - "id": 18, + "id": 17, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1053,7 +1011,7 @@ data: ] }, { - "id": 19, + "id": 18, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1096,7 +1054,7 @@ data: ] }, { - "id": 20, + "id": 19, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1144,7 +1102,7 @@ data: ] }, { - "id": 21, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1213,7 +1171,7 @@ data: ] }, { - "id": 22, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1284,7 +1242,7 @@ data: ] }, { - "id": 23, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1355,7 +1313,7 @@ data: ] }, { - "id": 24, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1422,7 +1380,7 @@ data: ] }, { - "id": 25, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1489,7 +1447,7 @@ data: ] }, { - "id": 26, + "id": 25, "type": "text", "title": "About this dashboard", "gridPos": {