From b28e7501b72d52ea5101f11c80e29fcc6946be14 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 20:00:40 -0300 Subject: [PATCH] monitoring: show hottest node labels --- scripts/render_dashboards.py | 18 ++++++------------ .../monitoring/dashboards/atlas-overview.json | 16 ++++++++++------ .../monitoring/grafana-dashboard-overview.yaml | 16 ++++++++++------ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index acc1c38..e215ca8 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,12 +144,6 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" -def hottest_stat_expr(inner_expr): - return ( - f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")' - ) - - PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -467,22 +461,22 @@ def build_overview(): ) hottest = [ - (7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"), - (8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"), - (9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"), - (10, "Hottest node: I/O", hottest_stat_expr(IO_SERIES_EXPR), "Bps"), + (7, "Hottest node: CPU", node_cpu_expr(), "percent"), + (8, "Hottest node: RAM", node_mem_expr(), "percent"), + (9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"), + (10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( stat_panel( panel_id, title, - expr, + f"topk(1, {expr})", {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend=None, + legend="{{node}}", instant=True, links=link_to("atlas-nodes"), ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ea4e40e..468ca8a 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,8 +438,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -510,8 +511,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -569,7 +571,7 @@ { "id": 9, "type": "stat", - "title": "Hottest node: NET", + "title": "Hottest node: NET (rx+tx)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -582,8 +584,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -637,7 +640,7 @@ { "id": 10, "type": "stat", - "title": "Hottest node: I/O", + "title": "Hottest node: I/O (r+w)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -650,8 +653,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 1df2956..dbcc916 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,8 +447,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -519,8 +520,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -578,7 +580,7 @@ data: { "id": 9, "type": "stat", - "title": "Hottest node: NET", + "title": "Hottest node: NET (rx+tx)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -591,8 +593,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -646,7 +649,7 @@ data: { "id": 10, "type": "stat", - "title": "Hottest node: I/O", + "title": "Hottest node: I/O (r+w)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -659,8 +662,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ],