From 4aece7e5cb5e0972fb7419eaaae6ee02ef64909e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:56:57 -0300 Subject: [PATCH] monitoring: fix hottest node labels --- scripts/render_dashboards.py | 20 +++++++++------- .../monitoring/dashboards/atlas-overview.json | 24 +++++++------------ .../grafana-dashboard-overview.yaml | 24 +++++++------------ 3 files changed, 27 insertions(+), 41 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index a9c319a..acc1c38 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,6 +144,12 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +def hottest_stat_expr(inner_expr): + return ( + f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")' + ) + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -224,7 +230,6 @@ def stat_panel( instant=False, value_suffix=None, links=None, - display_name=None, ): """Return a Grafana stat panel definition.""" defaults = { @@ -243,8 +248,6 @@ def stat_panel( } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix - if display_name: - defaults["displayName"] = display_name panel = { "id": panel_id, "type": "stat", @@ -464,10 +467,10 @@ def build_overview(): ) hottest = [ - (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"), - (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"), - (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"), - (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"), + (7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"), + (8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"), + (9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"), + (10, "Hottest node: I/O", hottest_stat_expr(IO_SERIES_EXPR), "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( @@ -479,9 +482,8 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend="{{node}}", + legend=None, instant=True, - display_name="{{__field.labels.node}}\\n", links=link_to("atlas-nodes"), ) ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index f0cceaf..ea4e40e 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,9 +438,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -470,8 +469,7 @@ "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -512,9 +510,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -544,8 +541,7 @@ "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -586,9 +582,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -614,8 +609,7 @@ "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -656,9 +650,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -684,8 +677,7 @@ "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 1839d8f..1df2956 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,9 +447,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -479,8 +478,7 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -521,9 +519,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -553,8 +550,7 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -595,9 +591,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -623,8 +618,7 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -665,9 +659,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -693,8 +686,7 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] },