From c9708a83eaf0f54f8db0a0d2437951c2c61018e9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 29 Jan 2026 06:11:54 -0300 Subject: [PATCH] fix(cluster-state): normalize hottest node label expr --- ariadne/services/cluster_state.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index 14a854a..a0920e9 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -18,6 +18,7 @@ logger = get_logger(__name__) _VALUE_PAIR_LEN = 2 _RATE_WINDOW = "5m" _RESTARTS_WINDOW = "1h" +_NODE_UNAME_LABEL = 'node_uname_info{nodename!=""}' _WORKLOAD_LABEL_KEYS = ( "app.kubernetes.io/name", "app", @@ -1162,22 +1163,22 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]: try: hottest["cpu"] = _vm_topk( f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) ' - '* on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) hottest["ram"] = _vm_topk( - 'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ' - '/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ' + f'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) hottest["net"] = _vm_topk( f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) ' - f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) hottest["io"] = _vm_topk( f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) ' - '* on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) except Exception as exc: