diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index 14a854a..a0920e9 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -18,6 +18,7 @@ logger = get_logger(__name__) _VALUE_PAIR_LEN = 2 _RATE_WINDOW = "5m" _RESTARTS_WINDOW = "1h" +_NODE_UNAME_LABEL = 'node_uname_info{nodename!=""}' _WORKLOAD_LABEL_KEYS = ( "app.kubernetes.io/name", "app", @@ -1162,22 +1163,22 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]: try: hottest["cpu"] = _vm_topk( f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) ' - '* on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) hottest["ram"] = _vm_topk( - 'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ' - '/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ' + f'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) hottest["net"] = _vm_topk( f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) ' - f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) hottest["io"] = _vm_topk( f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) ' - '* on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', + f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', "node", ) except Exception as exc: