fix(cluster-state): normalize hottest node label expr

This commit is contained in:
Brad Stein 2026-01-29 06:11:54 -03:00
parent bdb7cc4fcd
commit c9708a83ea

View File

@ -18,6 +18,7 @@ logger = get_logger(__name__)
_VALUE_PAIR_LEN = 2 _VALUE_PAIR_LEN = 2
_RATE_WINDOW = "5m" _RATE_WINDOW = "5m"
_RESTARTS_WINDOW = "1h" _RESTARTS_WINDOW = "1h"
_NODE_UNAME_LABEL = 'node_uname_info{nodename!=""}'
_WORKLOAD_LABEL_KEYS = ( _WORKLOAD_LABEL_KEYS = (
"app.kubernetes.io/name", "app.kubernetes.io/name",
"app", "app",
@ -1162,22 +1163,22 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
try: try:
hottest["cpu"] = _vm_topk( hottest["cpu"] = _vm_topk(
f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) ' f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
'* on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
hottest["ram"] = _vm_topk( hottest["ram"] = _vm_topk(
'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ' f'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) '
'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', f'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
hottest["net"] = _vm_topk( hottest["net"] = _vm_topk(
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) ' f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
hottest["io"] = _vm_topk( hottest["io"] = _vm_topk(
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) ' f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
'* on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
except Exception as exc: except Exception as exc: