From e93b0a7744886e106c7f483fc12224baa902bbf9 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Wed, 28 Jan 2026 20:29:13 -0300
Subject: [PATCH] snapshot: annotate windows and units

---
Review note: in the hottest["net"] hunk the second added literal is an
f-string, so the braces around nodename!="" are doubled. Left single,
Python evaluates `nodename != ""` as a replacement field and raises
NameError instead of sending the PromQL selector.

 ariadne/services/cluster_state.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py
index 502537d..8683221 100644
--- a/ariadne/services/cluster_state.py
+++ b/ariadne/services/cluster_state.py
@@ -16,6 +16,8 @@ from ..utils.logging import get_logger
 logger = get_logger(__name__)
 
 _VALUE_PAIR_LEN = 2
+_RATE_WINDOW = "5m"
+_RESTARTS_WINDOW = "1h"
 _WORKLOAD_LABEL_KEYS = (
     "app.kubernetes.io/name",
     "app",
@@ -449,7 +451,7 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
     hottest: dict[str, Any] = {}
     try:
         hottest["cpu"] = _vm_topk(
-            'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
+            f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
             '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
             "node",
         )
@@ -459,12 +461,12 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
             "node",
         )
         hottest["net"] = _vm_topk(
-            'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
-            '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
+            f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
+            f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
             "node",
         )
         hottest["io"] = _vm_topk(
-            'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
+            f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
             '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
             "node",
         )
@@ -477,7 +479,7 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
     usage: dict[str, Any] = {}
     try:
         usage["cpu"] = _vm_node_metric(
-            'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
+            f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
             '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
             "node",
         )
@@ -487,13 +489,13 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
             "node",
         )
         usage["net"] = _vm_node_metric(
-            'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
-            '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) '
+            f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
+            f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
             'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
             "node",
         )
         usage["io"] = _vm_node_metric(
-            'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
+            f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
             '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
             "node",
         )
@@ -514,13 +516,24 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
         metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
         metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
         metrics["top_restarts_1h"] = _vm_vector(
-            "topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])))"
+            f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
         )
     except Exception as exc:
         errors.append(f"vm: {exc}")
     metrics["postgres_connections"] = _postgres_connections(errors)
     metrics["hottest_nodes"] = _hottest_nodes(errors)
     metrics["node_usage"] = _node_usage(errors)
+    metrics["units"] = {
+        "cpu": "percent",
+        "ram": "percent",
+        "net": "bytes_per_sec",
+        "io": "bytes_per_sec",
+        "restarts": "count",
+    }
+    metrics["windows"] = {
+        "rates": _RATE_WINDOW,
+        "restarts": _RESTARTS_WINDOW,
+    }
     return metrics
 
 