snapshot: annotate windows and units

This commit is contained in:
Brad Stein 2026-01-28 20:29:13 -03:00
parent 672ce20967
commit e93b0a7744

View File

@ -16,6 +16,8 @@ from ..utils.logging import get_logger
logger = get_logger(__name__) logger = get_logger(__name__)
_VALUE_PAIR_LEN = 2 _VALUE_PAIR_LEN = 2
_RATE_WINDOW = "5m"
_RESTARTS_WINDOW = "1h"
_WORKLOAD_LABEL_KEYS = ( _WORKLOAD_LABEL_KEYS = (
"app.kubernetes.io/name", "app.kubernetes.io/name",
"app", "app",
@ -449,7 +451,7 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
hottest: dict[str, Any] = {} hottest: dict[str, Any] = {}
try: try:
hottest["cpu"] = _vm_topk( hottest["cpu"] = _vm_topk(
'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) ' f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
@ -459,12 +461,12 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
"node", "node",
) )
hottest["net"] = _vm_topk( hottest["net"] = _vm_topk(
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) ' f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
hottest["io"] = _vm_topk( hottest["io"] = _vm_topk(
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) ' f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")', '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node", "node",
) )
@ -477,7 +479,7 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
usage: dict[str, Any] = {} usage: dict[str, Any] = {}
try: try:
usage["cpu"] = _vm_node_metric( usage["cpu"] = _vm_node_metric(
'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) ' f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))', '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
"node", "node",
) )
@ -487,13 +489,13 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
"node", "node",
) )
usage["net"] = _vm_node_metric( usage["net"] = _vm_node_metric(
'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) ' f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) ' f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))', 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
"node", "node",
) )
usage["io"] = _vm_node_metric( usage["io"] = _vm_node_metric(
'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) ' f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))', '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
"node", "node",
) )
@ -514,13 +516,24 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})") metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})") metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
metrics["top_restarts_1h"] = _vm_vector( metrics["top_restarts_1h"] = _vm_vector(
"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])))" f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
) )
except Exception as exc: except Exception as exc:
errors.append(f"vm: {exc}") errors.append(f"vm: {exc}")
metrics["postgres_connections"] = _postgres_connections(errors) metrics["postgres_connections"] = _postgres_connections(errors)
metrics["hottest_nodes"] = _hottest_nodes(errors) metrics["hottest_nodes"] = _hottest_nodes(errors)
metrics["node_usage"] = _node_usage(errors) metrics["node_usage"] = _node_usage(errors)
metrics["units"] = {
"cpu": "percent",
"ram": "percent",
"net": "bytes_per_sec",
"io": "bytes_per_sec",
"restarts": "count",
}
metrics["windows"] = {
"rates": _RATE_WINDOW,
"restarts": _RESTARTS_WINDOW,
}
return metrics return metrics