snapshot: annotate windows and units

This commit is contained in:
Brad Stein 2026-01-28 20:29:13 -03:00
parent 672ce20967
commit e93b0a7744

View File

@ -16,6 +16,8 @@ from ..utils.logging import get_logger
logger = get_logger(__name__)
_VALUE_PAIR_LEN = 2
_RATE_WINDOW = "5m"
_RESTARTS_WINDOW = "1h"
_WORKLOAD_LABEL_KEYS = (
"app.kubernetes.io/name",
"app",
@ -449,7 +451,7 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
hottest: dict[str, Any] = {}
try:
hottest["cpu"] = _vm_topk(
'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
@ -459,12 +461,12 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
"node",
)
hottest["net"] = _vm_topk(
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
hottest["io"] = _vm_topk(
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
@ -477,7 +479,7 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
usage: dict[str, Any] = {}
try:
usage["cpu"] = _vm_node_metric(
'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
"node",
)
@ -487,13 +489,13 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
"node",
)
usage["net"] = _vm_node_metric(
'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) '
f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
"node",
)
usage["io"] = _vm_node_metric(
'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
"node",
)
@ -514,13 +516,24 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
metrics["top_restarts_1h"] = _vm_vector(
"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])))"
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
except Exception as exc:
errors.append(f"vm: {exc}")
metrics["postgres_connections"] = _postgres_connections(errors)
metrics["hottest_nodes"] = _hottest_nodes(errors)
metrics["node_usage"] = _node_usage(errors)
metrics["units"] = {
"cpu": "percent",
"ram": "percent",
"net": "bytes_per_sec",
"io": "bytes_per_sec",
"restarts": "count",
}
metrics["windows"] = {
"rates": _RATE_WINDOW,
"restarts": _RESTARTS_WINDOW,
}
return metrics