snapshot: annotate windows and units
This commit is contained in:
parent
672ce20967
commit
e93b0a7744
@ -16,6 +16,8 @@ from ..utils.logging import get_logger
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_VALUE_PAIR_LEN = 2
|
||||
_RATE_WINDOW = "5m"
|
||||
_RESTARTS_WINDOW = "1h"
|
||||
_WORKLOAD_LABEL_KEYS = (
|
||||
"app.kubernetes.io/name",
|
||||
"app",
|
||||
@ -449,7 +451,7 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
|
||||
hottest: dict[str, Any] = {}
|
||||
try:
|
||||
hottest["cpu"] = _vm_topk(
|
||||
'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
|
||||
f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
|
||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||
"node",
|
||||
)
|
||||
@ -459,12 +461,12 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
|
||||
"node",
|
||||
)
|
||||
hottest["net"] = _vm_topk(
|
||||
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
|
||||
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
|
||||
# PromQL's literal braces must be doubled inside an f-string; a single-brace {nodename!=""} would be evaluated as the Python expression `nodename != ""`.
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||
"node",
|
||||
)
|
||||
hottest["io"] = _vm_topk(
|
||||
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
|
||||
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
|
||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||
"node",
|
||||
)
|
||||
@ -477,7 +479,7 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
||||
usage: dict[str, Any] = {}
|
||||
try:
|
||||
usage["cpu"] = _vm_node_metric(
|
||||
'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
|
||||
f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
|
||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||
"node",
|
||||
)
|
||||
@ -487,13 +489,13 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
||||
"node",
|
||||
)
|
||||
usage["net"] = _vm_node_metric(
|
||||
'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
|
||||
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) '
|
||||
f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
|
||||
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
|
||||
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||
"node",
|
||||
)
|
||||
usage["io"] = _vm_node_metric(
|
||||
'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
|
||||
f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
|
||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||
"node",
|
||||
)
|
||||
@ -514,13 +516,24 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
|
||||
metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
|
||||
metrics["top_restarts_1h"] = _vm_vector(
|
||||
"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])))"
|
||||
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"vm: {exc}")
|
||||
metrics["postgres_connections"] = _postgres_connections(errors)
|
||||
metrics["hottest_nodes"] = _hottest_nodes(errors)
|
||||
metrics["node_usage"] = _node_usage(errors)
|
||||
metrics["units"] = {
|
||||
"cpu": "percent",
|
||||
"ram": "percent",
|
||||
"net": "bytes_per_sec",
|
||||
"io": "bytes_per_sec",
|
||||
"restarts": "count",
|
||||
}
|
||||
metrics["windows"] = {
|
||||
"rates": _RATE_WINDOW,
|
||||
"restarts": _RESTARTS_WINDOW,
|
||||
}
|
||||
return metrics
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user