snapshot: annotate windows and units
This commit is contained in:
parent
672ce20967
commit
e93b0a7744
@ -16,6 +16,8 @@ from ..utils.logging import get_logger
|
|||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
_VALUE_PAIR_LEN = 2
|
_VALUE_PAIR_LEN = 2
|
||||||
|
_RATE_WINDOW = "5m"
|
||||||
|
_RESTARTS_WINDOW = "1h"
|
||||||
_WORKLOAD_LABEL_KEYS = (
|
_WORKLOAD_LABEL_KEYS = (
|
||||||
"app.kubernetes.io/name",
|
"app.kubernetes.io/name",
|
||||||
"app",
|
"app",
|
||||||
@ -449,7 +451,7 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
|
|||||||
hottest: dict[str, Any] = {}
|
hottest: dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
hottest["cpu"] = _vm_topk(
|
hottest["cpu"] = _vm_topk(
|
||||||
'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
|
f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
|
||||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
@ -459,12 +461,12 @@ def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
|
|||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
hottest["net"] = _vm_topk(
|
hottest["net"] = _vm_topk(
|
||||||
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
|
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
|
||||||
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace(node_uname_info{{nodename!=""}}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
hottest["io"] = _vm_topk(
|
hottest["io"] = _vm_topk(
|
||||||
'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
|
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
|
||||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
@ -477,7 +479,7 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
|||||||
usage: dict[str, Any] = {}
|
usage: dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
usage["cpu"] = _vm_node_metric(
|
usage["cpu"] = _vm_node_metric(
|
||||||
'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) '
|
f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
|
||||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
@ -487,13 +489,13 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
|||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
usage["net"] = _vm_node_metric(
|
usage["net"] = _vm_node_metric(
|
||||||
'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
|
f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
|
||||||
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) * on(instance) group_left(node) '
|
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
|
||||||
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
usage["io"] = _vm_node_metric(
|
usage["io"] = _vm_node_metric(
|
||||||
'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) '
|
f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
|
||||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
@ -514,13 +516,24 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
|
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
|
||||||
metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
|
metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
|
||||||
metrics["top_restarts_1h"] = _vm_vector(
|
metrics["top_restarts_1h"] = _vm_vector(
|
||||||
"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])))"
|
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"vm: {exc}")
|
errors.append(f"vm: {exc}")
|
||||||
metrics["postgres_connections"] = _postgres_connections(errors)
|
metrics["postgres_connections"] = _postgres_connections(errors)
|
||||||
metrics["hottest_nodes"] = _hottest_nodes(errors)
|
metrics["hottest_nodes"] = _hottest_nodes(errors)
|
||||||
metrics["node_usage"] = _node_usage(errors)
|
metrics["node_usage"] = _node_usage(errors)
|
||||||
|
metrics["units"] = {
|
||||||
|
"cpu": "percent",
|
||||||
|
"ram": "percent",
|
||||||
|
"net": "bytes_per_sec",
|
||||||
|
"io": "bytes_per_sec",
|
||||||
|
"restarts": "count",
|
||||||
|
}
|
||||||
|
metrics["windows"] = {
|
||||||
|
"rates": _RATE_WINDOW,
|
||||||
|
"restarts": _RESTARTS_WINDOW,
|
||||||
|
}
|
||||||
return metrics
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user