monitoring: show hottest node labels
This commit is contained in:
parent
4aece7e5cb
commit
b28e7501b7
@ -144,12 +144,6 @@ def astreae_free_expr(mount):
|
||||
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
|
||||
|
||||
|
||||
def hottest_stat_expr(inner_expr):
|
||||
return (
|
||||
f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")'
|
||||
)
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||
CRASHLOOP_EXPR = (
|
||||
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
|
||||
@ -467,22 +461,22 @@ def build_overview():
|
||||
)
|
||||
|
||||
hottest = [
|
||||
(7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"),
|
||||
(8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"),
|
||||
(9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"),
|
||||
(10, "Hottest node: I/O", hottest_stat_expr(IO_SERIES_EXPR), "Bps"),
|
||||
(7, "Hottest node: CPU", node_cpu_expr(), "percent"),
|
||||
(8, "Hottest node: RAM", node_mem_expr(), "percent"),
|
||||
(9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"),
|
||||
(10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"),
|
||||
]
|
||||
for idx, (panel_id, title, expr, unit) in enumerate(hottest):
|
||||
panels.append(
|
||||
stat_panel(
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
f"topk(1, {expr})",
|
||||
{"h": 5, "w": 6, "x": 6 * idx, "y": 5},
|
||||
unit=unit,
|
||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||
text_mode="name_and_value",
|
||||
legend=None,
|
||||
legend="{{node}}",
|
||||
instant=True,
|
||||
links=link_to("atlas-nodes"),
|
||||
)
|
||||
|
||||
@ -438,8 +438,9 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -510,8 +511,9 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -569,7 +571,7 @@
|
||||
{
|
||||
"id": 9,
|
||||
"type": "stat",
|
||||
"title": "Hottest node: NET",
|
||||
"title": "Hottest node: NET (rx+tx)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -582,8 +584,9 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -637,7 +640,7 @@
|
||||
{
|
||||
"id": 10,
|
||||
"type": "stat",
|
||||
"title": "Hottest node: I/O",
|
||||
"title": "Hottest node: I/O (r+w)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -650,8 +653,9 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
|
||||
@ -447,8 +447,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -519,8 +520,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -578,7 +580,7 @@ data:
|
||||
{
|
||||
"id": 9,
|
||||
"type": "stat",
|
||||
"title": "Hottest node: NET",
|
||||
"title": "Hottest node: NET (rx+tx)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -591,8 +593,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -646,7 +649,7 @@ data:
|
||||
{
|
||||
"id": 10,
|
||||
"type": "stat",
|
||||
"title": "Hottest node: I/O",
|
||||
"title": "Hottest node: I/O (r+w)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -659,8 +662,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user