monitoring: attach nodes to net/io stats
This commit is contained in:
parent
a67a6a1f3a
commit
b8998a3c6a
@ -144,6 +144,23 @@ def astreae_free_expr(mount):
|
||||
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
|
||||
|
||||
|
||||
def node_net_expr(scope=""):
    """Return the per-node network throughput (rx + tx) PromQL expression.

    The inner query adds the 5-minute receive and transmit byte rates of
    container network series that carry non-empty ``namespace`` and ``pod``
    labels, and sums the result per ``instance``.

    Args:
        scope: Forwarded unchanged to ``scoped_node_expr``; defaults to "".

    Returns:
        Whatever ``scoped_node_expr`` returns for the inner query and scope.
    """
    # Both sides share the same selector; the space before ",pod" is kept
    # as-is so the generated expression text does not change.
    rx_rate = 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])'
    tx_rate = 'rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])'
    per_instance = f"sum by (instance) ({rx_rate} + {tx_rate})"
    # NOTE(review): scoped_node_expr is defined outside this view. The
    # generated dashboards suggest it joins on(instance) against
    # node_uname_info to attach a `node` label -- confirm there.
    return scoped_node_expr(per_instance, scope)
|
||||
|
||||
|
||||
def node_io_expr(scope=""):
    """Return the per-node disk I/O (read + write) PromQL expression.

    The inner query adds the 5-minute rates of ``node_disk_read_bytes_total``
    and ``node_disk_written_bytes_total`` and sums the result per
    ``instance``.

    Args:
        scope: Forwarded unchanged to ``scoped_node_expr``; defaults to "".

    Returns:
        Whatever ``scoped_node_expr`` returns for the inner query and scope.
    """
    read_rate = "rate(node_disk_read_bytes_total[5m])"
    write_rate = "rate(node_disk_written_bytes_total[5m])"
    per_instance = "sum by (instance) (" + read_rate + " + " + write_rate + ")"
    # NOTE(review): scoped_node_expr is defined outside this view; it is
    # presumably what maps `instance` onto a `node` label -- confirm there.
    return scoped_node_expr(per_instance, scope)
|
||||
|
||||
|
||||
# PromQL: for each (namespace, pod) take the max of kube_pod_status_phase over
# series whose phase is neither Running nor Succeeded, then sum those maxima
# into a single value.
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||
CRASHLOOP_EXPR = (
|
||||
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
|
||||
@ -185,17 +202,6 @@ NAMESPACE_RAM_EXPR = (
|
||||
'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
|
||||
',pod!=""}) by (namespace))'
|
||||
)
|
||||
NET_SERIES_EXPR = (
|
||||
'avg by (node) ('
|
||||
'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
|
||||
'+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
|
||||
)
|
||||
NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})"
|
||||
IO_SERIES_EXPR = (
|
||||
"avg by (node) (rate(node_disk_read_bytes_total[5m]) "
|
||||
"+ rate(node_disk_written_bytes_total[5m]))"
|
||||
)
|
||||
IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})"
|
||||
# PromQL: 5-minute rate of traefik_router_requests_total, summed per `router` label.
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
NET_INGRESS_EXPR = (
|
||||
'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
|
||||
@ -463,8 +469,8 @@ def build_overview():
|
||||
hottest = [
|
||||
(7, "Hottest node: CPU", node_cpu_expr(), "percent"),
|
||||
(8, "Hottest node: RAM", node_mem_expr(), "percent"),
|
||||
(9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"),
|
||||
(10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"),
|
||||
(9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"),
|
||||
(10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"),
|
||||
]
|
||||
for idx, (panel_id, title, expr, unit) in enumerate(hottest):
|
||||
panels.append(
|
||||
@ -476,7 +482,7 @@ def build_overview():
|
||||
unit=unit,
|
||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||
text_mode="name_and_value",
|
||||
legend="{{node}}\\n",
|
||||
legend="{{node}}",
|
||||
instant=True,
|
||||
links=link_to("atlas-nodes"),
|
||||
)
|
||||
@ -1021,7 +1027,7 @@ def build_network_dashboard():
|
||||
timeseries_panel(
|
||||
4,
|
||||
"Per-node throughput",
|
||||
NET_SERIES_EXPR,
|
||||
node_net_expr(),
|
||||
{"h": 8, "w": 24, "x": 0, "y": 4},
|
||||
unit="Bps",
|
||||
legend="{{node}}",
|
||||
|
||||
@ -202,7 +202,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
|
||||
"expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
|
||||
@ -440,7 +440,7 @@
|
||||
{
|
||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -513,7 +513,7 @@
|
||||
{
|
||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -584,9 +584,9 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
|
||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -653,9 +653,9 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
|
||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
|
||||
@ -211,7 +211,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
|
||||
"expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
|
||||
@ -449,7 +449,7 @@ data:
|
||||
{
|
||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -522,7 +522,7 @@ data:
|
||||
{
|
||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -593,9 +593,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
|
||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
@ -662,9 +662,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
|
||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}\\n",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user