monitoring: attach nodes to net/io stats
This commit is contained in:
parent
a67a6a1f3a
commit
b8998a3c6a
@ -144,6 +144,23 @@ def astreae_free_expr(mount):
|
|||||||
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
|
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
|
||||||
|
|
||||||
|
|
||||||
|
def node_net_expr(scope=""):
    """Build the per-instance network throughput (rx + tx) PromQL expression.

    The 5m receive and transmit byte rates of container series that carry
    non-empty ``namespace`` and ``pod`` labels are added together and summed
    by ``instance``. The resulting expression and ``scope`` are passed to
    ``scoped_node_expr``, whose return value is returned unchanged.
    """
    # NOTE(review): scoped_node_expr is defined outside this view; it
    # presumably maps ``instance`` to a node name and applies ``scope`` --
    # confirm against its definition.
    selector = '{namespace!="" ,pod!=""}'
    rx_rate = f"rate(container_network_receive_bytes_total{selector}[5m])"
    tx_rate = f"rate(container_network_transmit_bytes_total{selector}[5m])"
    return scoped_node_expr(f"sum by (instance) ({rx_rate} + {tx_rate})", scope)
|
||||||
|
|
||||||
|
|
||||||
|
def node_io_expr(scope=""):
    """Build the per-instance disk I/O throughput (read + write) PromQL expression.

    The 5m rates of ``node_disk_read_bytes_total`` and
    ``node_disk_written_bytes_total`` are added together and summed by
    ``instance``. The resulting expression and ``scope`` are passed to
    ``scoped_node_expr``, whose return value is returned unchanged.
    """
    # NOTE(review): scoped_node_expr is defined outside this view; it
    # presumably maps ``instance`` to a node name and applies ``scope`` --
    # confirm against its definition.
    read_rate = "rate(node_disk_read_bytes_total[5m])"
    write_rate = "rate(node_disk_written_bytes_total[5m])"
    return scoped_node_expr(f"sum by (instance) ({read_rate} + {write_rate})", scope)
|
||||||
|
|
||||||
|
|
||||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||||
CRASHLOOP_EXPR = (
|
CRASHLOOP_EXPR = (
|
||||||
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
|
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
|
||||||
@ -185,17 +202,6 @@ NAMESPACE_RAM_EXPR = (
|
|||||||
'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
|
'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
|
||||||
',pod!=""}) by (namespace))'
|
',pod!=""}) by (namespace))'
|
||||||
)
|
)
|
||||||
NET_SERIES_EXPR = (
|
|
||||||
'avg by (node) ('
|
|
||||||
'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
|
|
||||||
'+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
|
|
||||||
)
|
|
||||||
NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})"
|
|
||||||
IO_SERIES_EXPR = (
|
|
||||||
"avg by (node) (rate(node_disk_read_bytes_total[5m]) "
|
|
||||||
"+ rate(node_disk_written_bytes_total[5m]))"
|
|
||||||
)
|
|
||||||
IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})"
|
|
||||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||||
NET_INGRESS_EXPR = (
|
NET_INGRESS_EXPR = (
|
||||||
'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
|
'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
|
||||||
@ -463,8 +469,8 @@ def build_overview():
|
|||||||
hottest = [
|
hottest = [
|
||||||
(7, "Hottest node: CPU", node_cpu_expr(), "percent"),
|
(7, "Hottest node: CPU", node_cpu_expr(), "percent"),
|
||||||
(8, "Hottest node: RAM", node_mem_expr(), "percent"),
|
(8, "Hottest node: RAM", node_mem_expr(), "percent"),
|
||||||
(9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"),
|
(9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"),
|
||||||
(10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"),
|
(10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"),
|
||||||
]
|
]
|
||||||
for idx, (panel_id, title, expr, unit) in enumerate(hottest):
|
for idx, (panel_id, title, expr, unit) in enumerate(hottest):
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -476,7 +482,7 @@ def build_overview():
|
|||||||
unit=unit,
|
unit=unit,
|
||||||
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
||||||
text_mode="name_and_value",
|
text_mode="name_and_value",
|
||||||
legend="{{node}}\\n",
|
legend="{{node}}",
|
||||||
instant=True,
|
instant=True,
|
||||||
links=link_to("atlas-nodes"),
|
links=link_to("atlas-nodes"),
|
||||||
)
|
)
|
||||||
@ -1021,7 +1027,7 @@ def build_network_dashboard():
|
|||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
4,
|
4,
|
||||||
"Per-node throughput",
|
"Per-node throughput",
|
||||||
NET_SERIES_EXPR,
|
node_net_expr(),
|
||||||
{"h": 8, "w": 24, "x": 0, "y": 4},
|
{"h": 8, "w": 24, "x": 0, "y": 4},
|
||||||
unit="Bps",
|
unit="Bps",
|
||||||
legend="{{node}}",
|
legend="{{node}}",
|
||||||
|
|||||||
@ -202,7 +202,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
|
"expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}"
|
"legendFormat": "{{node}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -440,7 +440,7 @@
|
|||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -513,7 +513,7 @@
|
|||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -584,9 +584,9 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
|
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -653,9 +653,9 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
|
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -211,7 +211,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
|
"expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}"
|
"legendFormat": "{{node}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -449,7 +449,7 @@ data:
|
|||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -522,7 +522,7 @@ data:
|
|||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -593,9 +593,9 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
|
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -662,9 +662,9 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
|
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}\\n",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user