monitoring: attach nodes to net/io stats

This commit is contained in:
Brad Stein 2025-11-17 20:14:11 -03:00
parent a67a6a1f3a
commit b8998a3c6a
5 changed files with 35 additions and 29 deletions

View File

@ -144,6 +144,23 @@ def astreae_free_expr(mount):
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
def node_net_expr(scope=""):
    """Return the PromQL expression for per-instance network throughput.

    The inner expression adds the 5-minute receive and transmit byte rates
    of container network series that carry a non-empty ``namespace`` and
    ``pod`` label, summed per ``instance``. The result is handed to
    ``scoped_node_expr`` together with *scope*.

    NOTE(review): ``scoped_node_expr`` is defined elsewhere in this file;
    presumably it maps ``instance`` onto node names and applies *scope* --
    confirm that the container series' ``instance`` label matches whatever
    that helper joins on.
    """
    # Keep the selector byte-for-byte (including the space before the comma)
    # so the generated dashboards do not churn.
    pod_selector = '{namespace!="" ,pod!=""}'
    rx_rate = f"rate(container_network_receive_bytes_total{pod_selector}[5m])"
    tx_rate = f"rate(container_network_transmit_bytes_total{pod_selector}[5m])"
    return scoped_node_expr(f"sum by (instance) ({rx_rate} + {tx_rate})", scope)
def node_io_expr(scope=""):
    """Return the PromQL expression for per-instance disk I/O throughput.

    The inner expression adds the 5-minute rates of
    ``node_disk_read_bytes_total`` and ``node_disk_written_bytes_total``,
    summed per ``instance``. The result is handed to ``scoped_node_expr``
    together with *scope*.

    NOTE(review): ``scoped_node_expr`` is defined elsewhere in this file;
    how *scope* is interpreted is decided there -- verify against that
    helper.
    """
    read_rate = "rate(node_disk_read_bytes_total[5m])"
    write_rate = "rate(node_disk_written_bytes_total[5m])"
    return scoped_node_expr(f"sum by (instance) ({read_rate} + {write_rate})", scope)
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
CRASHLOOP_EXPR = (
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@ -185,17 +202,6 @@ NAMESPACE_RAM_EXPR = (
'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
',pod!=""}) by (namespace))'
)
NET_SERIES_EXPR = (
'avg by (node) ('
'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
'+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
)
NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})"
IO_SERIES_EXPR = (
"avg by (node) (rate(node_disk_read_bytes_total[5m]) "
"+ rate(node_disk_written_bytes_total[5m]))"
)
IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})"
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
NET_INGRESS_EXPR = (
'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
@ -463,8 +469,8 @@ def build_overview():
hottest = [
(7, "Hottest node: CPU", node_cpu_expr(), "percent"),
(8, "Hottest node: RAM", node_mem_expr(), "percent"),
(9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"),
(10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"),
(9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"),
(10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"),
]
for idx, (panel_id, title, expr, unit) in enumerate(hottest):
panels.append(
@ -476,7 +482,7 @@ def build_overview():
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
text_mode="name_and_value",
legend="{{node}}\\n",
legend="{{node}}",
instant=True,
links=link_to("atlas-nodes"),
)
@ -1021,7 +1027,7 @@ def build_network_dashboard():
timeseries_panel(
4,
"Per-node throughput",
NET_SERIES_EXPR,
node_net_expr(),
{"h": 8, "w": 24, "x": 0, "y": 4},
unit="Bps",
legend="{{node}}",

View File

@ -202,7 +202,7 @@
},
"targets": [
{
"expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
"expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -440,7 +440,7 @@
{
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],
@ -513,7 +513,7 @@
{
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],
@ -584,9 +584,9 @@
},
"targets": [
{
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],
@ -653,9 +653,9 @@
},
"targets": [
{
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],

View File

@ -211,7 +211,7 @@ data:
},
"targets": [
{
"expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
"expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -449,7 +449,7 @@ data:
{
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],
@ -522,7 +522,7 @@ data:
{
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],
@ -593,9 +593,9 @@ data:
},
"targets": [
{
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],
@ -662,9 +662,9 @@ data:
},
"targets": [
{
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}\\n",
"legendFormat": "{{node}}",
"instant": true
}
],