From b8998a3c6ab81493b52ebf18abc21a78ad6c01e9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 20:14:11 -0300 Subject: [PATCH] monitoring: attach nodes to net/io stats --- scripts/render_dashboards.py | 36 +++++++++++-------- .../monitoring/dashboards/atlas-network.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 12 +++---- .../monitoring/grafana-dashboard-network.yaml | 2 +- .../grafana-dashboard-overview.yaml | 12 +++---- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 4f25ab5..37f2607 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,6 +144,23 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +def node_net_expr(scope=""): + base = ( + 'sum by (instance) (' + 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' + ) + return scoped_node_expr(base, scope) + + +def node_io_expr(scope=""): + base = ( + "sum by (instance) (rate(node_disk_read_bytes_total[5m]) " + "+ rate(node_disk_written_bytes_total[5m]))" + ) + return scoped_node_expr(base, scope) + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -185,17 +202,6 @@ NAMESPACE_RAM_EXPR = ( 'topk(10, sum(container_memory_working_set_bytes{namespace!=""' ',pod!=""}) by (namespace))' ) -NET_SERIES_EXPR = ( - 'avg by (node) (' - 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' - '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' -) -NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})" -IO_SERIES_EXPR = ( - "avg by (node) (rate(node_disk_read_bytes_total[5m]) " - "+ rate(node_disk_written_bytes_total[5m]))" -) -IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' @@ -463,8 +469,8 @@ def build_overview(): hottest = [ (7, "Hottest node: CPU", node_cpu_expr(), "percent"), (8, "Hottest node: RAM", node_mem_expr(), "percent"), - (9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"), - (10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"), + (9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"), + (10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( @@ -476,7 +482,7 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend="{{node}}\\n", + legend="{{node}}", instant=True, links=link_to("atlas-nodes"), ) @@ -1021,7 +1027,7 @@ def build_network_dashboard(): timeseries_panel( 4, "Per-node throughput", - NET_SERIES_EXPR, + node_net_expr(), {"h": 8, "w": 24, "x": 0, "y": 4}, unit="Bps", legend="{{node}}", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index abd9da7..42026eb 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -202,7 +202,7 @@ }, "targets": [ { - "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ad3a947..be5dead 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -440,7 +440,7 @@ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -513,7 +513,7 @@ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -584,9 +584,9 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -653,9 +653,9 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 8f614ae..8b5d50d 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -211,7 +211,7 @@ data: }, "targets": [ { - "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 5f3062a..26e0454 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -449,7 +449,7 @@ data: { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -522,7 +522,7 @@ data: { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -593,9 +593,9 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -662,9 +662,9 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ],