From f06be37f44e53c3f80870fca822b421f61e86901 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 16:18:52 -0300 Subject: [PATCH] monitoring: refine network metrics and control-plane allowance --- scripts/render_dashboards.py | 21 ++++++++++++++----- .../monitoring/dashboards/atlas-network.json | 6 +++--- .../monitoring/dashboards/atlas-nodes.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 8 +++---- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-network.yaml | 6 +++--- .../monitoring/grafana-dashboard-nodes.yaml | 2 +- .../grafana-dashboard-overview.yaml | 8 +++---- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 9 files changed, 34 insertions(+), 23 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index bf06d40..33b388d 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -79,7 +79,7 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES) WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" -CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" +CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" GAUGE_WIDTHS = [5, 5, 5, 5, 4] @@ -271,10 +271,21 @@ NET_CLUSTER_TX = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) -NET_TOTAL_EXPR = NET_CLUSTER_TX -NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS -NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS -NET_INTERNAL_EXPR = f"clamp_min((({NET_CLUSTER_RX}) + ({NET_CLUSTER_TX})) - (({TRAEFIK_NET_INGRESS}) + ({TRAEFIK_NET_EGRESS})), 0)" +PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"' +NET_NODE_RX_PHYS = ( + f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)' +) +NET_NODE_TX_PHYS = ( + f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)' +) +NET_TOTAL_EXPR = NET_NODE_TX_PHYS +NET_INGRESS_EXPR = NET_NODE_RX_PHYS +NET_EGRESS_EXPR = NET_NODE_TX_PHYS +NET_INTERNAL_EXPR = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]))' + ' or on() vector(0)' +) # --------------------------------------------------------------------------- # Panel factories diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 8a8b8f4..ca671c8 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index e974d8a..3cf784f 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -142,7 +142,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 4cd4b29..156d96f 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -137,7 +137,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], @@ -1228,7 +1228,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress (Traefik)" } @@ -1272,7 +1272,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress (Traefik)" } @@ -1316,7 +1316,7 @@ }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 8494e89..f519d14 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -200,7 +200,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 1727e6a..fa5b742 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index afbeb3c..c78e994 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -151,7 +151,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 99d6d46..957bb6a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -146,7 +146,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], @@ -1237,7 +1237,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress (Traefik)" } @@ -1281,7 +1281,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress (Traefik)" } @@ -1325,7 +1325,7 @@ data: }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index e160eca..78beca5 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -209,7 +209,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ],