diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 273090a..bf06d40 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -81,6 +81,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" +GAUGE_WIDTHS = [5, 5, 5, 5, 4] # --------------------------------------------------------------------------- # PromQL helpers @@ -262,13 +263,18 @@ TRAEFIK_NET_EGRESS = ( 'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' " or on() vector(0)" ) -NET_TOTAL_EXPR = ( +NET_CLUSTER_RX = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' + " or on() vector(0)" +) +NET_CLUSTER_TX = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) +NET_TOTAL_EXPR = NET_CLUSTER_TX NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS -NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)" +NET_INTERNAL_EXPR = f"clamp_min((({NET_CLUSTER_RX}) + ({NET_CLUSTER_TX})) - (({TRAEFIK_NET_INGRESS}) + ({TRAEFIK_NET_EGRESS})), 0)" # --------------------------------------------------------------------------- # Panel factories @@ -534,6 +540,11 @@ def build_overview(): link_to("atlas-pods"), ), ] + def gauge_grid(idx): + width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 + x = sum(GAUGE_WIDTHS[:idx]) + return width, x + for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None min_value = 0 @@ -577,12 +588,13 @@ def build_overview(): {"color": "red", "value": max_value}, ], } + width, x = gauge_grid(idx) panels.append( gauge_panel( panel_id, title, expr, - {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, + {"h": 5, "w": width, "x": x, "y": 0}, min_value=min_value, max_value=max_value, thresholds=thresholds, @@ -662,7 +674,7 @@ def build_overview(): 14, "Worker node CPU", node_cpu_expr(worker_filter), - {"h": 8, "w": 12, "x": 0, "y": 25}, + {"h": 8, "w": 12, "x": 0, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -676,7 +688,7 @@ def build_overview(): 15, "Worker node RAM", node_mem_expr(worker_filter), - {"h": 8, "w": 12, "x": 12, "y": 25}, + {"h": 8, "w": 12, "x": 12, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -691,7 +703,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 0, "y": 33}, + {"h": 7, "w": 12, "x": 0, "y": 40}, unit="percent", legend="{{node}}", legend_display="table", @@ -703,7 +715,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 12, "y": 33}, + {"h": 7, "w": 12, "x": 12, "y": 40}, unit="percent", legend="{{node}}", legend_display="table", @@ -716,7 +728,7 @@ def build_overview(): 18, "Cluster ingress throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 40}, + {"h": 7, "w": 8, "x": 0, "y": 25}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -729,7 +741,7 @@ def build_overview(): 19, "Cluster egress throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 40}, + {"h": 7, "w": 8, "x": 8, "y": 25}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -742,7 +754,7 @@ def build_overview(): 20, "Intra-cluster throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 40}, + {"h": 7, "w": 8, "x": 16, "y": 25}, unit="Bps", legend="Internal traffic", legend_display="list", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 1baec3a..8a8b8f4 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index eba6466..4cd4b29 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -17,7 +17,7 @@ }, "gridPos": { "h": 5, - "w": 4, + "w": 5, "x": 0, "y": 0 }, @@ -78,8 +78,8 @@ }, "gridPos": { "h": 5, - "w": 4, - "x": 4, + "w": 5, + "x": 5, "y": 0 }, "targets": [ @@ -131,8 +131,8 @@ }, "gridPos": { "h": 5, - "w": 4, - "x": 8, + "w": 5, + "x": 10, "y": 0 }, "targets": [ @@ -199,8 +199,8 @@ }, "gridPos": { "h": 5, - "w": 4, - "x": 12, + "w": 5, + "x": 15, "y": 0 }, "targets": [ @@ -268,7 +268,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 20, "y": 0 }, "targets": [ @@ -1056,7 +1056,7 @@ "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 32 }, "targets": [ { @@ -1103,7 +1103,7 @@ "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 32 }, "targets": [ { @@ -1150,7 +1150,7 @@ "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 40 }, "targets": [ { @@ -1187,7 +1187,7 @@ "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 40 }, "targets": [ { @@ -1224,7 +1224,7 @@ "h": 7, "w": 8, "x": 0, - "y": 40 + "y": 25 }, "targets": [ { @@ -1268,7 +1268,7 @@ "h": 7, "w": 8, "x": 8, - "y": 40 + "y": 25 }, "targets": [ { @@ -1312,11 +1312,11 @@ "h": 7, "w": 8, "x": 16, - "y": 40 + "y": 25 }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index ade7457..1727e6a 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index d20a5a4..99d6d46 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -26,7 +26,7 @@ data: }, "gridPos": { "h": 5, - "w": 4, + "w": 5, "x": 0, "y": 0 }, @@ -87,8 +87,8 @@ data: }, "gridPos": { "h": 5, - "w": 4, - "x": 4, + "w": 5, + "x": 5, "y": 0 }, "targets": [ @@ -140,8 +140,8 @@ data: }, "gridPos": { "h": 5, - "w": 4, - "x": 8, + "w": 5, + "x": 10, "y": 0 }, "targets": [ @@ -208,8 +208,8 @@ data: }, "gridPos": { "h": 5, - "w": 4, - "x": 12, + "w": 5, + "x": 15, "y": 0 }, "targets": [ @@ -277,7 +277,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 20, "y": 0 }, "targets": [ @@ -1065,7 +1065,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 32 }, "targets": [ { @@ -1112,7 +1112,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 32 }, "targets": [ { @@ -1159,7 +1159,7 @@ data: "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 40 }, "targets": [ { @@ -1196,7 +1196,7 @@ data: "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 40 }, "targets": [ { @@ -1233,7 +1233,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 40 + "y": 25 }, "targets": [ { @@ -1277,7 +1277,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 40 + "y": 25 }, "targets": [ { @@ -1321,11 +1321,11 @@ data: "h": 7, "w": 8, "x": 16, - "y": 40 + "y": 25 }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A", "legendFormat": "Internal traffic" }