From bf6179f907bd08bb141bf3a2e5b84ad8fa9c17ae Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 12 Dec 2025 18:00:43 -0300 Subject: [PATCH] atlas internal dashboards: add SLO/burn and api health panels --- scripts/dashboards_render_atlas.py | 201 ++++++++-- .../monitoring/dashboards/atlas-network.json | 357 ++++++++++++++---- .../monitoring/dashboards/atlas-nodes.json | 217 ++++++++++- .../monitoring/grafana-dashboard-network.yaml | 357 ++++++++++++++---- .../monitoring/grafana-dashboard-nodes.yaml | 217 ++++++++++- 5 files changed, 1171 insertions(+), 178 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index a895bd8..f4fb8cb 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -327,6 +327,34 @@ NET_INTERNAL_EXPR = ( '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))' ' or on() vector(0)' ) +APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))' +APISERVER_P99_LATENCY_MS = ( + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" +) +ETCD_P99_LATENCY_MS = ( + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" +) +TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))" +TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))' +TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)" +TRAEFIK_P99_LATENCY_MS = ( + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" +) +TRAEFIK_P95_LATENCY_MS = ( + "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" +) +SLO_AVAILABILITY = 0.999 + + +def traefik_sli(window): + total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))' + success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))' + return f"({success}) / clamp_min({total}, 1)" + + +def traefik_burn(window): + sli = traefik_sli(window) + return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}" # --------------------------------------------------------------------------- # Panel factories @@ -1067,12 +1095,69 @@ def build_nodes_dashboard(): {"h": 4, "w": 8, "x": 16, "y": 0}, ) ) + panels.append( + stat_panel( + 9, + "API Server 5xx rate", + APISERVER_5XX_RATE, + {"h": 4, "w": 8, "x": 0, "y": 4}, + unit="req/s", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 0.05}, + {"color": "orange", "value": 0.2}, + {"color": "red", "value": 0.5}, + ], + }, + decimals=3, + ) + ) + panels.append( + stat_panel( + 10, + "API Server P99 latency", + APISERVER_P99_LATENCY_MS, + {"h": 4, "w": 8, "x": 8, "y": 4}, + unit="ms", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 250}, + {"color": "orange", "value": 400}, + {"color": "red", "value": 600}, + ], + }, + decimals=1, + ) + ) + panels.append( + stat_panel( + 11, + "etcd P99 latency", + ETCD_P99_LATENCY_MS, + {"h": 4, "w": 8, "x": 16, "y": 4}, + unit="ms", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 50}, + {"color": "orange", "value": 100}, + {"color": "red", "value": 200}, + ], + }, + decimals=1, + ) + ) panels.append( timeseries_panel( 4, "Node CPU", node_cpu_expr(), - {"h": 9, "w": 24, "x": 0, "y": 4}, + {"h": 9, "w": 24, "x": 0, "y": 8}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1085,7 +1170,7 @@ def build_nodes_dashboard(): 5, "Node RAM", node_mem_expr(), - {"h": 9, "w": 24, "x": 0, "y": 13}, + {"h": 9, "w": 24, "x": 0, "y": 17}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1098,7 +1183,7 @@ def build_nodes_dashboard(): 6, "Control Plane (incl. titan-db) CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 9, "w": 12, "x": 0, "y": 22}, + {"h": 9, "w": 12, "x": 0, "y": 26}, unit="percent", legend="{{node}}", legend_display="table", @@ -1110,7 +1195,7 @@ def build_nodes_dashboard(): 7, "Control Plane (incl. titan-db) RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 9, "w": 12, "x": 12, "y": 22}, + {"h": 9, "w": 12, "x": 12, "y": 26}, unit="percent", legend="{{node}}", legend_display="table", @@ -1122,7 +1207,7 @@ def build_nodes_dashboard(): 8, "Root Filesystem Usage", root_usage_expr(), - {"h": 9, "w": 24, "x": 0, "y": 31}, + {"h": 9, "w": 24, "x": 0, "y": 35}, unit="percent", legend="{{node}}", legend_display="table", @@ -1249,43 +1334,107 @@ def build_network_dashboard(): panels.append( stat_panel( 1, - "Ingress Traffic", - NET_INGRESS_EXPR, - {"h": 4, "w": 8, "x": 0, "y": 0}, - unit="Bps", + "Ingress Success Rate (5m)", + TRAEFIK_SLI_5M, + {"h": 4, "w": 6, "x": 0, "y": 0}, + unit="percentunit", + decimals=2, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 0.995}, + {"color": "yellow", "value": 0.999}, + {"color": "green", "value": 0.9995}, + ], + }, ) ) panels.append( stat_panel( 2, - "Egress Traffic", - NET_EGRESS_EXPR, - {"h": 4, "w": 8, "x": 8, "y": 0}, - unit="Bps", + "Error Budget Burn (1h)", + traefik_burn("1h"), + {"h": 4, "w": 6, "x": 6, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 2}, + {"color": "red", "value": 4}, + ], + }, + decimals=2, ) ) panels.append( stat_panel( 3, - "Intra-Cluster Traffic", - NET_INTERNAL_EXPR, - {"h": 4, "w": 8, "x": 16, "y": 0}, - unit="Bps", + "Error Budget Burn (6h)", + traefik_burn("6h"), + {"h": 4, "w": 6, "x": 12, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 2}, + {"color": "red", "value": 4}, + ], + }, + decimals=2, ) ) panels.append( stat_panel( 4, - "Top Router req/s", - f"topk(1, {TRAEFIK_ROUTER_EXPR})", + "Edge P99 Latency (ms)", + TRAEFIK_P99_LATENCY_MS, + {"h": 4, "w": 6, "x": 18, "y": 0}, + unit="ms", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 200}, + {"color": "orange", "value": 350}, + {"color": "red", "value": 500}, + ], + }, + decimals=1, + ) + ) + panels.append( + stat_panel( + 5, + "Ingress Traffic", + NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 4}, - unit="req/s", - legend="{{router}}", + unit="Bps", + ) + ) + panels.append( + stat_panel( + 6, + "Egress Traffic", + NET_EGRESS_EXPR, + {"h": 4, "w": 8, "x": 8, "y": 4}, + unit="Bps", + ) + ) + panels.append( + stat_panel( + 7, + "Intra-Cluster Traffic", + NET_INTERNAL_EXPR, + {"h": 4, "w": 8, "x": 16, "y": 4}, + unit="Bps", ) ) panels.append( timeseries_panel( - 5, + 8, "Per-Node Throughput", f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', {"h": 8, "w": 24, "x": 0, "y": 8}, @@ -1297,7 +1446,7 @@ def build_network_dashboard(): ) panels.append( table_panel( - 6, + 9, "Top Namespaces", 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', @@ -1308,7 +1457,7 @@ def build_network_dashboard(): ) panels.append( table_panel( - 7, + 10, "Top Pods", 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', @@ -1319,7 +1468,7 @@ def build_network_dashboard(): ) panels.append( timeseries_panel( - 8, + 11, "Traefik Routers (req/s)", f"topk(10, {TRAEFIK_ROUTER_EXPR})", {"h": 9, "w": 12, "x": 0, "y": 25}, @@ -1331,7 +1480,7 @@ def build_network_dashboard(): ) panels.append( timeseries_panel( - 9, + 12, "Traefik Entrypoints (req/s)", 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', {"h": 9, "w": 12, "x": 12, "y": 25}, diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index ff0af9b..64bd7ee 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -7,6 +7,282 @@ { "id": 1, "type": "stat", + "title": "Ingress Success Rate (5m)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.995 + }, + { + "color": "yellow", + "value": 0.999 + }, + { + "color": "green", + "value": 0.9995 + } + ] + }, + "unit": "percentunit", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Error Budget Burn (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 4 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Error Budget Burn (6h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 4 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Edge P99 Latency (ms)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 200 + }, + { + "color": "orange", + "value": 350 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "ms", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", "title": "Ingress Traffic", "datasource": { "type": "prometheus", @@ -16,7 +292,7 @@ "h": 4, "w": 8, "x": 0, - "y": 0 + "y": 4 }, "targets": [ { @@ -65,7 +341,7 @@ } }, { - "id": 2, + "id": 6, "type": "stat", "title": "Egress Traffic", "datasource": { @@ -76,7 +352,7 @@ "h": 4, "w": 8, "x": 8, - "y": 0 + "y": 4 }, "targets": [ { @@ -125,7 +401,7 @@ } }, { - "id": 3, + "id": 7, "type": "stat", "title": "Intra-Cluster Traffic", "datasource": { @@ -136,7 +412,7 @@ "h": 4, "w": 8, "x": 16, - "y": 0 + "y": 4 }, "targets": [ { @@ -185,68 +461,7 @@ } }, { - "id": 4, - "type": "stat", - "title": "Top Router req/s", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 8, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", - "refId": "A", - "legendFormat": "{{router}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "req/s", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, + "id": 8, "type": "timeseries", "title": "Per-Node Throughput", "datasource": { @@ -283,7 +498,7 @@ } }, { - "id": 6, + "id": 9, "type": "table", "title": "Top Namespaces", "datasource": { @@ -319,7 +534,7 @@ ] }, { - "id": 7, + "id": 10, "type": "table", "title": "Top Pods", "datasource": { @@ -355,7 +570,7 @@ ] }, { - "id": 8, + "id": 11, "type": "timeseries", "title": "Traefik Routers (req/s)", "datasource": { @@ -392,7 +607,7 @@ } }, { - "id": 9, + "id": 12, "type": "timeseries", "title": "Traefik Entrypoints (req/s)", "datasource": { diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 802fe5a..fb665d8 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -186,6 +186,213 @@ "textMode": "value" } }, + { + "id": 9, + "type": "stat", + "title": "API Server 5xx rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "orange", + "value": 0.2 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "req/s", + "custom": { + "displayMode": "auto" + }, + "decimals": 3 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 10, + "type": "stat", + "title": "API Server P99 latency", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 4 + }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 250 + }, + { + "color": "orange", + "value": 400 + }, + { + "color": "red", + "value": 600 + } + ] + }, + "unit": "ms", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 11, + "type": "stat", + "title": "etcd P99 latency", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 4 + }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "red", + "value": 200 + } + ] + }, + "unit": "ms", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 4, "type": "timeseries", @@ -198,7 +405,7 @@ "h": 9, "w": 24, "x": 0, - "y": 4 + "y": 8 }, "targets": [ { @@ -238,7 +445,7 @@ "h": 9, "w": 24, "x": 0, - "y": 13 + "y": 17 }, "targets": [ { @@ -278,7 +485,7 @@ "h": 9, "w": 12, "x": 0, - "y": 22 + "y": 26 }, "targets": [ { @@ -315,7 +522,7 @@ "h": 9, "w": 12, "x": 12, - "y": 22 + "y": 26 }, "targets": [ { @@ -352,7 +559,7 @@ "h": 9, "w": 24, "x": 0, - "y": 31 + "y": 35 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index fd1f5d6..309b005 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -16,6 +16,282 @@ data: { "id": 1, "type": "stat", + "title": "Ingress Success Rate (5m)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.995 + }, + { + "color": "yellow", + "value": 0.999 + }, + { + "color": "green", + "value": 0.9995 + } + ] + }, + "unit": "percentunit", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Error Budget Burn (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 4 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Error Budget Burn (6h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 4 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Edge P99 Latency (ms)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 200 + }, + { + "color": "orange", + "value": 350 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "ms", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", "title": "Ingress Traffic", "datasource": { "type": "prometheus", @@ -25,7 +301,7 @@ data: "h": 4, "w": 8, "x": 0, - "y": 0 + "y": 4 }, "targets": [ { @@ -74,7 +350,7 @@ data: } }, { - "id": 2, + "id": 6, "type": "stat", "title": "Egress Traffic", "datasource": { @@ -85,7 +361,7 @@ data: "h": 4, "w": 8, "x": 8, - "y": 0 + "y": 4 }, "targets": [ { @@ -134,7 +410,7 @@ data: } }, { - "id": 3, + "id": 7, "type": "stat", "title": "Intra-Cluster Traffic", "datasource": { @@ -145,7 +421,7 @@ data: "h": 4, "w": 8, "x": 16, - "y": 0 + "y": 4 }, "targets": [ { @@ -194,68 +470,7 @@ data: } }, { - "id": 4, - "type": "stat", - "title": "Top Router req/s", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 8, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", - "refId": "A", - "legendFormat": "{{router}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "req/s", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, + "id": 8, "type": "timeseries", "title": "Per-Node Throughput", "datasource": { @@ -292,7 +507,7 @@ data: } }, { - "id": 6, + "id": 9, "type": "table", "title": "Top Namespaces", "datasource": { @@ -328,7 +543,7 @@ data: ] }, { - "id": 7, + "id": 10, "type": "table", "title": "Top Pods", "datasource": { @@ -364,7 +579,7 @@ data: ] }, { - "id": 8, + "id": 11, "type": "timeseries", "title": "Traefik Routers (req/s)", "datasource": { @@ -401,7 +616,7 @@ data: } }, { - "id": 9, + "id": 12, "type": "timeseries", "title": "Traefik Entrypoints (req/s)", "datasource": { diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 2facfed..97c5539 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -195,6 +195,213 @@ data: "textMode": "value" } }, + { + "id": 9, + "type": "stat", + "title": "API Server 5xx rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "orange", + "value": 0.2 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "req/s", + "custom": { + "displayMode": "auto" + }, + "decimals": 3 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 10, + "type": "stat", + "title": "API Server P99 latency", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 4 + }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 250 + }, + { + "color": "orange", + "value": 400 + }, + { + "color": "red", + "value": 600 + } + ] + }, + "unit": "ms", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 11, + "type": "stat", + "title": "etcd P99 latency", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 4 + }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "red", + "value": 200 + } + ] + }, + "unit": "ms", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 4, "type": "timeseries", @@ -207,7 +414,7 @@ data: "h": 9, "w": 24, "x": 0, - "y": 4 + "y": 8 }, "targets": [ { @@ -247,7 +454,7 @@ data: "h": 9, "w": 24, "x": 0, - "y": 13 + "y": 17 }, "targets": [ { @@ -287,7 +494,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 22 + "y": 26 }, "targets": [ { @@ -324,7 +531,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 22 + "y": 26 }, "targets": [ { @@ -361,7 +568,7 @@ data: "h": 9, "w": 24, "x": 0, - "y": 31 + "y": 35 }, "targets": [ {