# services/monitoring/grafana-dashboard-network.yaml apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-network labels: grafana_dashboard: "1" data: atlas-network.json: | { "uid": "atlas-network", "title": "Atlas Network", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, "type": "stat", "title": "Ingress Success Rate (5m)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, "targets": [ { "expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.995 }, { "color": "yellow", "value": 0.999 }, { "color": "green", "value": 0.9995 } ] }, "unit": "percentunit", "custom": { "displayMode": "auto" }, "decimals": 2 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 2, "type": "stat", "title": "Error Budget Burn (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, "targets": [ { "expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 4 } ] }, "unit": "none", "custom": { "displayMode": "auto" }, "decimals": 2 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 3, "type": "stat", "title": "Error Budget Burn (6h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, "targets": [ { "expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 4 } ] }, "unit": "none", "custom": { "displayMode": "auto" }, "decimals": 2 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 4, "type": "stat", "title": "Edge P99 Latency (ms)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, "targets": [ { "expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 200 }, { "color": "orange", "value": 350 }, { "color": "red", "value": 500 } ] }, "unit": "ms", "custom": { "displayMode": "auto" }, "decimals": 1 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 5, "type": "stat", "title": "Ingress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 8, "x": 0, "y": 4 }, "targets": [ { "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 6, "type": "stat", "title": "Egress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 8, "x": 8, "y": 4 }, "targets": [ { "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 7, "type": "stat", "title": "Intra-Cluster Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, "w": 8, "x": 16, "y": 4 }, "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 8, "type": "timeseries", "title": "Per-Node Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }, "targets": [ { "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } }, { "id": 9, "type": "table", "title": "Top Namespaces", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 16 }, "targets": [ { "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "showHeader": true }, "transformations": [ { "id": "labelsToFields", "options": {} } ] }, { "id": 10, "type": "table", "title": "Top Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 16 }, "targets": [ { "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "showHeader": true }, "transformations": [ { "id": "labelsToFields", "options": {} } ] }, { "id": 11, "type": "timeseries", "title": "Traefik Routers (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 25 }, "targets": [ { "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", "legendFormat": "{{router}}" } ], "fieldConfig": { "defaults": { "unit": "req/s" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } }, { "id": 12, "type": "timeseries", "title": "Traefik Entrypoints (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 25 }, "targets": [ { "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))", "refId": "A", "legendFormat": "{{entrypoint}}" } ], "fieldConfig": { "defaults": { "unit": "req/s" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } } ], "time": { "from": "now-12h", "to": "now" }, "annotations": { "list": [] }, "schemaVersion": 39, "style": "dark", "tags": [ "atlas", "network" ] }