{ "uid": "atlas-overview", "title": "Atlas Overview", "folderUid": "overview", "editable": false, "annotations": { "list": [] }, "panels": [ { "id": 1, "type": "gauge", "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 5, "x": 0, "y": 0 }, "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, "max": 18, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 16 }, { "color": "yellow", "value": 17 }, { "color": "green", "value": 18 } ] } }, "overrides": [] }, "options": { "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "orientation": "auto", "showThresholdMarkers": false, "showThresholdLabels": false } }, { "id": 2, "type": "gauge", "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 5, "x": 5, "y": 0 }, "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, "max": 3, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 3 } ] } }, "overrides": [] }, "options": { "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "orientation": "auto", "showThresholdMarkers": false, "showThresholdLabels": false } }, { "id": 3, "type": "stat", "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 5, "x": 10, "y": 0 }, "targets": [ { "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 4, "type": "stat", "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 5, "x": 15, "y": 0 }, "targets": [ { "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 5, "type": "stat", "title": "Stuck Terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 20, "y": 0 }, "targets": [ { "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 7, "type": "stat", "title": "Hottest node: CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 0, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 8, "type": "stat", "title": "Hottest node: RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 6, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 9, "type": "stat", "title": "Hottest node: NET (rx+tx)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 12, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 10, "type": "stat", "title": "Hottest node: I/O (r+w)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 18, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 23, "type": "stat", "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 0, "y": 10 }, "targets": [ { "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 24, "type": "stat", "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 6, "y": 10 }, "targets": [ { "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 25, "type": "stat", "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 12, "y": 10 }, "targets": [ { "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "decbytes", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 26, "type": "stat", "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 18, "y": 10 }, "targets": [ { "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "decbytes", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 11, "type": "piechart", "title": "Namespace CPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 8, "x": 0, "y": 16 }, "targets": [ { "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [ "percent" ], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 12, "type": "piechart", "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 8, "x": 8, "y": 16 }, "targets": [ { "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [ "percent" ], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 13, "type": "piechart", "title": "Namespace RAM Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 8, "x": 16, "y": 16 }, "targets": [ { "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [ "percent" ], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 14, "type": "timeseries", "title": "Worker Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 12, "w": 12, "x": 0, "y": 32 }, "targets": [ { "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 15, "type": "timeseries", "title": "Worker Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 12, "w": 12, "x": 12, "y": 32 }, "targets": [ { "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 44 }, "targets": [ { "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } }, { "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 44 }, "targets": [ { "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } }, { "id": 18, "type": "timeseries", "title": "Cluster Ingress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 25 }, "targets": [ { "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress (Traefik)" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-network dashboard", "url": "/d/atlas-network", "targetBlank": true } ] }, { "id": 19, "type": "timeseries", "title": "Cluster Egress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 25 }, "targets": [ { "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress (Traefik)" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-network dashboard", "url": "/d/atlas-network", "targetBlank": true } ] }, { "id": 20, "type": "timeseries", "title": "Intra-Cluster Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 25 }, "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-network dashboard", "url": "/d/atlas-network", "targetBlank": true } ] }, { "id": 21, "type": "timeseries", "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 16, "w": 12, "x": 0, "y": 54 }, "targets": [ { "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "timeFrom": "30d", "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 22, "type": "bargauge", "title": "Nodes Closest to Full Root Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 16, "w": 12, "x": 12, "y": 54 }, "targets": [ { "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] } }, "overrides": [] }, "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] } ], "schemaVersion": 39, "style": "dark", "tags": [ "atlas", "overview" ], "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "refresh": "1m", "links": [ { "title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": false }, { "title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": false }, { "title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": false }, { "title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": false }, { "title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": false } ] }