2025-11-17 14:22:46 -03:00
# services/monitoring/grafana-dashboard-overview.yaml
2025-11-14 00:02:59 -03:00
apiVersion : v1
kind : ConfigMap
metadata :
2025-11-17 14:22:46 -03:00
name : grafana-dashboard-overview
2025-11-14 00:02:59 -03:00
labels :
grafana_dashboard : "1"
data :
2025-11-17 14:22:46 -03:00
atlas-overview.json : |
2025-11-14 00:02:59 -03:00
{
2025-11-17 14:22:46 -03:00
"uid": "atlas-overview" ,
"title": "Atlas Overview" ,
2025-12-02 14:41:39 -03:00
"folderUid": "overview" ,
2025-11-17 16:27:38 -03:00
"editable": false ,
2025-11-14 00:02:59 -03:00
"annotations": {
2025-11-17 16:27:38 -03:00
"list": [ ]
2025-11-14 00:02:59 -03:00
},
"panels": [
{
2025-12-12 15:56:33 -03:00
"id": 2 ,
2025-11-18 12:11:47 -03:00
"type": "gauge" ,
2025-12-12 15:56:33 -03:00
"title": "Control Plane Ready" ,
2025-11-14 00:02:59 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
2025-11-15 21:03:11 -03:00
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5 ,
2025-12-12 15:23:51 -03:00
"w": 4 ,
2025-11-15 21:03:11 -03:00
"x": 0 ,
"y": 0
},
"targets": [
{
2025-12-12 15:56:33 -03:00
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" ,
2025-11-15 21:03:11 -03:00
"refId": "A"
}
] ,
2025-11-14 00:02:59 -03:00
"fieldConfig": {
"defaults": {
2025-11-18 12:11:47 -03:00
"min": 0 ,
2025-12-12 15:56:33 -03:00
"max": 3 ,
2025-11-14 00:02:59 -03:00
"thresholds": {
"mode": "absolute" ,
"steps": [
{
2025-11-17 19:49:50 -03:00
"color": "red" ,
2025-11-18 11:12:03 -03:00
"value": null
2025-11-15 21:03:11 -03:00
},
{
"color": "green" ,
2025-12-12 15:56:33 -03:00
"value": 3
2025-11-14 00:02:59 -03:00
}
]
2025-11-17 16:27:38 -03:00
}
2025-11-14 00:02:59 -03:00
},
"overrides": [ ]
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
2025-11-16 00:55:28 -03:00
},
2025-11-18 12:11:47 -03:00
"orientation": "auto" ,
"showThresholdMarkers": false ,
"showThresholdLabels": false
2025-11-15 21:03:11 -03:00
}
},
{
2025-12-12 15:56:33 -03:00
"id": 3 ,
"type": "stat" ,
"title": "Control Plane Workloads" ,
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5 ,
2025-12-12 15:23:51 -03:00
"w": 3 ,
"x": 4 ,
2025-11-15 21:03:11 -03:00
"y": 0
2025-11-14 00:02:59 -03:00
},
"targets": [
{
2025-12-12 15:56:33 -03:00
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)" ,
2025-11-14 00:02:59 -03:00
"refId": "A"
}
] ,
2025-11-15 21:03:11 -03:00
"fieldConfig": {
"defaults": {
2025-12-12 15:56:33 -03:00
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-12-12 15:56:33 -03:00
},
"mappings": [ ] ,
2025-11-15 21:03:11 -03:00
"thresholds": {
2025-11-16 00:55:28 -03:00
"mode": "absolute" ,
2025-11-15 21:03:11 -03:00
"steps": [
{
2025-12-12 15:56:33 -03:00
"color": "green" ,
2025-12-12 20:40:32 -03:00
"value": 0
2025-11-15 21:03:11 -03:00
},
2025-12-12 15:56:33 -03:00
{
"color": "red" ,
2025-12-12 20:30:00 -03:00
"value": 1
2025-11-16 00:55:28 -03:00
}
]
2025-12-12 15:56:33 -03:00
},
"unit": "none" ,
"custom": {
"displayMode": "auto"
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
"overrides": [ ]
},
"options": {
2025-12-12 15:56:33 -03:00
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
2025-11-16 00:55:28 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
2025-12-12 15:56:33 -03:00
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard" ,
"url": "/d/atlas-pods" ,
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
2025-12-12 15:56:33 -03:00
"id": 5 ,
2025-11-18 17:09:13 -03:00
"type": "stat" ,
2025-12-12 15:56:33 -03:00
"title": "Stuck Terminating" ,
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 5 ,
2025-12-12 15:23:51 -03:00
"w": 3 ,
"x": 7 ,
2025-11-16 00:55:28 -03:00
"y": 0
},
"targets": [
{
2025-12-12 20:30:00 -03:00
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" ,
2025-11-16 00:55:28 -03:00
"refId": "A"
}
] ,
"fieldConfig": {
"defaults": {
2025-11-18 17:09:13 -03:00
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-11-18 17:09:13 -03:00
},
"mappings": [ ] ,
2025-11-16 00:55:28 -03:00
"thresholds": {
"mode": "absolute" ,
"steps": [
{
2025-11-17 19:49:50 -03:00
"color": "green" ,
2025-12-12 20:40:32 -03:00
"value": 0
2025-11-15 21:03:11 -03:00
},
2025-11-17 19:49:50 -03:00
{
"color": "red" ,
2025-12-12 20:30:00 -03:00
"value": 1
2025-11-15 21:03:11 -03:00
}
]
2025-11-18 17:09:13 -03:00
},
"unit": "none" ,
"custom": {
"displayMode": "auto"
2025-11-17 16:27:38 -03:00
}
2025-11-15 21:03:11 -03:00
},
"overrides": [ ]
},
"options": {
2025-11-18 17:09:13 -03:00
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
2025-11-15 21:03:11 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
2025-11-16 00:55:28 -03:00
},
2025-11-18 17:09:13 -03:00
"textMode": "value"
2025-11-17 19:49:50 -03:00
},
"links": [
{
"title": "Open atlas-pods dashboard" ,
"url": "/d/atlas-pods" ,
"targetBlank": true
}
]
2025-11-14 00:02:59 -03:00
},
2025-12-12 15:23:51 -03:00
{
"id": 27 ,
"type": "stat" ,
2025-12-12 16:11:28 -03:00
"title": "Atlas Availability (30d)" ,
2025-12-12 15:23:51 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 5 ,
"w": 4 ,
"x": 10 ,
"y": 0
},
"targets": [
{
2025-12-12 16:36:47 -03:00
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])" ,
2025-12-12 15:23:51 -03:00
"refId": "A"
}
] ,
"fieldConfig": {
"defaults": {
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-12-12 15:23:51 -03:00
},
"mappings": [ ] ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "red" ,
"value": null
},
{
"color": "orange" ,
2025-12-12 16:36:47 -03:00
"value": 0.99
2025-12-12 15:23:51 -03:00
},
{
"color": "yellow" ,
2025-12-12 16:36:47 -03:00
"value": 0.999
2025-12-12 15:23:51 -03:00
},
{
"color": "green" ,
2025-12-12 16:36:47 -03:00
"value": 0.9999
2025-12-12 15:23:51 -03:00
}
]
},
2025-12-12 16:15:37 -03:00
"unit": "percentunit" ,
2025-12-12 15:23:51 -03:00
"custom": {
2025-12-12 16:36:47 -03:00
"displayMode": "auto"
2025-12-12 16:15:37 -03:00
},
"decimals": 3
2025-12-12 15:23:51 -03:00
},
"overrides": [ ]
},
"options": {
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
"textMode": "value"
}
},
2025-11-14 00:02:59 -03:00
{
2025-11-16 00:55:28 -03:00
"id": 4 ,
2025-12-02 14:41:39 -03:00
"type": "stat" ,
"title": "Problem Pods" ,
2025-11-14 00:02:59 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
2025-11-15 21:03:11 -03:00
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5 ,
2025-12-12 15:23:51 -03:00
"w": 3 ,
"x": 14 ,
2025-11-15 21:03:11 -03:00
"y": 0
},
"targets": [
{
2025-12-12 20:30:00 -03:00
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" ,
2025-11-15 21:03:11 -03:00
"refId": "A"
}
] ,
2025-11-14 00:02:59 -03:00
"fieldConfig": {
"defaults": {
2025-12-02 14:41:39 -03:00
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-12-02 14:41:39 -03:00
},
"mappings": [ ] ,
2025-11-14 00:02:59 -03:00
"thresholds": {
2025-11-17 14:22:46 -03:00
"mode": "absolute" ,
2025-11-14 00:02:59 -03:00
"steps": [
{
2025-11-17 16:27:38 -03:00
"color": "green" ,
2025-12-12 20:40:32 -03:00
"value": 0
2025-11-14 00:02:59 -03:00
},
2025-11-17 19:24:03 -03:00
{
"color": "red" ,
2025-12-12 20:30:00 -03:00
"value": 1
2025-11-14 00:02:59 -03:00
}
]
2025-12-02 14:41:39 -03:00
},
"unit": "none" ,
"custom": {
"displayMode": "auto"
2025-12-12 15:23:51 -03:00
}
},
"overrides": [ ]
},
"options": {
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard" ,
"url": "/d/atlas-pods" ,
"targetBlank": true
}
]
},
{
"id": 6 ,
"type": "stat" ,
"title": "CrashLoop / ImagePull" ,
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 5 ,
"w": 3 ,
"x": 17 ,
"y": 0
},
"targets": [
{
2025-12-12 20:30:00 -03:00
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" ,
2025-12-12 15:23:51 -03:00
"refId": "A"
}
] ,
"fieldConfig": {
"defaults": {
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-12-12 15:23:51 -03:00
},
"mappings": [ ] ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "green" ,
2025-12-12 20:40:32 -03:00
"value": 0
2025-12-12 15:23:51 -03:00
},
{
"color": "red" ,
2025-12-12 20:30:00 -03:00
"value": 1
2025-12-12 15:23:51 -03:00
}
]
},
"unit": "none" ,
"custom": {
"displayMode": "auto"
2025-11-17 16:27:38 -03:00
}
2025-11-14 00:02:59 -03:00
},
"overrides": [ ]
},
2025-11-15 21:03:11 -03:00
"options": {
2025-12-02 14:41:39 -03:00
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
2025-11-15 21:03:11 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
2025-11-16 00:55:28 -03:00
},
2025-12-02 14:41:39 -03:00
"textMode": "value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-pods dashboard" ,
"url": "/d/atlas-pods" ,
"targetBlank": true
}
]
2025-11-15 21:03:11 -03:00
},
{
2025-12-12 15:56:33 -03:00
"id": 1 ,
"type": "gauge" ,
"title": "Workers Ready" ,
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
2025-11-14 00:02:59 -03:00
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5 ,
"w": 4 ,
2025-11-18 15:55:24 -03:00
"x": 20 ,
2025-11-14 00:02:59 -03:00
"y": 0
},
2025-11-15 21:03:11 -03:00
"targets": [
{
2025-12-12 15:56:33 -03:00
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" ,
2025-11-17 14:22:46 -03:00
"refId": "A"
}
] ,
"fieldConfig": {
"defaults": {
2025-12-12 15:56:33 -03:00
"min": 0 ,
"max": 18 ,
2025-11-17 14:22:46 -03:00
"thresholds": {
"mode": "absolute" ,
"steps": [
{
2025-12-12 15:56:33 -03:00
"color": "red" ,
2025-11-18 11:12:03 -03:00
"value": null
2025-11-17 14:22:46 -03:00
},
{
2025-12-12 15:56:33 -03:00
"color": "orange" ,
"value": 16
2025-11-17 19:24:03 -03:00
},
{
2025-12-12 15:56:33 -03:00
"color": "yellow" ,
"value": 17
2025-11-17 19:24:03 -03:00
},
{
2025-12-12 15:56:33 -03:00
"color": "green" ,
"value": 18
2025-11-17 14:22:46 -03:00
}
]
2025-11-17 16:27:38 -03:00
}
2025-11-17 14:22:46 -03:00
},
"overrides": [ ]
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
2025-12-12 15:56:33 -03:00
"orientation": "auto" ,
"showThresholdMarkers": false ,
"showThresholdLabels": false
}
2025-11-17 14:22:46 -03:00
},
{
  "id": 7,
  "type": "stat",
  "title": "Hottest node: CPU",
  "datasource": {
    "type": "prometheus",
    "uid": "atlas-vm"
  },
  "gridPos": {
    "h": 3,
    "w": 6,
    "x": 0,
    "y": 5
  },
  "targets": [
    {
      "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
      "refId": "A",
      "legendFormat": "{{node}}",
      "instant": true
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 70
          },
          {
            "color": "red",
            "value": 85
          }
        ]
      },
      "unit": "percent",
      "custom": {
        "displayMode": "auto"
      }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "name_and_value"
  },
  "links": [
    {
      "title": "Open atlas-nodes dashboard",
      "url": "/d/atlas-nodes",
      "targetBlank": true
    }
  ]
},
{
  "id": 8,
  "type": "stat",
  "title": "Hottest node: RAM",
  "datasource": {
    "type": "prometheus",
    "uid": "atlas-vm"
  },
  "gridPos": {
    "h": 3,
    "w": 6,
    "x": 6,
    "y": 5
  },
  "targets": [
    {
      "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
      "refId": "A",
      "legendFormat": "{{node}}",
      "instant": true
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 70
          },
          {
            "color": "red",
            "value": 85
          }
        ]
      },
      "unit": "percent",
      "custom": {
        "displayMode": "auto"
      }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "name_and_value"
  },
  "links": [
    {
      "title": "Open atlas-nodes dashboard",
      "url": "/d/atlas-nodes",
      "targetBlank": true
    }
  ]
},
{
2025-11-17 14:22:46 -03:00
"id": 9 ,
2025-11-17 16:27:38 -03:00
"type": "stat" ,
2025-11-17 20:00:40 -03:00
"title": "Hottest node: NET (rx+tx)" ,
2025-11-17 16:27:38 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:12:16 -03:00
"h": 3 ,
2025-11-17 16:27:38 -03:00
"w": 6 ,
"x": 12 ,
"y": 5
},
"targets": [
{
2025-11-17 23:42:55 -03:00
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" ,
2025-11-17 19:24:03 -03:00
"refId": "A" ,
2025-11-17 20:14:11 -03:00
"legendFormat": "{{node}}" ,
2025-11-17 19:38:40 -03:00
"instant": true
2025-11-17 16:27:38 -03:00
}
] ,
"fieldConfig": {
"defaults": {
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-11-17 16:27:38 -03:00
},
"mappings": [ ] ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "rgba(115, 115, 115, 1)" ,
"value": null
},
{
"color": "green" ,
"value": 1
}
]
},
2025-11-17 18:55:11 -03:00
"unit": "Bps" ,
2025-11-17 16:27:38 -03:00
"custom": {
"displayMode": "auto"
2025-11-17 19:56:57 -03:00
}
2025-11-17 16:27:38 -03:00
},
"overrides": [ ]
},
"options": {
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
2025-11-17 19:49:50 -03:00
"textMode": "name_and_value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard" ,
"url": "/d/atlas-nodes" ,
"targetBlank": true
}
]
},
{
"id": 10 ,
"type": "stat" ,
2025-11-17 20:00:40 -03:00
"title": "Hottest node: I/O (r+w)" ,
2025-11-17 16:27:38 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:12:16 -03:00
"h": 3 ,
2025-11-17 16:27:38 -03:00
"w": 6 ,
"x": 18 ,
"y": 5
},
"targets": [
{
2025-11-17 23:42:55 -03:00
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" ,
2025-11-17 19:24:03 -03:00
"refId": "A" ,
2025-11-17 20:14:11 -03:00
"legendFormat": "{{node}}" ,
2025-11-17 19:38:40 -03:00
"instant": true
2025-11-17 16:27:38 -03:00
}
] ,
"fieldConfig": {
"defaults": {
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-11-17 16:27:38 -03:00
},
"mappings": [ ] ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "rgba(115, 115, 115, 1)" ,
"value": null
},
{
"color": "green" ,
"value": 1
}
]
},
2025-11-17 18:55:11 -03:00
"unit": "Bps" ,
2025-11-17 16:27:38 -03:00
"custom": {
"displayMode": "auto"
2025-11-17 19:56:57 -03:00
}
2025-11-17 16:27:38 -03:00
},
"overrides": [ ]
},
"options": {
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
2025-11-17 19:49:50 -03:00
"textMode": "name_and_value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard" ,
"url": "/d/atlas-nodes" ,
"targetBlank": true
}
]
},
{
  "id": 23,
  "type": "stat",
  "title": "Astreae Usage",
  "datasource": {
    "type": "prometheus",
    "uid": "atlas-vm"
  },
  "gridPos": {
    "h": 6,
    "w": 6,
    "x": 0,
    "y": 10
  },
  "targets": [
    {
      "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 70
          },
          {
            "color": "red",
            "value": 85
          }
        ]
      },
      "unit": "percent",
      "custom": {
        "displayMode": "auto"
      }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "value"
  },
  "links": [
    {
      "title": "Open atlas-storage dashboard",
      "url": "/d/atlas-storage",
      "targetBlank": true
    }
  ]
},
{
  "id": 24,
  "type": "stat",
  "title": "Asteria Usage",
  "datasource": {
    "type": "prometheus",
    "uid": "atlas-vm"
  },
  "gridPos": {
    "h": 6,
    "w": 6,
    "x": 6,
    "y": 10
  },
  "targets": [
    {
      "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 70
          },
          {
            "color": "red",
            "value": 85
          }
        ]
      },
      "unit": "percent",
      "custom": {
        "displayMode": "auto"
      }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "value"
  },
  "links": [
    {
      "title": "Open atlas-storage dashboard",
      "url": "/d/atlas-storage",
      "targetBlank": true
    }
  ]
},
{
2025-11-18 14:08:33 -03:00
"id": 25 ,
"type": "stat" ,
2025-12-02 14:41:39 -03:00
"title": "Astreae Free" ,
2025-11-18 00:11:39 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-11-18 14:08:33 -03:00
"h": 6 ,
"w": 6 ,
"x": 12 ,
2025-11-18 00:11:39 -03:00
"y": 10
},
"targets": [
{
2025-11-18 14:08:33 -03:00
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" ,
"refId": "A"
2025-11-18 00:11:39 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-18 14:08:33 -03:00
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-11-18 14:08:33 -03:00
},
"mappings": [ ] ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "rgba(115, 115, 115, 1)" ,
"value": null
},
{
"color": "green" ,
"value": 1
}
]
},
"unit": "decbytes" ,
"custom": {
"displayMode": "auto"
}
2025-11-18 00:11:39 -03:00
},
"overrides": [ ]
},
"options": {
2025-11-18 14:08:33 -03:00
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
2025-11-18 00:11:39 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
2025-11-18 14:08:33 -03:00
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard" ,
"url": "/d/atlas-storage" ,
"targetBlank": true
2025-11-18 00:11:39 -03:00
}
2025-11-18 14:08:33 -03:00
]
2025-11-18 00:11:39 -03:00
},
{
2025-11-18 14:08:33 -03:00
"id": 26 ,
"type": "stat" ,
2025-12-02 14:41:39 -03:00
"title": "Asteria Free" ,
2025-11-15 11:59:48 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-11-18 14:08:33 -03:00
"h": 6 ,
"w": 6 ,
"x": 18 ,
"y": 10
2025-11-15 11:59:48 -03:00
},
"targets": [
{
2025-11-18 14:08:33 -03:00
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" ,
"refId": "A"
2025-11-15 21:03:11 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-18 14:08:33 -03:00
"color": {
2025-12-12 20:44:20 -03:00
"mode": "thresholds"
2025-11-18 14:08:33 -03:00
},
"mappings": [ ] ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "rgba(115, 115, 115, 1)" ,
"value": null
},
{
"color": "green" ,
"value": 1
}
]
},
"unit": "decbytes" ,
"custom": {
"displayMode": "auto"
}
},
"overrides": [ ]
},
"options": {
"colorMode": "value" ,
"graphMode": "area" ,
"justifyMode": "center" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard" ,
"url": "/d/atlas-storage" ,
"targetBlank": true
}
]
},
{
"id": 11 ,
"type": "piechart" ,
2025-12-02 14:41:39 -03:00
"title": "Namespace CPU Share" ,
2025-11-18 14:08:33 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 9 ,
"w": 8 ,
"x": 0 ,
"y": 16
},
"targets": [
{
2025-11-18 17:09:13 -03:00
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)" ,
2025-11-18 14:08:33 -03:00
"refId": "A" ,
"legendFormat": "{{namespace}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent" ,
"color": {
"mode": "palette-classic"
}
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "list" ,
"placement": "right"
},
"pieType": "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels": [ ] ,
2025-11-18 14:08:33 -03:00
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral" ,
"colorBy": "value" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
}
}
},
{
"id": 12 ,
"type": "piechart" ,
2025-12-02 14:41:39 -03:00
"title": "Namespace GPU Share" ,
2025-11-18 14:08:33 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 9 ,
"w": 8 ,
"x": 8 ,
"y": 16
},
"targets": [
{
2025-12-02 20:28:35 -03:00
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)" ,
2025-11-18 14:08:33 -03:00
"refId": "A" ,
"legendFormat": "{{namespace}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent" ,
"color": {
"mode": "palette-classic"
}
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "list" ,
"placement": "right"
},
"pieType": "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels": [ ] ,
2025-11-18 14:08:33 -03:00
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral" ,
"colorBy": "value" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
}
}
},
{
"id": 13 ,
"type": "piechart" ,
2025-12-02 14:41:39 -03:00
"title": "Namespace RAM Share" ,
2025-11-18 14:08:33 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 9 ,
"w": 8 ,
"x": 16 ,
"y": 16
},
"targets": [
{
2025-11-18 17:09:13 -03:00
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)" ,
2025-11-18 14:08:33 -03:00
"refId": "A" ,
"legendFormat": "{{namespace}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent" ,
"color": {
"mode": "palette-classic"
}
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "list" ,
"placement": "right"
},
"pieType": "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels": [ ] ,
2025-11-18 14:08:33 -03:00
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral" ,
"colorBy": "value" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
}
}
},
{
"id": 14 ,
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "Worker Node CPU" ,
2025-11-18 14:08:33 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:15:21 -03:00
"h": 12 ,
2025-11-18 14:08:33 -03:00
"w": 12 ,
"x": 0 ,
2025-11-18 15:55:24 -03:00
"y": 32
2025-11-18 14:08:33 -03:00
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ,
"refId": "A" ,
"legendFormat": "{{node}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "table" ,
"placement": "right" ,
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-nodes dashboard" ,
2025-11-17 16:27:38 -03:00
"url": "/d/atlas-nodes" ,
"targetBlank": true
}
]
2025-11-15 21:03:11 -03:00
},
{
2025-11-18 00:11:39 -03:00
"id": 15 ,
2025-11-15 21:03:11 -03:00
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "Worker Node RAM" ,
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:15:21 -03:00
"h": 12 ,
2025-11-15 21:03:11 -03:00
"w": 12 ,
"x": 12 ,
2025-11-18 15:55:24 -03:00
"y": 32
2025-11-15 21:03:11 -03:00
},
"targets": [
2025-11-15 11:59:48 -03:00
{
2025-11-17 21:48:12 -03:00
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ,
2025-11-15 21:03:11 -03:00
"refId": "A" ,
2025-11-16 00:55:28 -03:00
"legendFormat": "{{node}}"
2025-11-14 00:02:59 -03:00
}
2025-11-15 11:59:48 -03:00
] ,
2025-11-15 21:03:11 -03:00
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": [ ]
},
2025-11-15 11:59:48 -03:00
"options": {
"legend": {
"displayMode": "table" ,
2025-11-17 14:22:46 -03:00
"placement": "right" ,
"calcs": [
"last"
]
2025-11-15 11:59:48 -03:00
},
2025-11-15 21:03:11 -03:00
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard" ,
"url": "/d/atlas-nodes" ,
"targetBlank": true
}
]
2025-11-15 11:59:48 -03:00
},
{
2025-11-18 00:11:39 -03:00
"id": 16 ,
2025-11-17 16:27:38 -03:00
"type": "timeseries" ,
2025-11-17 21:48:12 -03:00
"title": "Control Plane CPU" ,
2025-11-15 11:59:48 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
2025-11-14 00:02:59 -03:00
},
2025-11-16 00:55:28 -03:00
"gridPos": {
2025-12-02 15:15:21 -03:00
"h": 10 ,
2025-11-16 00:55:28 -03:00
"w": 12 ,
"x": 0 ,
2025-12-02 15:15:21 -03:00
"y": 44
2025-11-16 00:55:28 -03:00
},
"targets": [
{
2025-11-17 21:48:12 -03:00
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ,
2025-11-17 16:27:38 -03:00
"refId": "A" ,
"legendFormat": "{{node}}"
2025-11-16 00:55:28 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-17 16:27:38 -03:00
"unit": "percent"
2025-11-16 00:55:28 -03:00
},
"overrides": [ ]
},
"options": {
2025-11-17 16:27:38 -03:00
"legend": {
"displayMode": "table" ,
"placement": "right"
},
"tooltip": {
"mode": "multi"
2025-11-17 14:22:46 -03:00
}
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
{
2025-11-18 00:11:39 -03:00
"id": 17 ,
2025-11-17 16:27:38 -03:00
"type": "timeseries" ,
2025-11-17 21:48:12 -03:00
"title": "Control Plane RAM" ,
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:15:21 -03:00
"h": 10 ,
2025-11-16 00:55:28 -03:00
"w": 12 ,
"x": 12 ,
2025-12-02 15:15:21 -03:00
"y": 44
2025-11-16 00:55:28 -03:00
},
"targets": [
{
2025-11-17 21:48:12 -03:00
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ,
2025-11-17 16:27:38 -03:00
"refId": "A" ,
"legendFormat": "{{node}}"
2025-11-16 00:55:28 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-17 16:27:38 -03:00
"unit": "percent"
2025-11-16 00:55:28 -03:00
},
"overrides": [ ]
},
"options": {
2025-11-17 16:27:38 -03:00
"legend": {
"displayMode": "table" ,
"placement": "right"
2025-11-17 14:22:46 -03:00
},
2025-11-17 16:27:38 -03:00
"tooltip": {
"mode": "multi"
2025-11-17 14:22:46 -03:00
}
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
2025-12-12 18:51:43 -03:00
{
"id": 28 ,
"type": "piechart" ,
2025-12-12 20:30:00 -03:00
"title": "Node Pod Share" ,
2025-12-12 18:51:43 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 10 ,
"w": 12 ,
"x": 0 ,
"y": 54
},
"targets": [
{
2025-12-12 20:40:32 -03:00
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" ,
2025-12-12 18:51:43 -03:00
"refId": "A" ,
"legendFormat": "{{node}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent" ,
"color": {
"mode": "palette-classic"
}
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "list" ,
"placement": "right"
},
"pieType": "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels": [ ] ,
2025-12-12 18:51:43 -03:00
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral" ,
"colorBy": "value" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
}
}
},
{
"id": 29 ,
"type": "bargauge" ,
"title": "Top Nodes by Pod Count" ,
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 10 ,
"w": 12 ,
"x": 12 ,
"y": 54
},
"targets": [
{
2025-12-12 19:09:51 -03:00
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))" ,
2025-12-12 18:51:43 -03:00
"refId": "A" ,
2025-12-12 20:30:00 -03:00
"legendFormat": "{{node}}" ,
"instant": true
2025-12-12 18:51:43 -03:00
}
] ,
"fieldConfig": {
"defaults": {
"unit": "none" ,
"min": 0 ,
"max": null ,
"thresholds": {
"mode": "absolute" ,
"steps": [
{
"color": "green" ,
"value": null
},
{
"color": "yellow" ,
"value": 50
},
{
"color": "orange" ,
2025-12-12 20:20:13 -03:00
"value": 75
2025-12-12 18:51:43 -03:00
},
{
"color": "red" ,
2025-12-12 20:20:13 -03:00
"value": 100
2025-12-12 18:51:43 -03:00
}
]
2025-12-12 20:20:13 -03:00
},
"decimals": 0
2025-12-12 18:51:43 -03:00
},
"overrides": [ ]
},
"options": {
"displayMode": "gradient" ,
"orientation": "horizontal" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
}
},
"transformations": [
{
"id": "sortBy" ,
"options": {
"fields": [
"Value"
] ,
"order": "desc"
}
2025-12-12 18:56:13 -03:00
},
{
"id": "limit" ,
"options": {
"limit": 12
}
2025-12-12 18:51:43 -03:00
}
]
},
2025-11-16 00:55:28 -03:00
{
2025-11-18 00:11:39 -03:00
"id": 18 ,
2025-11-16 00:55:28 -03:00
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "Cluster Ingress Throughput" ,
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
2025-11-15 11:59:48 -03:00
"gridPos": {
2025-11-17 14:22:46 -03:00
"h": 7 ,
2025-11-18 14:08:33 -03:00
"w": 8 ,
2025-11-15 11:59:48 -03:00
"x": 0 ,
2025-11-18 15:55:24 -03:00
"y": 25
2025-11-15 11:59:48 -03:00
},
2025-11-14 00:02:59 -03:00
"targets": [
{
2025-11-18 16:18:52 -03:00
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" ,
2025-11-18 00:11:39 -03:00
"refId": "A" ,
2025-11-18 14:08:33 -03:00
"legendFormat": "Ingress (node NICs)"
2025-11-16 00:55:28 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-17 18:55:11 -03:00
"unit": "Bps"
2025-11-16 00:55:28 -03:00
},
"overrides": [ ]
},
"options": {
"legend": {
2025-11-17 16:27:38 -03:00
"displayMode": "list" ,
2025-11-16 00:55:28 -03:00
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-network dashboard" ,
"url": "/d/atlas-network" ,
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
2025-11-18 00:11:39 -03:00
"id": 19 ,
2025-11-17 14:22:46 -03:00
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "Cluster Egress Throughput" ,
2025-11-17 14:22:46 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 7 ,
2025-11-18 14:08:33 -03:00
"w": 8 ,
"x": 8 ,
2025-11-18 15:55:24 -03:00
"y": 25
2025-11-17 14:22:46 -03:00
},
"targets": [
{
2025-11-18 16:18:52 -03:00
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" ,
2025-11-18 00:11:39 -03:00
"refId": "A" ,
2025-11-18 14:08:33 -03:00
"legendFormat": "Egress (node NICs)"
2025-11-17 14:22:46 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-17 18:55:11 -03:00
"unit": "Bps"
2025-11-17 14:22:46 -03:00
},
"overrides": [ ]
},
"options": {
"legend": {
2025-11-17 16:27:38 -03:00
"displayMode": "list" ,
2025-11-17 14:22:46 -03:00
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-network dashboard" ,
"url": "/d/atlas-network" ,
"targetBlank": true
}
]
2025-11-17 14:22:46 -03:00
},
{
2025-11-18 00:11:39 -03:00
"id": 20 ,
2025-11-17 14:22:46 -03:00
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "Intra-Cluster Throughput" ,
2025-11-17 14:22:46 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-11-18 14:08:33 -03:00
"h": 7 ,
"w": 8 ,
"x": 16 ,
2025-11-18 15:55:24 -03:00
"y": 25
2025-11-17 14:22:46 -03:00
},
"targets": [
{
2025-11-18 17:09:13 -03:00
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" ,
2025-11-17 14:22:46 -03:00
"refId": "A" ,
2025-11-18 14:08:33 -03:00
"legendFormat": "Internal traffic"
2025-11-17 14:22:46 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-18 14:08:33 -03:00
"unit": "Bps"
2025-11-17 14:22:46 -03:00
},
"overrides": [ ]
},
"options": {
"legend": {
2025-11-18 14:08:33 -03:00
"displayMode": "list" ,
"placement": "bottom"
2025-11-17 14:22:46 -03:00
},
"tooltip": {
"mode": "multi"
}
},
2025-11-17 16:27:38 -03:00
"links": [
{
2025-11-18 14:08:33 -03:00
"title": "Open atlas-network dashboard" ,
"url": "/d/atlas-network" ,
2025-11-17 16:27:38 -03:00
"targetBlank": true
}
]
2025-11-17 14:22:46 -03:00
},
{
2025-11-18 00:11:39 -03:00
"id": 21 ,
2025-11-18 14:08:33 -03:00
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "Root Filesystem Usage" ,
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:15:21 -03:00
"h": 16 ,
2025-11-16 00:55:28 -03:00
"w": 12 ,
2025-11-18 14:08:33 -03:00
"x": 0 ,
2025-12-12 18:51:43 -03:00
"y": 64
2025-11-16 00:55:28 -03:00
},
"targets": [
{
2025-11-18 14:08:33 -03:00
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" ,
2025-11-18 00:11:39 -03:00
"refId": "A" ,
"legendFormat": "{{node}}"
2025-11-14 00:02:59 -03:00
}
] ,
2025-11-15 11:59:48 -03:00
"fieldConfig": {
"defaults": {
2025-11-18 14:08:33 -03:00
"unit": "percent"
2025-11-16 00:55:28 -03:00
},
"overrides": [ ]
},
"options": {
2025-11-18 14:08:33 -03:00
"legend": {
"displayMode": "table" ,
"placement": "right" ,
2025-11-16 00:55:28 -03:00
"calcs": [
2025-11-18 14:08:33 -03:00
"last"
]
},
"tooltip": {
"mode": "multi"
2025-11-16 00:55:28 -03:00
}
2025-11-17 16:27:38 -03:00
},
2025-11-18 14:08:33 -03:00
"timeFrom": "30d" ,
2025-11-17 16:27:38 -03:00
"links": [
{
"title": "Open atlas-storage dashboard" ,
"url": "/d/atlas-storage" ,
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
2025-11-18 14:08:33 -03:00
"id": 22 ,
"type": "bargauge" ,
2025-12-02 14:41:39 -03:00
"title": "Nodes Closest to Full Root Disks" ,
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
2025-12-02 15:15:21 -03:00
"h": 16 ,
2025-11-18 14:08:33 -03:00
"w": 12 ,
"x": 12 ,
2025-12-12 18:51:43 -03:00
"y": 64
2025-11-16 00:55:28 -03:00
},
"targets": [
{
2025-12-02 15:21:02 -03:00
"expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))" ,
2025-11-18 14:08:33 -03:00
"refId": "A" ,
"legendFormat": "{{node}}"
2025-11-16 00:55:28 -03:00
}
] ,
"fieldConfig": {
"defaults": {
2025-11-18 14:08:33 -03:00
"unit": "percent" ,
"min": 0 ,
"max": 100 ,
2025-11-15 11:59:48 -03:00
"thresholds": {
2025-11-18 14:08:33 -03:00
"mode": "absolute" ,
2025-11-15 11:59:48 -03:00
"steps": [
2025-11-16 00:55:28 -03:00
{
"color": "green" ,
"value": null
},
{
"color": "yellow" ,
2025-11-18 14:08:33 -03:00
"value": 50
2025-11-16 00:55:28 -03:00
},
{
2025-11-18 14:08:33 -03:00
"color": "orange" ,
2025-11-16 00:55:28 -03:00
"value": 70
},
{
"color": "red" ,
"value": 85
}
]
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
"overrides": [ ]
},
"options": {
2025-11-18 14:08:33 -03:00
"displayMode": "gradient" ,
"orientation": "horizontal" ,
2025-11-16 00:55:28 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
2025-12-02 14:56:36 -03:00
"fields": "" ,
2025-11-16 00:55:28 -03:00
"values": false
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-storage dashboard" ,
"url": "/d/atlas-storage" ,
"targetBlank": true
}
2025-12-12 18:51:43 -03:00
] ,
"transformations": [
{
"id": "sortBy" ,
"options": {
"fields": [
"Value"
] ,
"order": "desc"
}
}
2025-11-17 14:22:46 -03:00
]
2025-11-14 00:02:59 -03:00
}
] ,
"schemaVersion": 39 ,
"style": "dark" ,
"tags": [
"atlas" ,
2025-11-17 14:22:46 -03:00
"overview"
2025-11-14 00:02:59 -03:00
] ,
"templating": {
"list": [ ]
},
"time": {
2025-12-02 14:41:39 -03:00
"from": "now-1h" ,
2025-11-14 00:02:59 -03:00
"to": "now"
2025-11-17 16:27:38 -03:00
},
2025-12-02 14:41:39 -03:00
"refresh": "1m" ,
2025-12-12 18:32:45 -03:00
"links": [ ]
2025-11-14 00:02:59 -03:00
}