---

# services/monitoring/grafana-dashboard-overview.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-overview
labels:
grafana_dashboard: "1"
data:
atlas-overview.json: |
{
"uid": "atlas-overview",
"title": "Atlas Overview",
"folderUid": "overview",
2025-11-17 16:27:38 -03:00
"editable": false,
"annotations": {
2025-11-17 16:27:38 -03:00
"list": []
},
"panels": [
{
"id": 2,
2025-11-18 12:11:47 -03:00
"type": "gauge",
"title": "Control Plane Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
2025-11-15 21:03:11 -03:00
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5,
"w": 4,
2025-11-15 21:03:11 -03:00
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
2025-11-15 21:03:11 -03:00
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
2025-11-18 12:11:47 -03:00
"min": 0,
"max": 3,
"thresholds": {
"mode": "absolute",
"steps": [
{
2025-11-17 19:49:50 -03:00
"color": "red",
"value": null
2025-11-15 21:03:11 -03:00
},
{
"color": "green",
"value": 3
}
]
2025-11-17 16:27:38 -03:00
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
2025-11-16 00:55:28 -03:00
},
2025-11-18 12:11:47 -03:00
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
2025-11-15 21:03:11 -03:00
}
},
{
"id": 3,
"type": "stat",
"title": "Control Plane Workloads",
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5,
"w": 3,
"x": 4,
2025-11-15 21:03:11 -03:00
"y": 0
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)",
"refId": "A"
}
],
2025-11-15 21:03:11 -03:00
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
2025-11-15 21:03:11 -03:00
"thresholds": {
2025-11-16 00:55:28 -03:00
"mode": "absolute",
2025-11-15 21:03:11 -03:00
"steps": [
{
"color": "green",
"value": null
2025-11-15 21:03:11 -03:00
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
2025-11-16 00:55:28 -03:00
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
2025-11-16 00:55:28 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
"id": 5,
"type": "stat",
"title": "Stuck Terminating",
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 7,
2025-11-16 00:55:28 -03:00
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
2025-11-16 00:55:28 -03:00
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
2025-11-16 00:55:28 -03:00
"thresholds": {
"mode": "absolute",
"steps": [
{
2025-11-17 19:49:50 -03:00
"color": "green",
"value": null
2025-11-15 21:03:11 -03:00
},
2025-11-17 19:49:50 -03:00
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
2025-11-15 21:03:11 -03:00
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
2025-11-17 16:27:38 -03:00
}
2025-11-15 21:03:11 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
2025-11-15 21:03:11 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
2025-11-16 00:55:28 -03:00
},
"textMode": "value"
2025-11-17 19:49:50 -03:00
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (365d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.99
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9999
},
{
"color": "blue",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 4
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
2025-11-16 00:55:28 -03:00
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
2025-11-15 21:03:11 -03:00
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5,
"w": 3,
"x": 14,
2025-11-15 21:03:11 -03:00
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
2025-11-15 21:03:11 -03:00
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
2025-11-17 16:27:38 -03:00
"color": "green",
"value": null
},
2025-11-17 19:24:03 -03:00
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
2025-11-17 16:27:38 -03:00
}
},
"overrides": []
},
2025-11-15 21:03:11 -03:00
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
2025-11-15 21:03:11 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
2025-11-16 00:55:28 -03:00
},
"textMode": "value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
]
2025-11-15 21:03:11 -03:00
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
2025-11-16 00:55:28 -03:00
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
2025-11-15 21:03:11 -03:00
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 20,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 18
2025-11-17 19:24:03 -03:00
},
{
"color": "yellow",
"value": 19
2025-11-17 19:24:03 -03:00
},
{
"color": "green",
"value": 20
}
]
2025-11-17 16:27:38 -03:00
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
}
},
{
"id": 7,
"type": "stat",
"title": "Hottest node: CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
2025-11-17 16:27:38 -03:00
"w": 6,
"x": 0,
"y": 5
},
"targets": [
{
2025-11-17 23:42:55 -03:00
"expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
2025-11-17 19:24:03 -03:00
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
2025-11-15 21:03:11 -03:00
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
2025-11-15 21:03:11 -03:00
},
"mappings": [],
"thresholds": {
"mode": "absolute",
2025-11-15 21:03:11 -03:00
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
2025-11-15 21:03:11 -03:00
},
{
"color": "red",
"value": 91.5
2025-11-15 21:03:11 -03:00
}
]
},
2025-11-17 16:27:38 -03:00
"unit": "percent",
"custom": {
"displayMode": "auto"
2025-11-17 19:56:57 -03:00
}
2025-11-15 21:03:11 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
2025-11-15 21:03:11 -03:00
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
2025-11-16 00:55:28 -03:00
},
2025-11-17 19:49:50 -03:00
"textMode": "name_and_value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
"id": 8,
2025-11-16 00:55:28 -03:00
"type": "stat",
"title": "Hottest node: RAM",
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
2025-11-17 16:27:38 -03:00
"w": 6,
"x": 6,
"y": 5
2025-11-16 00:55:28 -03:00
},
"targets": [
{
2025-11-17 23:42:55 -03:00
"expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
2025-11-17 19:24:03 -03:00
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
2025-11-15 21:03:11 -03:00
}
2025-11-16 00:55:28 -03:00
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
2025-11-16 00:55:28 -03:00
},
"mappings": [],
"thresholds": {
"mode": "absolute",
2025-11-16 00:55:28 -03:00
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
2025-11-16 00:55:28 -03:00
},
{
"color": "red",
"value": 91.5
2025-11-16 00:55:28 -03:00
}
]
},
2025-11-17 16:27:38 -03:00
"unit": "percent",
"custom": {
"displayMode": "auto"
2025-11-17 19:56:57 -03:00
}
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
2025-11-17 19:49:50 -03:00
"textMode": "name_and_value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
]
2025-11-15 21:03:11 -03:00
},
{
"id": 9,
2025-11-17 16:27:38 -03:00
"type": "stat",
2025-11-17 20:00:40 -03:00
"title": "Hottest node: NET (rx+tx)",
2025-11-17 16:27:38 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
2025-11-17 16:27:38 -03:00
"w": 6,
"x": 12,
"y": 5
},
"targets": [
{
2025-11-17 23:42:55 -03:00
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
2025-11-17 19:24:03 -03:00
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
2025-11-17 16:27:38 -03:00
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
2025-11-17 16:27:38 -03:00
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
2025-11-17 18:55:11 -03:00
"unit": "Bps",
2025-11-17 16:27:38 -03:00
"custom": {
"displayMode": "auto"
2025-11-17 19:56:57 -03:00
}
2025-11-17 16:27:38 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
2025-11-17 19:49:50 -03:00
"textMode": "name_and_value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
]
},
{
"id": 10,
"type": "stat",
2025-11-17 20:00:40 -03:00
"title": "Hottest node: I/O (r+w)",
2025-11-17 16:27:38 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
2025-11-17 16:27:38 -03:00
"w": 6,
"x": 18,
"y": 5
},
"targets": [
{
2025-11-17 23:42:55 -03:00
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
2025-11-17 19:24:03 -03:00
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
2025-11-17 16:27:38 -03:00
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
2025-11-17 16:27:38 -03:00
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
2025-11-17 18:55:11 -03:00
"unit": "Bps",
2025-11-17 16:27:38 -03:00
"custom": {
"displayMode": "auto"
2025-11-17 19:56:57 -03:00
}
2025-11-17 16:27:38 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
2025-11-17 19:49:50 -03:00
"textMode": "name_and_value"
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
]
},
{
"id": 30,
"type": "stat",
"title": "Mail Sent (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
2026-01-21 13:37:36 -03:00
"h": 3,
"w": 4,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "max(postmark_outbound_sent{window=\"1d\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
]
},
{
"id": 31,
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
2026-01-21 13:37:36 -03:00
"h": 3,
"w": 4,
"x": 8,
"y": 8
},
"targets": [
{
"expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})",
"refId": "A",
"legendFormat": "Rate"
},
{
"expr": "max(postmark_outbound_bounced{window=\"1d\"})",
"refId": "B",
"legendFormat": "Count"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"displayMode": "auto"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 5
},
{
"color": "orange",
"value": 8
},
{
"color": "red",
"value": 10
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Rate"
},
"properties": [
{
"id": "unit",
"value": "percent"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Count"
},
"properties": [
{
"id": "unit",
"value": "none"
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
]
},
{
"id": 32,
"type": "stat",
"title": "Mail Success Rate (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
2026-01-21 13:37:36 -03:00
"h": 3,
"w": 4,
"x": 4,
"y": 8
},
"targets": [
{
"expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 90
},
{
"color": "yellow",
"value": 95
},
{
"color": "green",
"value": 98
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
]
},
{
"id": 33,
"type": "stat",
"title": "Mail Limit Used (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
2026-01-21 13:37:36 -03:00
"h": 3,
"w": 4,
"x": 12,
"y": 8
},
"targets": [
{
"expr": "max(postmark_sending_limit_used_percent)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 70
},
{
"color": "orange",
"value": 85
},
{
"color": "red",
"value": 95
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
]
},
{
"id": 34,
"type": "stat",
"title": "Postgres Connections Used",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 8
},
"targets": [
{
"expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{conn}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
}
},
{
"id": 35,
"type": "stat",
"title": "Postgres Hottest Connections",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 8
},
"targets": [
{
"expr": "topk(1, sum by (datname) (pg_stat_activity_count))",
"refId": "A",
"legendFormat": "{{datname}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
}
},
2025-11-17 16:27:38 -03:00
{
"id": 23,
"type": "stat",
"title": "Astreae Usage",
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 6,
2025-11-15 21:03:11 -03:00
"x": 0,
2026-01-21 13:37:36 -03:00
"y": 11
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
2025-11-15 21:03:11 -03:00
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 91.5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
2025-11-15 21:03:11 -03:00
},
"overrides": []
2025-11-15 21:03:11 -03:00
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
2025-11-15 21:03:11 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
2025-11-15 21:03:11 -03:00
}
]
},
{
"id": 24,
"type": "stat",
"title": "Asteria Usage",
2025-11-17 23:12:16 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 6,
"x": 6,
2026-01-21 13:37:36 -03:00
"y": 11
2025-11-17 23:12:16 -03:00
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
2025-11-17 23:12:16 -03:00
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 91.5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
2025-11-17 23:12:16 -03:00
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
2025-11-17 23:12:16 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
2025-11-17 23:12:16 -03:00
}
]
2025-11-17 23:12:16 -03:00
},
{
"id": 25,
"type": "stat",
"title": "Astreae Free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 6,
"x": 12,
2026-01-21 13:37:36 -03:00
"y": 11
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "decbytes",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
]
},
{
"id": 26,
"type": "stat",
"title": "Asteria Free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 6,
"x": 18,
2026-01-21 13:37:36 -03:00
"y": 11
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
2025-11-15 21:03:11 -03:00
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "decbytes",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
]
},
2026-01-21 13:37:36 -03:00
{
"id": 40,
"type": "bargauge",
"title": "One-off Job Pods (age hours)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
2026-01-21 13:37:36 -03:00
"x": 0,
"y": 14
2026-01-21 13:37:36 -03:00
},
"targets": [
{
2026-01-21 15:12:53 -03:00
"expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
2026-01-21 13:37:36 -03:00
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 6
},
{
"color": "orange",
"value": 24
},
{
"color": "red",
"value": 48
}
]
},
"decimals": 2
2026-01-21 13:37:36 -03:00
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 8
}
}
]
},
{
"id": 41,
"type": "timeseries",
2026-01-21 15:12:53 -03:00
"title": "Ariadne Attempts / Failures",
2026-01-21 13:37:36 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 14
2026-01-21 13:37:36 -03:00
},
"targets": [
{
"expr": "sum(increase(ariadne_task_runs_total[$__interval]))",
2026-01-21 13:37:36 -03:00
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
2026-01-21 15:12:53 -03:00
"refId": "B",
2026-01-21 13:37:36 -03:00
"legendFormat": "Failures"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
2026-01-21 15:12:53 -03:00
"options": "Attempts"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
2026-01-21 15:12:53 -03:00
"fixedColor": "green"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Failures"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
2026-01-21 13:37:36 -03:00
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 42,
"type": "timeseries",
"title": "Ariadne Test Success Rate (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
2026-01-21 13:37:36 -03:00
"x": 12,
"y": 14
2026-01-21 13:37:36 -03:00
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
2026-01-21 13:37:36 -03:00
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"max": 100
2026-01-21 13:37:36 -03:00
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 43,
"type": "bargauge",
"title": "Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 14
2026-01-21 13:37:36 -03:00
},
"targets": [
{
2026-01-21 15:12:53 -03:00
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
2026-01-21 13:37:36 -03:00
"refId": "A",
"legendFormat": "{{result}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 5
},
{
"color": "red",
"value": 10
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "error"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "failed"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
2026-01-21 13:37:36 -03:00
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
},
{
"id": 11,
"type": "piechart",
"title": "Namespace CPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 0,
"y": 20
},
"targets": [
{
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 12,
"type": "piechart",
"title": "Namespace GPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 8,
"y": 20
},
"targets": [
{
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 13,
"type": "piechart",
"title": "Namespace RAM Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 16,
"y": 20
},
"targets": [
{
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22",
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 14,
"type": "timeseries",
"title": "Worker Node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 12,
"w": 12,
"x": 0,
"y": 36
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-nodes dashboard",
2025-11-17 16:27:38 -03:00
"url": "/d/atlas-nodes",
"targetBlank": true
}
]
2025-11-15 21:03:11 -03:00
},
{
"id": 15,
2025-11-15 21:03:11 -03:00
"type": "timeseries",
"title": "Worker Node RAM",
2025-11-15 21:03:11 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 12,
2025-11-15 21:03:11 -03:00
"w": 12,
"x": 12,
"y": 36
2025-11-15 21:03:11 -03:00
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
2025-11-15 21:03:11 -03:00
"refId": "A",
2025-11-16 00:55:28 -03:00
"legendFormat": "{{node}}"
}
],
2025-11-15 21:03:11 -03:00
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
2025-11-15 21:03:11 -03:00
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
]
},
{
"id": 16,
2025-11-17 16:27:38 -03:00
"type": "timeseries",
"title": "Control Plane CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
2025-11-16 00:55:28 -03:00
"gridPos": {
"h": 10,
2025-11-16 00:55:28 -03:00
"w": 12,
"x": 0,
"y": 48
2025-11-16 00:55:28 -03:00
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
2025-11-17 16:27:38 -03:00
"refId": "A",
"legendFormat": "{{node}}"
2025-11-16 00:55:28 -03:00
}
],
"fieldConfig": {
"defaults": {
2025-11-17 16:27:38 -03:00
"unit": "percent"
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
2025-11-17 16:27:38 -03:00
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
{
"id": 17,
2025-11-17 16:27:38 -03:00
"type": "timeseries",
"title": "Control Plane RAM",
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
2025-11-16 00:55:28 -03:00
"w": 12,
"x": 12,
"y": 48
2025-11-16 00:55:28 -03:00
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
2025-11-17 16:27:38 -03:00
"refId": "A",
"legendFormat": "{{node}}"
2025-11-16 00:55:28 -03:00
}
],
"fieldConfig": {
"defaults": {
2025-11-17 16:27:38 -03:00
"unit": "percent"
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
2025-11-17 16:27:38 -03:00
"legend": {
"displayMode": "table",
"placement": "right"
},
2025-11-17 16:27:38 -03:00
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 58
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 58
},
"targets": [
{
2026-01-21 15:12:53 -03:00
"expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
2025-11-16 00:55:28 -03:00
{
"id": 18,
2025-11-16 00:55:28 -03:00
"type": "timeseries",
"title": "Cluster Ingress Throughput",
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 29
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Ingress (node NICs)"
2025-11-16 00:55:28 -03:00
}
],
"fieldConfig": {
"defaults": {
2025-11-17 18:55:11 -03:00
"unit": "Bps"
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
"legend": {
2025-11-17 16:27:38 -03:00
"displayMode": "list",
2025-11-16 00:55:28 -03:00
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-network dashboard",
"url": "/d/atlas-network",
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
"id": 19,
"type": "timeseries",
"title": "Cluster Egress Throughput",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 29
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Egress (node NICs)"
}
],
"fieldConfig": {
"defaults": {
2025-11-17 18:55:11 -03:00
"unit": "Bps"
},
"overrides": []
},
"options": {
"legend": {
2025-11-17 16:27:38 -03:00
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-network dashboard",
"url": "/d/atlas-network",
"targetBlank": true
}
]
},
{
"id": 20,
"type": "timeseries",
"title": "Intra-Cluster Throughput",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 29
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Internal traffic"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
2025-11-17 16:27:38 -03:00
"links": [
{
"title": "Open atlas-network dashboard",
"url": "/d/atlas-network",
2025-11-17 16:27:38 -03:00
"targetBlank": true
}
]
},
{
"id": 21,
"type": "timeseries",
"title": "Root Filesystem Usage",
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 16,
2025-11-16 00:55:28 -03:00
"w": 12,
"x": 0,
"y": 68
2025-11-16 00:55:28 -03:00
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
2025-11-16 00:55:28 -03:00
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
2025-11-16 00:55:28 -03:00
}
2025-11-17 16:27:38 -03:00
},
"timeFrom": "30d",
2025-11-17 16:27:38 -03:00
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
]
2025-11-16 00:55:28 -03:00
},
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
2025-11-16 00:55:28 -03:00
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 16,
"w": 12,
"x": 12,
"y": 68
2025-11-16 00:55:28 -03:00
},
"targets": [
{
2026-01-21 15:12:53 -03:00
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
2025-11-16 00:55:28 -03:00
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
2025-11-16 00:55:28 -03:00
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
2025-11-16 00:55:28 -03:00
},
{
"color": "orange",
"value": 75
2025-11-16 00:55:28 -03:00
},
{
"color": "red",
"value": 91.5
2025-11-16 00:55:28 -03:00
}
]
2025-11-17 16:27:38 -03:00
}
2025-11-16 00:55:28 -03:00
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
2025-11-16 00:55:28 -03:00
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
2025-11-16 00:55:28 -03:00
"values": false
}
2025-11-17 16:27:38 -03:00
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
]
}
],
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"overview"
],
"templating": {
"list": [
{
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
2025-11-17 16:27:38 -03:00
},
"refresh": "1m",
"links": []
}