titan-iac/services/monitoring/grafana-dashboard-overview.yaml

4527 lines
142 KiB
YAML

# services/monitoring/grafana-dashboard-overview.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-overview
labels:
grafana_dashboard: "1"
data:
atlas-overview.json: |
{
"uid": "atlas-overview",
"title": "Atlas Overview",
"folderUid": "overview",
"editable": false,
"annotations": {
"list": []
},
"panels": [
{
"id": 2,
"type": "gauge",
"title": "Control Plane Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 3,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-green",
"value": 3
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"description": "Control-plane nodes currently Ready; full count is good, lower means Kubernetes core capacity is missing."
},
{
"id": 3,
"type": "stat",
"title": "Control Plane Workloads",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 4,
"y": 0
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 1
},
{
"color": "dark-orange",
"value": 2
},
{
"color": "dark-red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
],
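"description": "Pods assigned to control-plane nodes (titan-0a/0b/0c) outside the excluded system namespaces (kube-*, *-system, traefik, monitoring, logging, cert-manager, maintenance, postgres); zero is good because control nodes should stay focused."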
},
{
"id": 5,
"type": "stat",
"title": "Stuck Terminating",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 7,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 1
},
{
"color": "dark-orange",
"value": 2
},
{
"color": "dark-red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
],
"description": "Pods whose deletion was requested more than 10 minutes ago and that Kubernetes still has not finished deleting; zero is good, growth means cleanup or storage may be stuck."
},
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (365d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 10,
"y": 0
},
"targets": [
{
"expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[24h])",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 0.99
},
{
"color": "dark-yellow",
"value": 0.999
},
{
"color": "dark-green",
"value": 0.9999
},
{
"color": "dark-blue",
"value": 0.99999
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 4
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. The query wraps the series in last_over_time(...[24h]), so the last successful rollup stays visible for up to 24h and one missed long-window evaluation does not render as No data."
},
{
"id": 4,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 14,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 1
},
{
"color": "dark-orange",
"value": 2
},
{
"color": "dark-red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
],
"description": "Pods whose phase is anything other than Running or Succeeded (for example Pending, Failed or Unknown); zero is good, any count means a workload needs attention."
},
{
"id": 6,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 3,
"x": 17,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 1
},
{
"color": "dark-orange",
"value": 2
},
{
"color": "dark-red",
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-pods dashboard",
"url": "/d/atlas-pods",
"targetBlank": true
}
],
"description": "Pods with a container waiting in CrashLoopBackOff or ImagePullBackOff; zero is good, any count usually blocks a service."
},
{
"id": 1,
"type": "gauge",
"title": "Workers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 20,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 18
},
{
"color": "dark-yellow",
"value": 19
},
{
"color": "dark-green",
"value": 20
}
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
},
"description": "Worker nodes currently Ready; full count is good, lower means less place to run services."
},
{
"id": 7,
"type": "stat",
"title": "Hottest node: CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 0,
"y": 5
},
"targets": [
{
"expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 50
},
{
"color": "dark-orange",
"value": 75
},
{
"color": "dark-red",
"value": 91.5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
],
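"description": "Highest per-node CPU utilization right now (non-idle share over the last 5 minutes); the query is not limited to workers, so any node can appear. Lower is calmer, hot nodes may need pods moved."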
},
{
"id": 8,
"type": "stat",
"title": "Hottest node: RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 3,
"y": 5
},
"targets": [
{
"expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 50
},
{
"color": "dark-orange",
"value": 75
},
{
"color": "dark-red",
"value": 91.5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
],
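"description": "Highest per-node memory use right now (MemTotal minus MemAvailable, as a percentage of MemTotal); the query is not limited to workers, so any node can appear. Lower is safer, high values risk evictions."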
},
{
"id": 9,
"type": "stat",
"title": "Hottest node: NET (rx+tx)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 6,
"y": 5
},
"targets": [
{
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
],
"description": "Busiest node network rate; spikes can reveal traffic concentration or noisy services."
},
{
"id": 10,
"type": "stat",
"title": "Hottest node: I/O (r+w)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 9,
"y": 5
},
"targets": [
{
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
],
"description": "Busiest node disk I/O rate; high values can explain slow storage-backed apps."
},
{
"id": 23,
"type": "stat",
"title": "Astreae Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 12,
"y": 5
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 50
},
{
"color": "dark-orange",
"value": 75
},
{
"color": "dark-red",
"value": 91.5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Percent of Astreae used; lower is safer, high values reduce storage headroom."
},
{
"id": 24,
"type": "stat",
"title": "Asteria Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 15,
"y": 5
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 50
},
{
"color": "dark-orange",
"value": 75
},
{
"color": "dark-red",
"value": 91.5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Percent of Asteria used; lower is safer, high values reduce storage headroom."
},
{
"id": 25,
"type": "stat",
"title": "Astreae Free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 18,
"y": 5
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "decbytes",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Free space on Astreae; higher is better for backups and workload growth."
},
{
"id": 26,
"type": "stat",
"title": "Asteria Free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 5
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "decbytes",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Free space on Asteria; higher is better for backups and workload growth."
},
{
"id": 40,
"type": "stat",
"title": "Pyrphoros UPS Current",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 3,
"x": 0,
"y": 7
},
"targets": [
{
"expr": "label_replace(max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100) or on() vector(0), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0), \"metric\", \"Runtime\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{metric}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Draw"
},
"properties": [
{
"id": "unit",
"value": "watt"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Runtime"
},
"properties": [
{
"id": "unit",
"value": "s"
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value",
"text": {
"titleSize": 11,
"valueSize": 20
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
],
"description": "Live Pyrphoros UPS draw (derived as load percent of nominal watts) and runtime, each shown as 0 when the metric is missing; stable runtime means the lab can ride out short outages."
},
{
"id": 144,
"type": "stat",
"title": "Statera UPS Current",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 3,
"x": 0,
"y": 10
},
"targets": [
{
"expr": "label_replace(max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100) or on() vector(0), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0), \"metric\", \"Runtime\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{metric}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Draw"
},
"properties": [
{
"id": "unit",
"value": "watt"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Runtime"
},
"properties": [
{
"id": "unit",
"value": "s"
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value",
"text": {
"titleSize": 11,
"valueSize": 20
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
],
"description": "Live Statera UPS draw (derived as load percent of nominal watts) and runtime, each shown as 0 when the metric is missing; stable runtime means the lab can ride out short outages."
},
{
"id": 41,
"type": "timeseries",
"title": "UPS History (Power Draw)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 3,
"y": 7
},
"targets": [
{
"refId": "A",
"expr": "max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100)",
"legendFormat": "Pyrphoros"
},
{
"refId": "B",
"expr": "max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100)",
"legendFormat": "Statera"
}
],
"fieldConfig": {
"defaults": {
"unit": "watt",
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"lineWidth": 2,
"fillOpacity": 22,
"showPoints": "never",
"spanNulls": true
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Pyrphoros"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-blue"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Statera"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-yellow"
}
}
]
}
]
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
],
"description": "UPS power draw over time; steady draw is normal, spikes show sudden load changes."
},
{
"id": 42,
"type": "stat",
"title": "Current Enclosure Temperature",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 3,
"x": 0,
"y": 13
},
"targets": [
{
"expr": "label_replace(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) or on() vector(0), \"metric\", \"\u00b0C\", \"__name__\", \".*\") or label_replace(max((max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) * 9 / 5 + 32) or on() vector(0), \"metric\", \"\u00b0F\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{metric}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "\u00b0C"
},
"properties": [
{
"id": "unit",
"value": "celsius"
}
]
},
{
"matcher": {
"id": "byName",
"options": "\u00b0F"
},
"properties": [
{
"id": "unit",
"value": "fahrenheit"
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value",
"text": {
"titleSize": 11,
"valueSize": 20
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
],
"description": "Current tent temperature in C and F; moderate values protect hardware and plants."
},
{
"id": 143,
"type": "stat",
"title": "Current Enclosure Climate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 3,
"w": 3,
"x": 0,
"y": 16
},
"targets": [
{
"expr": "label_replace(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)) or on() vector(0), \"metric\", \"%RH\", \"__name__\", \".*\") or label_replace(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)) or on() vector(0), \"metric\", \"kPa\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{metric}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "%RH"
},
"properties": [
{
"id": "unit",
"value": "suffix:%RH"
}
]
},
{
"matcher": {
"id": "byName",
"options": "kPa"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value",
"text": {
"titleSize": 11,
"valueSize": 20
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
],
"description": "Current humidity and VPD; in-range values mean the enclosure climate is stable."
},
{
"id": 43,
"type": "timeseries",
"title": "Enclosure Climate History",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 3,
"y": 13
},
"targets": [
{
"refId": "A",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)",
"legendFormat": "C"
},
{
"refId": "B",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)",
"legendFormat": "RH"
},
{
"refId": "C",
"expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)",
"legendFormat": "P"
},
{
"refId": "D",
"expr": "(min_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)[$__range]) - 0.08)",
"legendFormat": "C bound min"
},
{
"refId": "E",
"expr": "(max_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)[$__range]) + 0.08)",
"legendFormat": "C bound max"
},
{
"refId": "F",
"expr": "clamp_min((min_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)[$__range]) - 0.35), 0)",
"legendFormat": "RH bound min"
},
{
"refId": "G",
"expr": "clamp_max((max_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)[$__range]) + 0.35), 100)",
"legendFormat": "RH bound max"
},
{
"refId": "H",
"expr": "clamp_min((min_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)[$__range]) - 0.03), 0)",
"legendFormat": "P bound min"
},
{
"refId": "I",
"expr": "(max_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)[$__range]) + 0.03)",
"legendFormat": "P bound max"
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"lineWidth": 2,
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "C"
},
"properties": [
{
"id": "unit",
"value": "suffix:\u00b0C"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisPlacement",
"value": "left"
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "C bound .*"
},
"properties": [
{
"id": "unit",
"value": "suffix:\u00b0C"
},
{
"id": "custom.axisPlacement",
"value": "left"
},
{
"id": "custom.axisCenteredZero",
"value": false
},
{
"id": "custom.hideFrom",
"value": {
"legend": true,
"tooltip": true,
"viz": false
}
},
{
"id": "custom.lineWidth",
"value": 0
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.showPoints",
"value": "never"
},
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "transparent"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "RH"
},
"properties": [
{
"id": "unit",
"value": "suffix:%"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "RH bound .*"
},
"properties": [
{
"id": "unit",
"value": "suffix:%"
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "custom.axisCenteredZero",
"value": false
},
{
"id": "custom.hideFrom",
"value": {
"legend": true,
"tooltip": true,
"viz": false
}
},
{
"id": "custom.lineWidth",
"value": 0
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.showPoints",
"value": "never"
},
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "transparent"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "P"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.axisCenteredZero",
"value": false
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "P bound .*"
},
"properties": [
{
"id": "unit",
"value": "suffix:kPa"
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "custom.axisCenteredZero",
"value": false
},
{
"id": "custom.hideFrom",
"value": {
"legend": true,
"tooltip": true,
"viz": false
}
},
{
"id": "custom.lineWidth",
"value": 0
},
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "custom.showPoints",
"value": "never"
},
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "transparent"
}
}
]
}
]
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
],
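"description": "Temperature (C) on the left axis, relative humidity (RH) and vapor-pressure deficit (P, in kPa) on the right axis. Hidden, transparent bound series pad each axis slightly beyond the min and max of the selected time range so small swings remain visible."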
},
{
"id": 141,
"type": "state-timeline",
"title": "Fan Intensity History",
"description": "Fan intensity lanes on the 0-10 controller scale. Warm colors (red/orange) mean a fan is off or at low intensity; green through blue mean the fan is running at higher intensity.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 9,
"y": 13
},
"targets": [
{
"expr": "label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"}), \"fan\", \"Outlet\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"}), \"fan\", \"Inlet - Inside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"}), \"fan\", \"Inlet - Outside\", \"__name__\", \".*\") or label_replace(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"}), \"fan\", \"Tent Interior\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{fan}}",
"format": "time_series",
"instant": false,
"range": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"unit": "none",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#8f1d1d",
"value": null
},
{
"color": "#c92a2a",
"value": 1
},
{
"color": "#d95718",
"value": 2
},
{
"color": "#e06c00",
"value": 3
},
{
"color": "#d69605",
"value": 4
},
{
"color": "#d4b106",
"value": 5
},
{
"color": "#76a935",
"value": 6
},
{
"color": "#2f9e44",
"value": 7
},
{
"color": "#2f8599",
"value": 8
},
{
"color": "#2870b8",
"value": 9
},
{
"color": "#1f60c4",
"value": 10
}
]
},
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 10,
"mappings": [
{
"type": "value",
"options": {
"0": {
"text": "Off",
"color": "#8f1d1d"
},
"1": {
"text": "1",
"color": "#c92a2a"
},
"2": {
"text": "2",
"color": "#d95718"
},
"3": {
"text": "3",
"color": "#e06c00"
},
"4": {
"text": "4",
"color": "#d69605"
},
"5": {
"text": "5",
"color": "#d4b106"
},
"6": {
"text": "6",
"color": "#76a935"
},
"7": {
"text": "7",
"color": "#2f9e44"
},
"8": {
"text": "8",
"color": "#2f8599"
},
"9": {
"text": "9",
"color": "#2870b8"
},
"10": {
"text": "10",
"color": "#1f60c4"
}
}
}
]
},
"overrides": []
},
"options": {
"mergeValues": false,
"showValue": "auto",
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"links": [
{
"title": "Open atlas-power dashboard",
"url": "/d/atlas-power",
"targetBlank": true
}
]
},
{
"id": 140,
"type": "stat",
"title": "Flux Source",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 7
},
"targets": [
{
"expr": "max by (branch, revision) (ananke_gitops_flux_source_info{job=\"ananke-power\",namespace=\"flux-system\",name=\"flux-system\"}) or on() vector(0)",
"refId": "A",
"legendFormat": "{{branch}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-blue",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name",
"text": {
"titleSize": 10,
"valueSize": 14
}
},
"links": [
{
"title": "Open atlas-gitops dashboard",
"url": "/d/atlas-gitops",
"targetBlank": true
}
],
"description": "Flux GitRepository branch reported by Ananke. Revision and object detail live in Atlas GitOps."
},
{
"id": 151,
"type": "stat",
"title": "Current Gate Health",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 9
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 70
},
{
"color": "dark-yellow",
"value": 85
},
{
"color": "dark-green",
"value": 95
},
{
"color": "dark-blue",
"value": 100
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value",
"text": {
"titleSize": 10,
"valueSize": 19
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Current gate-check health across suites; skipped or not-applicable checks count as healthy, failures lower it."
},
{
"id": 152,
"type": "stat",
"title": "CI Run Success (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 11
},
"targets": [
{
"expr": "100 * ((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\",status=~\"ok|passed|success\"}) or on() vector(0))) / clamp_min(((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\"}) or on() vector(0))), 1)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 70
},
{
"color": "dark-yellow",
"value": 85
},
{
"color": "dark-green",
"value": 95
},
{
"color": "dark-blue",
"value": 100
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value",
"text": {
"titleSize": 10,
"valueSize": 19
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Percent of published quality-gate CI runs that completed successfully in 24h; this is automation health, not raw test pass rate."
},
{
"id": 153,
"type": "stat",
"title": "Failed Runs (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 13
},
"targets": [
{
"expr": "(sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\",status!~\"ok|passed|success\"}) or on() vector(0))",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-blue",
"value": null
},
{
"color": "dark-yellow",
"value": 1
},
{
"color": "dark-orange",
"value": 3
},
{
"color": "dark-red",
"value": 5
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value",
"text": {
"titleSize": 10,
"valueSize": 19
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look."
},
{
"id": 154,
"type": "stat",
"title": "Suites With Runs (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 15
},
"targets": [
{
"expr": "sum((sum by (suite) (platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\"}) > bool 0)) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-yellow",
"value": 9
},
{
"color": "dark-green",
"value": 10
},
{
"color": "dark-blue",
"value": 11
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value",
"text": {
"titleSize": 10,
"valueSize": 19
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Configured suites with at least one published quality-gate run in 24h; full count means the dashboard is fresh."
},
{
"id": 155,
"type": "stat",
"title": "Avg Coverage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 3,
"x": 21,
"y": 17
},
"targets": [
{
"expr": "(avg((max by (suite) (platform_quality:suite_coverage_percent:latest_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\"}))) or on() vector(0))",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 70
},
{
"color": "dark-yellow",
"value": 85
},
{
"color": "dark-green",
"value": 95
},
{
"color": "dark-blue",
"value": 100
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value",
"text": {
"titleSize": 10,
"valueSize": 19
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Average latest line coverage across suites; higher means code is better protected by tests."
},
{
"id": 150,
"type": "state-timeline",
"title": "GitOps Health",
"description": "GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 15,
"y": 7
},
"targets": [
{
"expr": "label_replace(100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})), 1), \"signal\", \"Kustomizations Ready\", \"__name__\", \".*\") or label_replace(100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})), 1), \"signal\", \"HelmReleases Ready\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_kustomization_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"Kustomizations Not Suspended\", \"__name__\", \".*\") or label_replace(100 * (1 - (sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{job=\"ananke-power\"})) or on() vector(0)) / clamp_min((count(max by (namespace, name) (ananke_gitops_helmrelease_ready{job=\"ananke-power\"})) or on() vector(0)), 1)), \"signal\", \"HelmReleases Not Suspended\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{signal}}",
"format": "time_series",
"instant": false,
"range": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 70
},
{
"color": "dark-yellow",
"value": 85
},
{
"color": "dark-green",
"value": 95
},
{
"color": "dark-blue",
"value": 100
}
]
},
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"mergeValues": true,
"showValue": "never",
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"links": [
{
"title": "Open atlas-gitops dashboard",
"url": "/d/atlas-gitops",
"targetBlank": true
}
]
},
{
"id": 44,
"type": "bargauge",
"title": "One-off Job Pods (age hours)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 8,
"x": 0,
"y": 32
},
"targets": [
{
"expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"} == 1))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 6
},
{
"color": "dark-orange",
"value": 24
},
{
"color": "dark-red",
"value": 48
}
]
},
"decimals": 2
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
],
"description": "Temporary job pods by age; low or empty is good, old pods usually need cleanup."
},
{
"id": 45,
"type": "timeseries",
"title": "Ariadne Run Volume",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 9,
"y": 7
},
"targets": [
{
"expr": "sum(increase(ariadne_task_runs_total[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[5m])) or on() vector(0)",
"refId": "B",
"legendFormat": "Failures"
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"custom": {
"drawStyle": "bars",
"barAlignment": 0,
"barWidthFactor": 0.72,
"lineWidth": 0,
"fillOpacity": 70,
"gradientMode": "none",
"showPoints": "never",
"spanNulls": true
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Attempts"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-blue"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Failures"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-red"
}
}
]
}
]
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Ariadne automation attempts and failures; attempts show activity, failures show work to investigate."
},
{
"id": 46,
"type": "state-timeline",
"title": "Test Category Health",
"description": "Health by major test category across all suites over the last 24 hours. Skipped tests are healthy; failures and errors lower the lane.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 6,
"x": 15,
"y": 13
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",
"instant": false,
"range": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 70
},
{
"color": "dark-yellow",
"value": 85
},
{
"color": "dark-green",
"value": 95
},
{
"color": "dark-blue",
"value": 100
}
]
},
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": true
},
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"mergeValues": false,
"showValue": "auto",
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
},
"rowHeight": 0.9
},
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"timeFrom": "24h"
},
{
"id": 142,
"type": "stat",
"title": "Jenkins Last Success (h, newest first)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 8,
"y": 32
},
"targets": [
{
"refId": "A",
"expr": "sort((label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_success_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), \"run_state\", \"ok\", \"exported_job\", \".*\")) or (label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_success_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), \"run_state\", \"bad\", \"exported_job\", \".*\")))",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"decimals": 1,
"min": 0,
"displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}",
"links": [
{
"title": "Open Jenkins job",
"url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/",
"targetBlank": true
}
]
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": ".*run_state=\"ok\".*"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-green"
}
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".*run_state=\"bad\".*"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-red"
}
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "left",
"orientation": "horizontal",
"wideLayout": true,
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value",
"text": {
"titleSize": 11,
"valueSize": 11
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "asc"
}
}
],
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Top 6 most recent Jenkins successes by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list."
},
{
"id": 243,
"type": "stat",
"title": "Jenkins Last Failure (h, newest first)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 12,
"y": 32
},
"targets": [
{
"refId": "A",
"expr": "sort((label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), \"run_state\", \"ok\", \"exported_job\", \".*\")) or (label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), \"run_state\", \"bad\", \"exported_job\", \".*\")))",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"decimals": 1,
"min": 0,
"displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}",
"links": [
{
"title": "Open Jenkins job",
"url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/",
"targetBlank": true
}
]
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": ".*run_state=\"ok\".*"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-green"
}
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".*run_state=\"bad\".*"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "dark-red"
}
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "left",
"orientation": "horizontal",
"wideLayout": true,
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value",
"text": {
"titleSize": 11,
"valueSize": 11
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "asc"
}
}
],
"links": [
{
"title": "Open atlas-testing dashboard",
"url": "/d/atlas-testing",
"targetBlank": true
}
],
"description": "Top 6 most recent Jenkins failures by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list."
},
{
"id": 47,
"type": "bargauge",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 8,
"x": 16,
"y": 32
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() ((label_replace(label_replace(vector(999), \"namespace\", \"maintenance\", \"__name__\", \".*\"), \"pvc\", \"backup-telemetry-missing\", \"__name__\", \".*\")) unless on() ((count({__name__=~\"pvc_backup_(count|last_success_timestamp_seconds|health_reason)\",driver=\"restic\"})) > 0)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 20
},
{
"color": "dark-orange",
"value": 40
},
{
"color": "dark-red",
"value": 50
}
]
}
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
}
],
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Backup age in hours computed from last-success timestamps for restic-managed PVCs (nightly target: <=20h green, <40h yellow, <50h orange, >=50h red). PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility."
},
{
"id": 30,
"type": "stat",
"title": "Mail Sent (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 0,
"y": 19
},
"targets": [
{
"expr": "max(postmark_outbound_sent{window=\"1d\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
],
"description": "Outbound mail sent in the last day; useful context for mail health and bounce rates."
},
{
"id": 31,
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 8,
"y": 19
},
"targets": [
{
"expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})",
"refId": "A",
"legendFormat": "Rate"
},
{
"expr": "max(postmark_outbound_bounced{window=\"1d\"})",
"refId": "B",
"legendFormat": "Count"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"displayMode": "auto"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 5
},
{
"color": "dark-orange",
"value": 8
},
{
"color": "dark-red",
"value": 10
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Rate"
},
"properties": [
{
"id": "unit",
"value": "percent"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Count"
},
"properties": [
{
"id": "unit",
"value": "none"
}
]
}
]
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
],
"description": "Outbound mail bounce rate and count; zero is best, high values risk delivery reputation."
},
{
"id": 32,
"type": "stat",
"title": "Mail Success Rate (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 4,
"y": 19
},
"targets": [
{
"expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-red",
"value": null
},
{
"color": "dark-orange",
"value": 90
},
{
"color": "dark-yellow",
"value": 95
},
{
"color": "dark-green",
"value": 98
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
],
"description": "Outbound mail success rate; higher is better for user notifications."
},
{
"id": 33,
"type": "stat",
"title": "Mail Limit Used (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 12,
"y": 19
},
"targets": [
{
"expr": "max(postmark_sending_limit_used_percent)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 70
},
{
"color": "dark-orange",
"value": 85
},
{
"color": "dark-red",
"value": 95
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"links": [
{
"title": "Open atlas-mail dashboard",
"url": "/d/atlas-mail",
"targetBlank": true
}
],
"description": "Postmark monthly send limit used; lower leaves more quota headroom."
},
{
"id": 34,
"type": "stat",
"title": "Postgres Connections Used",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 16,
"y": 19
},
"targets": [
{
"expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")",
"refId": "A",
"legendFormat": "{{conn}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"description": "Current Postgres connections; lower leaves room for apps during spikes."
},
{
"id": 35,
"type": "stat",
"title": "Postgres Hottest Connections",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 2,
"w": 4,
"x": 20,
"y": 19
},
"targets": [
{
"expr": "topk(1, sum by (datname) (pg_stat_activity_count))",
"refId": "A",
"legendFormat": "{{datname}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "dark-green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "name_and_value"
},
"description": "Database with the most active connections; high values identify the pressure source."
},
{
"id": 11,
"type": "piechart",
"title": "Namespace CPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 0,
"y": 23
},
"targets": [
{
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 12,
"type": "piechart",
"title": "Namespace GPU Utilization",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 8,
"y": 23
},
"targets": [
{
"expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero."
},
{
"id": 13,
"type": "piechart",
"title": "Namespace RAM Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 16,
"y": 23
},
"targets": [
{
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22",
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 14,
"type": "timeseries",
"title": "Worker Node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 12,
"w": 12,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
],
"description": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling."
},
{
"id": 15,
"type": "timeseries",
"title": "Worker Node RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 12,
"w": 12,
"x": 12,
"y": 44
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-nodes dashboard",
"url": "/d/atlas-nodes",
"targetBlank": true
}
],
"description": "Worker memory over time; lower is safer, sustained high use risks evictions."
},
{
"id": 16,
"type": "timeseries",
"title": "Control plane CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 56
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"description": "CPU over time for the control-plane nodes plus titan-db and titan-jh; low steady usage means Kubernetes has control headroom."
},
{
"id": 17,
"type": "timeseries",
"title": "Control plane RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 56
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"description": "Memory over time for the control-plane nodes plus titan-db and titan-jh; low steady usage means Kubernetes has control headroom."
},
{
"id": 28,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 66
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"description": "Share of pods per node; uneven share can reveal overloaded workers."
},
{
"id": 29,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 66
},
"targets": [
{
"expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "dark-green",
"value": null
},
{
"color": "dark-yellow",
"value": 50
},
{
"color": "dark-orange",
"value": 75
},
{
"color": "dark-red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "basic",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
],
"description": "Nodes with the most pods; lower and balanced is easier to operate."
},
{
"id": 18,
"type": "timeseries",
"title": "Cluster Ingress Throughput",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 37
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Ingress (Traefik)"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-network dashboard",
"url": "/d/atlas-network",
"targetBlank": true
}
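],
"description": "Receive throughput summed over node network interfaces (loopback, container and tunnel devices excluded), used as a proxy for traffic entering the cluster; spikes should line up with expected usage."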
},
{
"id": 19,
"type": "timeseries",
"title": "Cluster Egress Throughput",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 37
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Egress (Traefik)"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-network dashboard",
"url": "/d/atlas-network",
"targetBlank": true
}
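],
"description": "Transmit throughput summed over node network interfaces (loopback, container and tunnel devices excluded), used as a proxy for traffic leaving the cluster; spikes should line up with expected usage."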
},
{
"id": 20,
"type": "timeseries",
"title": "Intra-Cluster Throughput",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 37
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Internal traffic"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"links": [
{
"title": "Open atlas-network dashboard",
"url": "/d/atlas-network",
"targetBlank": true
}
],
"description": "Traffic inside the cluster; high values can expose chatty services."
},
{
"id": 21,
"type": "timeseries",
"title": "Root Filesystem Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 16,
"w": 12,
"x": 0,
"y": 76
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d",
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Node root disk usage; lower is safer, high values can break kubelet."
},
{
"id": 22,
"type": "timeseries",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 16,
"w": 12,
"x": 12,
"y": 76
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "1w",
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "Astraios disk fullness by node; lower is safer for storage reliability."
}
],
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"overview"
],
"templating": {
"list": [
{
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "1m",
"links": [
{
"title": "Atlas Testing",
"url": "/d/atlas-testing",
"targetBlank": true
}
]
}