{ "uid": "atlas-overview", "title": "Atlas Overview", "folderUid": "overview", "editable": false, "annotations": { "list": [] }, "panels": [ { "id": 2, "type": "gauge", "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 0, "y": 0 }, "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, "max": 3, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 3 } ] } }, "overrides": [] }, "options": { "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "orientation": "auto", "showThresholdMarkers": false, "showThresholdLabels": false } }, { "id": 3, "type": "stat", "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 3, "x": 4, "y": 0 }, "targets": [ { "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 5, "type": "stat", "title": "Stuck Terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 3, "x": 
7, "y": 0 }, "targets": [ { "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 27, "type": "stat", "title": "Atlas Availability", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 10, "y": 0 }, "targets": [ { "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 0.99 }, { "color": "yellow", "value": 0.999 }, { "color": "green", "value": 0.9999 }, { "color": "blue", "value": 0.99999 } ] }, "unit": "percentunit", "custom": { "displayMode": "auto" }, "decimals": 4 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], 
"fields": "", "values": false }, "textMode": "value" } }, { "id": 4, "type": "stat", "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 3, "x": 14, "y": 0 }, "targets": [ { "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 6, "type": "stat", "title": "CrashLoop / ImagePull", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 3, "x": 17, "y": 0 }, "targets": [ { "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "orange", "value": 2 }, { "color": "red", "value": 3 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-pods dashboard", "url": "/d/atlas-pods", "targetBlank": true } ] }, { "id": 1, "type": 
"gauge", "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 20, "y": 0 }, "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, "max": 20, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 18 }, { "color": "yellow", "value": 19 }, { "color": "green", "value": 20 } ] } }, "overrides": [] }, "options": { "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "orientation": "auto", "showThresholdMarkers": false, "showThresholdLabels": false } }, { "id": 7, "type": "stat", "title": "Hottest node: CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 0, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 75 }, { "color": "red", "value": 91.5 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": 
"Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 8, "type": "stat", "title": "Hottest node: RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 3, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 75 }, { "color": "red", "value": 91.5 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 9, "type": "stat", "title": "Hottest node: NET (rx+tx)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 6, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], 
"thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 10, "type": "stat", "title": "Hottest node: I/O (r+w)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 9, "y": 5 }, "targets": [ { "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "Bps", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 23, "type": "stat", "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 12, "y": 5 }, "targets": [ { "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / 
sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 75 }, { "color": "red", "value": 91.5 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 24, "type": "stat", "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 15, "y": 5 }, "targets": [ { "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 75 }, { "color": "red", "value": 91.5 } ] }, "unit": "percent", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 25, "type": "stat", "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 18, "y": 5 }, "targets": [ { "expr": 
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "decbytes", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 26, "type": "stat", "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 3, "x": 21, "y": 5 }, "targets": [ { "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "decbytes", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 40, "type": "stat", "title": "Pyrphoros UPS Current", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 0, "y": 7 }, "targets": [ { "expr": "label_replace(max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100) or on() vector(0), \"metric\", \"Draw\", \"__name__\", 
\".*\") or label_replace(max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Pyrphoros\"}) or on() vector(0), \"metric\", \"Runtime\", \"__name__\", \".*\")", "refId": "A", "legendFormat": "{{metric}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [ { "matcher": { "id": "byName", "options": "Draw" }, "properties": [ { "id": "unit", "value": "watt" } ] }, { "matcher": { "id": "byName", "options": "Runtime" }, "properties": [ { "id": "unit", "value": "s" } ] } ] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "text": { "titleSize": 14, "valueSize": 30 } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 144, "type": "stat", "title": "Statera UPS Current", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 0, "y": 10 }, "targets": [ { "expr": "label_replace(max((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100) or on() vector(0), \"metric\", \"Draw\", \"__name__\", \".*\") or label_replace(max(ananke_ups_runtime_seconds{job=\"ananke-power\",source=\"Statera\"}) or on() vector(0), \"metric\", \"Runtime\", \"__name__\", \".*\")", "refId": "A", "legendFormat": "{{metric}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" 
} }, "overrides": [ { "matcher": { "id": "byName", "options": "Draw" }, "properties": [ { "id": "unit", "value": "watt" } ] }, { "matcher": { "id": "byName", "options": "Runtime" }, "properties": [ { "id": "unit", "value": "s" } ] } ] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "text": { "titleSize": 14, "valueSize": 30 } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 41, "type": "timeseries", "title": "UPS History (Power Draw)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 6, "y": 7 }, "targets": [ { "refId": "A", "expr": "((ananke_ups_load_percent{job=\"ananke-power\",source=\"Pyrphoros\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Pyrphoros\"}) / 100)", "legendFormat": "Pyrphoros" }, { "refId": "B", "expr": "((ananke_ups_load_percent{job=\"ananke-power\",source=\"Statera\"} * ananke_ups_power_nominal_watts{job=\"ananke-power\",source=\"Statera\"}) / 100)", "legendFormat": "Statera" } ], "fieldConfig": { "defaults": { "unit": "watt" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 42, "type": "stat", "title": "Current Enclosure Temperature", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 0, "y": 13 }, "targets": [ { "expr": "label_replace(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)) or on() vector(0), \"metric\", \"\u00b0C\", \"__name__\", \".*\") or label_replace(max((max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) 
(typhon_temperature_celsius != 0)) * 9 / 5 + 32) or on() vector(0), \"metric\", \"\u00b0F\", \"__name__\", \".*\")", "refId": "A", "legendFormat": "{{metric}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [ { "matcher": { "id": "byName", "options": "\u00b0C" }, "properties": [ { "id": "unit", "value": "celsius" } ] }, { "matcher": { "id": "byName", "options": "\u00b0F" }, "properties": [ { "id": "unit", "value": "fahrenheit" } ] } ] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "text": { "titleSize": 14, "valueSize": 30 } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 143, "type": "stat", "title": "Current Enclosure Climate", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 3, "w": 6, "x": 0, "y": 16 }, "targets": [ { "expr": "label_replace(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)) or on() vector(0), \"metric\", \"%RH\", \"__name__\", \".*\") or label_replace(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)) or on() vector(0), \"metric\", \"kPa\", \"__name__\", \".*\")", "refId": "A", "legendFormat": "{{metric}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" 
} }, "overrides": [ { "matcher": { "id": "byName", "options": "%RH" }, "properties": [ { "id": "unit", "value": "suffix:%RH" } ] }, { "matcher": { "id": "byName", "options": "kPa" }, "properties": [ { "id": "unit", "value": "suffix:kPa" } ] } ] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "text": { "titleSize": 14, "valueSize": 30 } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 43, "type": "timeseries", "title": "Enclosure Climate History", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 6, "y": 13 }, "targets": [ { "refId": "A", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)", "legendFormat": "C" }, { "refId": "B", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)", "legendFormat": "RH" }, { "refId": "C", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)", "legendFormat": "P" }, { "refId": "D", "expr": "(min_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)[$__range]) - 0.08)", "legendFormat": "C bound min" }, { "refId": "E", "expr": "(max_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_temperature_celsius != 0)[$__range]) + 0.08)", "legendFormat": "C bound max" }, { "refId": "F", "expr": "clamp_min((min_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)[$__range]) - 0.35), 0)", "legendFormat": "RH bound min" }, 
{ "refId": "G", "expr": "clamp_max((max_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_relative_humidity_percent != 0)[$__range]) + 0.35), 100)", "legendFormat": "RH bound max" }, { "refId": "H", "expr": "clamp_min((min_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)[$__range]) - 0.03), 0)", "legendFormat": "P bound min" }, { "refId": "I", "expr": "(max_over_time(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_vpd_kpa != 0)[$__range]) + 0.03)", "legendFormat": "P bound max" } ], "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "lineInterpolation": "linear", "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": true } }, "overrides": [ { "matcher": { "id": "byName", "options": "C" }, "properties": [ { "id": "unit", "value": "suffix:\u00b0C" }, { "id": "decimals", "value": 2 }, { "id": "custom.axisPlacement", "value": "left" }, { "id": "custom.axisCenteredZero", "value": false } ] }, { "matcher": { "id": "byRegexp", "options": "C bound .*" }, "properties": [ { "id": "unit", "value": "suffix:\u00b0C" }, { "id": "custom.axisPlacement", "value": "left" }, { "id": "custom.axisCenteredZero", "value": false }, { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } }, { "id": "custom.lineWidth", "value": 0 }, { "id": "custom.fillOpacity", "value": 0 }, { "id": "custom.showPoints", "value": "never" }, { "id": "color", "value": { "mode": "fixed", "fixedColor": "transparent" } } ] }, { "matcher": { "id": "byName", "options": "RH" }, "properties": [ { "id": "unit", "value": "suffix:%" }, { "id": "decimals", "value": 2 }, { "id": "custom.axisPlacement", "value": "right" }, { "id": "custom.axisCenteredZero", "value": false } ] }, { "matcher": { "id": "byRegexp", "options": "RH bound .*" }, "properties": [ { 
"id": "unit", "value": "suffix:%" }, { "id": "custom.axisPlacement", "value": "right" }, { "id": "custom.axisCenteredZero", "value": false }, { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } }, { "id": "custom.lineWidth", "value": 0 }, { "id": "custom.fillOpacity", "value": 0 }, { "id": "custom.showPoints", "value": "never" }, { "id": "color", "value": { "mode": "fixed", "fixedColor": "transparent" } } ] }, { "matcher": { "id": "byName", "options": "P" }, "properties": [ { "id": "unit", "value": "suffix:kPa" }, { "id": "custom.axisPlacement", "value": "right" }, { "id": "decimals", "value": 2 }, { "id": "custom.axisCenteredZero", "value": false } ] }, { "matcher": { "id": "byRegexp", "options": "P bound .*" }, "properties": [ { "id": "unit", "value": "suffix:kPa" }, { "id": "custom.axisPlacement", "value": "right" }, { "id": "custom.axisCenteredZero", "value": false }, { "id": "custom.hideFrom", "value": { "legend": true, "tooltip": true, "viz": false } }, { "id": "custom.lineWidth", "value": 0 }, { "id": "custom.fillOpacity", "value": 0 }, { "id": "custom.showPoints", "value": "never" }, { "id": "color", "value": { "mode": "fixed", "fixedColor": "transparent" } } ] } ] }, "options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ], "description": "Temperature (C) on the left axis; relative humidity (RH) and vapor pressure deficit (P, VPD in kPa) on the right axis. Hidden min/max bound series pad each axis range so small swings remain visible." 
}, { "id": 140, "type": "stat", "title": "Fan Activity", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 12, "y": 13 }, "targets": [ { "expr": "label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})) or on() vector(0))), \"metric\", \"Outlet\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})) or on() vector(0))), \"metric\", \"Inlet - In\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})) or on() vector(0))), \"metric\", \"Inlet - Out\", \"__name__\", \".*\") or label_replace((round(max(max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})) or on() vector(0))), \"metric\", \"Interior\", \"__name__\", \".*\")", "refId": "A", "legendFormat": "{{metric}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 7 }, { "color": "red", "value": 9 } ] }, "unit": "none", "custom": { "displayMode": "auto" }, "decimals": 0 }, "overrides": [ { "matcher": { "id": "byName", "options": "Outlet" }, "properties": [ { "id": "decimals", "value": 0 } ] }, { "matcher": { "id": "byName", "options": "Inlet - In" }, "properties": [ { "id": "decimals", "value": 0 } ] }, { "matcher": { "id": "byName", "options": "Inlet - Out" }, "properties": [ { "id": "decimals", "value": 0 } ] }, { "matcher": { "id": "byName", "options": "Interior" }, "properties": [ { "id": "decimals", "value": 0 } ] } ] }, "options": { "colorMode": "value", "graphMode": "area", 
"justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "orientation": "vertical", "wideLayout": false, "text": { "valueSize": 26 } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 141, "type": "timeseries", "title": "Fan History (0-10)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 18, "y": 13 }, "targets": [ { "refId": "A", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"1\"})", "legendFormat": "Outlet" }, { "refId": "B", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"2\"})", "legendFormat": "Inlet - Inside" }, { "refId": "C", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"3\"})", "legendFormat": "Inlet - Outside" }, { "refId": "D", "expr": "max without (job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group) (typhon_fan_speed_level{port=\"4\"})", "legendFormat": "Interior" } ], "fieldConfig": { "defaults": { "unit": "none", "max": 10 }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-power dashboard", "url": "/d/atlas-power", "targetBlank": true } ] }, { "id": 44, "type": "bargauge", "title": "One-off Job Pods (age hours)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 8, "x": 0, "y": 32 }, "targets": [ { "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, 
\"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 6 }, { "color": "orange", "value": 24 }, { "color": "red", "value": 48 } ] }, "decimals": 2 }, "overrides": [] }, "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "links": [ { "title": "Open atlas-jobs dashboard", "url": "/d/atlas-jobs", "targetBlank": true } ], "transformations": [ { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }, { "id": "limit", "options": { "limit": 12 } } ] }, { "id": 45, "type": "timeseries", "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 12, "y": 7 }, "targets": [ { "expr": "sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)", "refId": "A", "legendFormat": "Attempts" }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval])) or on() vector(0)", "refId": "B", "legendFormat": "Failures" } ], "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } } ] }, { "matcher": { "id": "byName", "options": "Failures" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } ] } ] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-jobs dashboard", "url": "/d/atlas-jobs", "targetBlank": true } ] }, { "id": 46, 
"type": "timeseries", "title": "Platform Test Success Rate", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 18, "y": 7 }, "targets": [ { "refId": "A", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "ariadne" }, { "refId": "B", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"metis\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "metis" }, { "refId": "C", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"ananke\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "ananke" }, { "refId": "D", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"atlasbot\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "atlasbot" }, { "refId": "E", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"lesavka\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "lesavka" }, { 
"refId": "F", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"pegasus|pegasus-health|pegasus_health\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"pegasus|pegasus-health|pegasus_health\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"pegasus|pegasus-health|pegasus_health\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "pegasus" }, { "refId": "G", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"soteria\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "soteria" }, { "refId": "H", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"titan-iac|titan_iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"titan-iac|titan_iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"titan-iac|titan_iac\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "titan-iac" }, { "refId": "I", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"bstein-home|bstein_home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"bstein-home|bstein_home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"bstein-home|bstein_home\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "bstein-home" }, { "refId": "J", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"arcanagon\"}[1h]))) > 0) or on() vector(0)", "legendFormat": 
"arcanagon" }, { "refId": "K", "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=~\"data-prepper|data_prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=~\"data-prepper|data_prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=~\"data-prepper|data_prepper\"}[1h]))) > 0) or on() vector(0)", "legendFormat": "data-prepper" } ], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "lineInterpolation": "linear", "lineWidth": 2, "fillOpacity": 10, "showPoints": "always", "pointSize": 4, "spanNulls": true } }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "lastNotNull" ] }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-testing dashboard", "url": "/d/atlas-testing", "targetBlank": true } ], "timeFrom": "7d", "description": "Per-run interval pass points (0-100) for each software suite over the last 7 days. Points are connected to show trend; missing-run intervals are ignored." 
}, { "id": 142, "type": "stat", "title": "Jenkins Last Success (h, newest first)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 8, "y": 32 }, "targets": [ { "refId": "A", "expr": "sort((label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_success_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), \"run_state\", \"ok\", \"exported_job\", \".*\")) or (label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_success_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), \"run_state\", \"bad\", \"exported_job\", \".*\")))", "instant": true } ], "fieldConfig": { "defaults": { "unit": "h", "decimals": 1, "min": 0, "displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}", "links": [ { "title": "Open Jenkins job", "url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/", "targetBlank": true } ] }, "overrides": [ { "matcher": { "id": "byRegexp", "options": ".*run_state=\"ok\".*" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } } ] }, { "matcher": { "id": "byRegexp", "options": ".*run_state=\"bad\".*" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } ] } ] }, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "left", "orientation": "horizontal", "wideLayout": true, "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "text": { "titleSize": 11, "valueSize": 11 } }, "transformations": [ { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "asc" } } ], "links": [ { 
"title": "Open atlas-jobs dashboard", "url": "/d/atlas-jobs", "targetBlank": true } ], "description": "Top 6 most recent Jenkins successes by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list." }, { "id": 243, "type": "stat", "title": "Jenkins Last Failure (h, newest first)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 12, "y": 32 }, "targets": [ { "refId": "A", "expr": "sort((label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), \"run_state\", \"ok\", \"exported_job\", \".*\")) or (label_replace((sort(bottomk(6, min by (exported_job,job_url,weather_icon) ((time() - ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds) / 3600)))) and on(exported_job,job_url,weather_icon) (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), \"run_state\", \"bad\", \"exported_job\", \".*\")))", "instant": true } ], "fieldConfig": { "defaults": { "unit": "h", "decimals": 1, "min": 0, "displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}", "links": [ { "title": "Open Jenkins job", "url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/", "targetBlank": true } ] }, "overrides": [ { "matcher": { "id": "byRegexp", "options": ".*run_state=\"ok\".*" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } } ] }, { "matcher": { "id": "byRegexp", "options": ".*run_state=\"bad\".*" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } ] } ] }, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "left", "orientation": "horizontal", "wideLayout": true, 
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value", "text": { "titleSize": 11, "valueSize": 11 } }, "transformations": [ { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "asc" } } ], "links": [ { "title": "Open atlas-jobs dashboard", "url": "/d/atlas-jobs", "targetBlank": true } ], "description": "Top 6 most recent Jenkins failures by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list." }, { "id": 47, "type": "bargauge", "title": "PVC Backup Health / Age", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 8, "x": 16, "y": 32 }, "targets": [ { "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))) or on() vector(0))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 20 }, { "color": "orange", "value": 40 }, { "color": "red", "value": 50 } ] } }, "overrides": [] }, "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "transformations": [ { "id": "sortBy", "options": { "fields": [ "Value" ], "order": "desc" } } ], "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ], "description": "Backup age in hours computed from last-success timestamps for restic-managed PVCs (nightly target: <=20h green, <40h yellow, <50h orange, >=50h red). 
PVCs that have backup history but currently no successful backup (health reason missing/no_completed/lookup_failed/unknown_timestamp) are pinned to 999h for visibility." }, { "id": 30, "type": "stat", "title": "Mail Sent (1d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 4, "x": 0, "y": 19 }, "targets": [ { "expr": "max(postmark_outbound_sent{window=\"1d\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" } }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-mail dashboard", "url": "/d/atlas-mail", "targetBlank": true } ] }, { "id": 31, "type": "stat", "title": "Mail Bounces (1d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 4, "x": 8, "y": 19 }, "targets": [ { "expr": "max(postmark_outbound_bounce_rate{window=\"1d\"})", "refId": "A", "legendFormat": "Rate" }, { "expr": "max(postmark_outbound_bounced{window=\"1d\"})", "refId": "B", "legendFormat": "Count" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "displayMode": "auto" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 5 }, { "color": "orange", "value": 8 }, { "color": "red", "value": 10 } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "Rate" }, "properties": [ { "id": "unit", "value": "percent" } ] }, { "matcher": { "id": "byName", "options": "Count" }, "properties": [ { "id": "unit", "value": "none" } ] } ] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { 
"calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" }, "links": [ { "title": "Open atlas-mail dashboard", "url": "/d/atlas-mail", "targetBlank": true } ] }, { "id": 32, "type": "stat", "title": "Mail Success Rate (1d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 4, "x": 4, "y": 19 }, "targets": [ { "expr": "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 90 }, { "color": "yellow", "value": 95 }, { "color": "green", "value": 98 } ] }, "unit": "percent", "custom": { "displayMode": "auto" }, "decimals": 1 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-mail dashboard", "url": "/d/atlas-mail", "targetBlank": true } ] }, { "id": 33, "type": "stat", "title": "Mail Limit Used (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 4, "x": 12, "y": 19 }, "targets": [ { "expr": "max(postmark_sending_limit_used_percent)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "orange", "value": 85 }, { "color": "red", "value": 95 } ] }, "unit": "percent", "custom": { "displayMode": "auto" }, "decimals": 1 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" }, "links": [ { "title": "Open atlas-mail dashboard", "url": 
"/d/atlas-mail", "targetBlank": true } ] }, { "id": 34, "type": "stat", "title": "Postgres Connections Used", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 4, "x": 16, "y": 19 }, "targets": [ { "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", "refId": "A", "legendFormat": "{{conn}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" }, "decimals": 0 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" } }, { "id": 35, "type": "stat", "title": "Postgres Hottest Connections", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 2, "w": 4, "x": 20, "y": 19 }, "targets": [ { "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", "refId": "A", "legendFormat": "{{datname}}", "instant": true } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none", "custom": { "displayMode": "auto" }, "decimals": 0 }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name_and_value" } }, { "id": 11, "type": "piechart", "title": "Namespace CPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 8, "x": 0, "y": 23 }, "targets": [ { 
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "links": [ { "title": "Workload namespaces only", "url": "?var-namespace_scope_cpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { "title": "All namespaces", "url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { "title": "Infrastructure namespaces only", "url": "?var-namespace_scope_cpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." 
}, { "id": 12, "type": "piechart", "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 8, "x": 8, "y": 23 }, "targets": [ { "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "links": [ { "title": "Workload namespaces only", "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { "title": "All namespaces", "url": 
"?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false }, { "title": "Infrastructure namespaces only", "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}", "targetBlank": false } ], "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 13, "type": "piechart", "title": "Namespace RAM Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 8, "x": 16, "y": 23 }, "targets": [ { "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "links": [ { "title": "Workload namespaces only", "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22", "targetBlank": false }, { "title": "All namespaces", "url": 
"?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22", "targetBlank": false }, { "title": "Infrastructure namespaces only", "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22", "targetBlank": false } ], "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." }, { "id": 14, "type": "timeseries", "title": "Worker Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 12, "w": 12, "x": 0, "y": 44 }, "targets": [ { "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 15, "type": "timeseries", "title": "Worker Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 12, "w": 12, "x": 12, "y": 44 }, "targets": [ { "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) 
* on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-nodes dashboard", "url": "/d/atlas-nodes", "targetBlank": true } ] }, { "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 56 }, "targets": [ { "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } }, { "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 56 }, "targets": [ { "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", 
\"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right" }, "tooltip": { "mode": "multi" } } }, { "id": 28, "type": "piechart", "title": "Node Pod Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 66 }, "targets": [ { "expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "color": { "mode": "palette-classic" } }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "displayLabels": [], "tooltip": { "mode": "single" }, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 29, "type": "bargauge", "title": "Top Nodes by Pod Count", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 66 }, "targets": [ { "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 75 }, { "color": "red", "value": 100 } ] }, "decimals": 0 }, "overrides": [] }, "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "transformations": [ { "id": 
"sortBy", "options": { "fields": [ "Value" ], "order": "desc" } }, { "id": "limit", "options": { "limit": 12 } } ] }, { "id": 18, "type": "timeseries", "title": "Cluster Ingress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 37 }, "targets": [ { "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress (Traefik)" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-network dashboard", "url": "/d/atlas-network", "targetBlank": true } ] }, { "id": 19, "type": "timeseries", "title": "Cluster Egress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 37 }, "targets": [ { "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress (Traefik)" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-network dashboard", "url": "/d/atlas-network", "targetBlank": true } ] }, { "id": 20, "type": "timeseries", "title": "Intra-Cluster Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 37 }, "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } ], "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "options": 
{ "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, "links": [ { "title": "Open atlas-network dashboard", "url": "/d/atlas-network", "targetBlank": true } ] }, { "id": 21, "type": "timeseries", "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 16, "w": 12, "x": 0, "y": 76 }, "targets": [ { "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "timeFrom": "30d", "links": [ { "title": "Open atlas-storage dashboard", "url": "/d/atlas-storage", "targetBlank": true } ] }, { "id": 22, "type": "timeseries", "title": "Nodes Closest to Full Astraios Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 16, "w": 12, "x": 12, "y": 76 }, "targets": [ { "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "timeFrom": "1w", "links": [ { "title": "Open atlas-storage dashboard", "url": 
"/d/atlas-storage", "targetBlank": true } ] } ], "schemaVersion": 39, "style": "dark", "tags": [ "atlas", "overview" ], "templating": { "list": [ { "name": "namespace_scope_cpu", "label": "CPU namespace filter", "type": "custom", "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { "text": "all namespaces", "value": "namespace=~\".*\"", "selected": false }, { "text": "infrastructure namespaces only", "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], "hide": 2, "multi": false, "includeAll": false, "refresh": 1, "sort": 0, "skipUrlSync": false }, { "name": "namespace_scope_gpu", "label": "GPU namespace filter", "type": "custom", "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { 
"text": "all namespaces", "value": "namespace=~\".*\"", "selected": false }, { "text": "infrastructure namespaces only", "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], "hide": 2, "multi": false, "includeAll": false, "refresh": 1, "sort": 0, "skipUrlSync": false }, { "name": "namespace_scope_ram", "label": "RAM namespace filter", "type": "custom", "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "current": { "text": "workload namespaces only", "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, "options": [ { "text": "workload namespaces only", "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": true }, { "text": "all namespaces", "value": "namespace=~\".*\"", "selected": false }, { "text": "infrastructure namespaces only", "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"", "selected": false } ], "hide": 2, "multi": false, "includeAll": false, "refresh": 1, "sort": 0, "skipUrlSync": false } ] }, "time": { "from": "now-1h", "to": "now" }, "refresh": "1m", "links": [] }